Changeset 9919 for trunk/gsdl
- Timestamp:
- 2005-05-20T12:24:06+12:00 (19 years ago)
- Location:
- trunk/gsdl/perllib
- Files:
- 1 added
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/lucenebuildproc.pm
r9669 r9919 46 46 } 47 47 48 sub set_gdbm_level {49 my $self = shift(@_);50 my ($level) = @_;51 52 $self->{'gdbm_level'} = $level;53 }54 48 sub preprocess_text { 55 49 my $self = shift (@_); -
trunk/gsdl/perllib/mgbuildproc.pm
r9669 r9919 30 30 package mgbuildproc; 31 31 32 eval {require bytes}; 33 34 use classify; 35 use doc; 36 use docproc; 37 use util; 32 use basebuildproc; 38 33 39 34 BEGIN { 40 @mgbuildproc::ISA = (' docproc');35 @mgbuildproc::ISA = ('basebuildproc'); 41 36 } 42 37 43 44 38 sub new { 45 my ($class, $collection, $source_dir, $build_dir, 46 $verbosity, $outhandle) = @_; 47 my $self = new docproc (); 48 49 # outhandle is where all the debugging info goes 50 # output_handle is where the output of the plugins is piped 51 # to (i.e. mg, gdbm etc.) 52 $outhandle = STDERR unless defined $outhandle; 53 54 $self->{'collection'} = $collection; 55 $self->{'source_dir'} = $source_dir; 56 $self->{'build_dir'} = $build_dir; 57 $self->{'verbosity'} = $verbosity; 58 $self->{'classifiers'} = []; 59 $self->{'mode'} = "text"; 60 $self->{'assocdir'} = $build_dir; 61 $self->{'dontgdbm'} = {}; 62 $self->{'index'} = "section:text"; 63 $self->{'indexexparr'} = []; 64 $self->{'output_handle'} = "STDOUT"; 65 $self->{'num_docs'} = 0; 66 $self->{'num_sections'} = 0; 67 $self->{'num_bytes'} = 0; 68 $self->{'num_processed_bytes'} = 0; 69 $self->{'store_text'} = 1; 70 $self->{'outhandle'} = $outhandle; 71 72 #used by browse interface 73 $self->{'doclist'} = []; 74 75 $self->{'indexing_text'} = 0; 76 39 my $class = shift @_; 40 my $self = new basebuildproc (@_); 77 41 return bless $self, $class; 78 42 } 79 43 80 sub reset {81 my $self = shift (@_);82 83 $self->{'num_docs'} = 0;84 $self->{'num_sections'} = 0;85 $self->{'num_processed_bytes'} = 0;86 $self->{'num_bytes'} = 0;87 }88 89 sub get_num_docs {90 my $self = shift (@_);91 92 return $self->{'num_docs'};93 }94 95 sub get_num_sections {96 my $self = shift (@_);97 98 return $self->{'num_sections'};99 }100 101 # num_bytes is the actual number of bytes in the collection102 # this is normally the same as what's processed during text compression103 sub get_num_bytes {104 my $self = shift (@_);105 106 return $self->{'num_bytes'};107 }108 109 # 
num_processed_bytes is the number of bytes actually passed110 # to mg for the current index111 sub get_num_processed_bytes {112 my $self = shift (@_);113 114 return $self->{'num_processed_bytes'};115 }116 117 sub set_output_handle {118 my $self = shift (@_);119 my ($handle) = @_;120 121 $self->{'output_handle'} = $handle;122 }123 124 sub set_mode {125 my $self = shift (@_);126 my ($mode) = @_;127 128 $self->{'mode'} = $mode;129 }130 131 sub set_assocdir {132 my $self = shift (@_);133 my ($assocdir) = @_;134 135 $self->{'assocdir'} = $assocdir;136 }137 138 sub set_dontgdbm {139 my $self = shift (@_);140 my ($dontgdbm) = @_;141 142 $self->{'dontgdbm'} = $dontgdbm;143 }144 145 sub set_index {146 my $self = shift (@_);147 my ($index, $indexexparr) = @_;148 149 $self->{'index'} = $index;150 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;151 }152 153 sub set_index_languages {154 my $self = shift (@_);155 my ($lang_meta, $langarr) = @_;156 $self->{'lang_meta'} = $lang_meta;157 $self->{'langarr'} = $langarr;158 }159 160 sub get_index {161 my $self = shift (@_);162 163 return $self->{'index'};164 }165 166 sub set_classifiers {167 my $self = shift (@_);168 my ($classifiers) = @_;169 170 $self->{'classifiers'} = $classifiers;171 }172 173 sub set_indexing_text {174 my $self = shift (@_);175 my ($indexing_text) = @_;176 177 $self->{'indexing_text'} = $indexing_text;178 }179 180 sub get_indexing_text {181 my $self = shift (@_);182 183 return $self->{'indexing_text'};184 }185 186 sub set_store_text {187 my $self = shift (@_);188 my ($store_text) = @_;189 190 $self->{'store_text'} = $store_text;191 }192 193 sub get_doc_list {194 my $self = shift(@_);195 196 return @{$self->{'doclist'}};197 }198 199 200 sub process {201 my $self = shift (@_);202 my $method = $self->{'mode'};203 204 $self->$method(@_);205 }206 207 # use 'Paged' if document has no more than 2 levels208 # and each section at second level has a number for209 # Title metadata210 #also use Paged if 
gsdlthistype metadata is set to Paged211 sub get_document_type {212 my $self = shift (@_);213 my ($doc_obj) = @_;214 215 my $thistype = "VList";216 my $childtype = "VList";217 my $title;218 my @tmp = ();219 220 my $section = $doc_obj->get_top_section ();221 222 my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");223 if (defined $gsdlthistype) {224 if ($gsdlthistype eq "Paged") {225 $childtype = "Paged";226 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {227 $thistype = "Paged";228 } else {229 $thistype = "Invisible";230 }231 232 return ($thistype, $childtype);233 } elsif ($gsdlthistype eq "Hierarchy") {234 return ($thistype, $childtype); # use VList, VList235 }236 }237 my $first = 1;238 while (defined $section) {239 @tmp = split /\./, $section;240 if (scalar(@tmp) > 1) {241 return ($thistype, $childtype);242 }243 if (!$first) {244 $title = $doc_obj->get_metadata_element ($section, "Title");245 if (!defined $title || $title !~ /^\d+$/) {246 return ($thistype, $childtype);247 }248 }249 $first = 0;250 $section = $doc_obj->get_next_section($section);251 }252 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {253 $thistype = "Paged";254 } else {255 $thistype = "Invisible";256 }257 $childtype = "Paged";258 return ($thistype, $childtype);259 }260 261 sub assoc_files {262 my $self = shift (@_);263 my ($doc_obj, $archivedir) = @_;264 my ($afile);265 266 foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {267 # if assoc file starts with a slash, we put it relative to the assoc268 # dir, otherwise it is relative to the HASH... 
directory269 if ($assoc_file->[1] =~ m@^[/\\]@) {270 $afile = &util::filename_cat($self->{'assocdir'},$assoc_file->[1]);271 } else {272 $afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);273 }274 &util::hard_link ($assoc_file->[0], $afile);275 }276 }277 278 sub infodb {279 my $self = shift (@_);280 my ($doc_obj, $filename) = @_;281 my $handle = $self->{'output_handle'};282 # $handle = "main::STDOUT";283 284 my $doctype = $doc_obj->get_doc_type();285 286 # only output this document if it is one to be indexed287 return if ($doctype ne "indexed_doc");288 289 my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;290 $archivedir = "" unless defined $archivedir;291 $archivedir =~ s/\\/\//g;292 $archivedir =~ s/^\/+//;293 $archivedir =~ s/\/+$//;294 295 # resolve the final filenames of the files associated with this document296 $self->assoc_files ($doc_obj, $archivedir);297 298 #GRB: moved 1/06/2004 from GRB01062004299 #add this document to the browse structure300 push(@{$self->{'doclist'}},$doc_obj->get_OID())301 unless ($doctype eq "classification");302 303 # classify this document304 &classify::classify_doc ($self->{'classifiers'}, $doc_obj);305 #GRB: end of moved block306 307 # this is another document308 $self->{'num_docs'} += 1 unless ($doctype eq "classification");309 310 # is this a paged or a hierarchical document311 my ($thistype, $childtype) = $self->get_document_type ($doc_obj);312 313 my $section = $doc_obj->get_top_section ();314 my $doc_OID = $doc_obj->get_OID();315 my $first = 1;316 my $url = "";317 while (defined $section) {318 # update a few statistics319 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);320 $self->{'num_sections'} += 1 unless ($doctype eq "classification");321 322 # output the section name323 if ($section eq "") { print $handle "[$doc_OID]\n"; }324 else { print $handle "[$doc_OID.$section]\n"; }325 326 # output the fact that this document is a document (unless doctype327 # has been set to 
something else from within a plugin328 my $dtype = $doc_obj->get_metadata_element ($section, "doctype");329 if (!defined $dtype || $dtype !~ /\w/) {330 print $handle "<doctype>doc\n";331 }332 333 # output whether this node contains text334 if ($doc_obj->get_text_length($section) > 0) {335 print $handle "<hastxt>1\n";336 } else {337 print $handle "<hastxt>0\n";338 }339 340 # output all the section metadata341 my $metadata = $doc_obj->get_all_metadata ($section);342 foreach my $pair (@$metadata) {343 my ($field, $value) = (@$pair);344 345 if ($field ne "Identifier" && $field !~ /^gsdl/ &&346 defined $value && $value ne "") {347 348 # escape problematic stuff349 $value =~ s/\\/\\\\/g;350 $value =~ s/\n/\\n/g;351 $value =~ s/\r/\\r/g;352 if ($value =~ /-{70,}/) {353 # if value contains 70 or more hyphens in a row we need354 # to escape them to prevent txt2db from treating them355 # as a separator356 $value =~ s/-/&\#045;/gi;357 }358 359 # special case for URL metadata360 if ($field =~ /^URL$/i) {361 $url .= "[$value]\n";362 if ($section eq "") {$url .= "<section>$doc_OID\n";}363 else {$url .= "<section>$doc_OID.$section\n";}364 $url .= '-' x 70 . 
"\n";365 }366 367 if (!defined $self->{'dontgdbm'}->{$field}) {368 print $handle "<$field>$value\n";369 }370 }371 }372 373 # output archivedir if at top level374 if ($section eq $doc_obj->get_top_section()) {375 print $handle "<archivedir>$archivedir\n";376 }377 378 # output document display type379 if ($first) {380 print $handle "<thistype>$thistype\n";381 }382 383 # output a list of children384 my $children = $doc_obj->get_children ($section);385 if (scalar(@$children) > 0) {386 print $handle "<childtype>$childtype\n";387 print $handle "<contains>";388 my $firstchild = 1;389 foreach my $child (@$children) {390 print $handle ";" unless $firstchild;391 $firstchild = 0;392 if ($child =~ /^.*?\.(\d+)$/) {393 print $handle "\".$1";394 } else {395 print $handle "\".$child";396 }397 # if ($child eq "") { print $handle "$doc_OID"; }398 # elsif ($section eq "") { print $handle "$doc_OID.$child"; }399 # else { print $handle "$doc_OID.$section.$child"; }400 }401 print $handle "\n";402 }403 404 # output the matching document number405 print $handle "<docnum>$self->{'num_sections'}\n";406 407 print $handle '-' x 70, "\n";408 409 410 # output a database entry for the document number411 print $handle "[$self->{'num_sections'}]\n";412 if ($section eq "") { print $handle "<section>$doc_OID\n"; }413 else { print $handle "<section>$doc_OID.$section\n"; }414 print $handle '-' x 70, "\n";415 416 # output entry for url417 if ($url ne "") {418 print $handle $url;419 }420 421 $first = 0;422 $section = $doc_obj->get_next_section($section);423 }424 425 #GRB01062004: see code above moved from here426 }427 44 428 45 sub find_paragraphs { … … 442 59 my ($doc_obj) = @_; 443 60 my $handle = $self->{'output_handle'}; 444 my $indexed_doc = 1; 445 61 446 62 # only output this document if it is one to be indexed 447 63 return if ($doc_obj->get_doc_type() ne "indexed_doc"); 448 64 449 65 # see if this document belongs to this subcollection 450 foreach my $indexexp (@{$self->{'indexexparr'}}) { 451 
$indexed_doc = 0; 452 my ($field, $exp, $options) = split /\//, $indexexp; 453 if (defined ($field) && defined ($exp)) { 454 my ($bool) = $field =~ /^(.)/; 455 $field =~ s/^.// if $bool eq '!'; 456 if ($field =~ /^filename$/i) { 457 $field = $doc_obj->get_source_filename(); 458 } else { 459 $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field); 460 } 461 next unless defined $field; 462 if ($bool eq '!') { 463 if ($options =~ /^i$/i) { 464 if ($field !~ /$exp/i) {$indexed_doc = 1; last;} 465 } else { 466 if ($field !~ /$exp/) {$indexed_doc = 1; last;} 467 } 468 } else { 469 if ($options =~ /^i$/i) { 470 if ($field =~ /$exp/i) {$indexed_doc = 1; last;} 471 } else { 472 if ($field =~ /$exp/) {$indexed_doc = 1; last;} 473 } 474 } 475 } 476 } 477 # if this doc is so far in the sub collection, and we have lang info, 478 # now we check the languages to see if it matches 479 if($indexed_doc && defined $self->{'lang_meta'}) { 480 $indexed_doc = 0; 481 my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'}); 482 if (defined $field) { 483 foreach my $lang (@{$self->{'langarr'}}) { 484 my ($bool) = $lang =~ /^(.)/; 485 if ($bool eq '!') { 486 $lang =~ s/^.//; 487 if ($field !~ /$lang/) { 488 $indexed_doc = 1; last; 489 } 490 } else { 491 if ($field =~ /$lang/) { 492 $indexed_doc = 1; last; 493 } 494 } 495 } 496 } 497 } 66 my $indexed_doc = $self->is_subcollection_doc($doc_obj); 498 67 499 68 # this is another document -
trunk/gsdl/perllib/mgppbuilder.pm
r9853 r9919 211 211 } 212 212 } 213 213 214 print $outhandle "doclevel = ". $self->{'doc_level'}."\n"; 214 215 # get the list of plugins for this collection … … 350 351 $handle = mgppbuilder::PIPEOUT; 351 352 } 353 354 # gdbm_level 355 my $gdbm_level = "document"; 356 if ($self->{'levels'}->{'section'}) { 357 $gdbm_level = "section"; 358 } 359 352 360 $self->{'buildproc'}->set_output_handle ($handle); 353 361 $self->{'buildproc'}->set_mode ('text'); … … 361 369 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 362 370 $self->{'buildproc'}->set_levels ($self->{'levels'}); 371 $self->{'buildproc'}->set_gdbm_level ($gdbm_level); 363 372 $self->{'buildproc'}->reset(); 364 373 &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'}, … … 693 702 $handle = mgppbuilder::PIPEOUT; 694 703 } 695 704 705 # gdbm_level 706 my $gdbm_level = "document"; 707 if ($self->{'levels'}->{'section'}) { 708 $gdbm_level = "section"; 709 } 710 696 711 # set up the document processr 697 712 $self->{'buildproc'}->set_output_handle ($handle); … … 702 717 $self->{'buildproc'}->set_store_text(1); 703 718 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 704 $self->{'buildproc'}->set_levels ($self->{'levels'}); 719 $self->{'buildproc'}->set_levels ($self->{'levels'}); 720 $self->{'buildproc'}->set_gdbm_level ($gdbm_level); 721 705 722 $self->{'buildproc'}->reset(); 706 723 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, -
trunk/gsdl/perllib/mgppbuildproc.pm
r9669 r9919 30 30 package mgppbuildproc; 31 31 32 eval {require bytes}; 33 34 use classify; 35 use doc; 36 use docproc; 37 use util; 32 use basebuildproc; 38 33 39 34 40 35 BEGIN { 41 @mgppbuildproc::ISA = (' docproc');36 @mgppbuildproc::ISA = ('basebuildproc'); 42 37 } 43 38 44 39 #this must be the same as in mgppbuilder 45 40 our %level_map = ('document'=>'Doc', 46 47 41 'section'=>'Sec', 42 'paragraph'=>'Para'); 48 43 49 44 sub new { 50 my ($class, $collection, $source_dir, $build_dir, 51 $verbosity, $outhandle) = @_; 52 my $self = new docproc (); 53 54 # outhandle is where all the debugging info goes 55 # output_handle is where the output of the plugins is piped 56 # to (i.e. mg, gdbm etc.) 57 $outhandle = STDERR unless defined $outhandle; 58 59 $self->{'collection'} = $collection; 60 $self->{'source_dir'} = $source_dir; 61 $self->{'build_dir'} = $build_dir; 62 $self->{'verbosity'} = $verbosity; 63 $self->{'classifiers'} = []; 64 $self->{'mode'} = "text"; 65 $self->{'assocdir'} = $build_dir; 66 $self->{'dontgdbm'} = {}; 45 my $class = shift @_; 46 my $self = new basebuildproc (@_); 47 48 # use a different index specification to the default 67 49 $self->{'index'} = "text"; 68 $self->{'indexexparr'} = []; 69 $self->{'output_handle'} = "STDOUT"; 70 $self->{'num_docs'} = 0; 71 $self->{'num_sections'} = 0; 72 $self->{'num_bytes'} = 0; 73 $self->{'num_processed_bytes'} = 0; 74 $self->{'store_text'} = 1; 75 $self->{'outhandle'} = $outhandle; 76 77 #used by browse interface 78 $self->{'doclist'} = []; 79 80 $self->{'indexing_text'} = 0; 81 82 #new ones for mgpp 50 83 51 $self->{'dontindex'} = {}; 84 52 $self->{'indexfieldmap'} = {}; 85 53 $self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index 86 54 $self->{'strip_html'}=1; 87 88 55 89 56 return bless $self, $class; 90 57 } 91 58 92 sub reset {93 my $self = shift (@_);94 95 $self->{'num_docs'} = 0;96 $self->{'num_sections'} = 0;97 $self->{'num_processed_bytes'} = 0;98 
$self->{'num_bytes'} = 0;99 }100 101 sub get_num_docs {102 my $self = shift (@_);103 104 return $self->{'num_docs'};105 }106 107 sub get_num_sections {108 my $self = shift (@_);109 110 return $self->{'num_sections'};111 }112 113 # num_bytes is the actual number of bytes in the collection114 # this is normally the same as what's processed during text compression115 sub get_num_bytes {116 my $self = shift (@_);117 118 return $self->{'num_bytes'};119 }120 121 # num_processed_bytes is the number of bytes actually passed122 # to mgpp for the current index123 sub get_num_processed_bytes {124 my $self = shift (@_);125 126 return $self->{'num_processed_bytes'};127 }128 129 sub set_output_handle {130 my $self = shift (@_);131 my ($handle) = @_;132 133 $self->{'output_handle'} = $handle;134 }135 136 sub set_mode {137 my $self = shift (@_);138 my ($mode) = @_;139 140 $self->{'mode'} = $mode;141 }142 143 sub set_assocdir {144 my $self = shift (@_);145 my ($assocdir) = @_;146 147 $self->{'assocdir'} = $assocdir;148 }149 150 sub set_dontgdbm {151 my $self = shift (@_);152 my ($dontgdbm) = @_;153 154 $self->{'dontgdbm'} = $dontgdbm;155 }156 157 sub set_index {158 my $self = shift (@_);159 my ($index, $indexexparr) = @_;160 161 $self->{'index'} = $index;162 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;163 }164 165 sub set_index_languages {166 my $self = shift (@_);167 my ($lang_meta, $langarr) = @_;168 $self->{'lang_meta'} = $lang_meta;169 $self->{'langarr'} = $langarr;170 }171 172 sub get_index {173 my $self = shift (@_);174 175 return $self->{'index'};176 }177 178 sub set_classifiers {179 my $self = shift (@_);180 my ($classifiers) = @_;181 182 $self->{'classifiers'} = $classifiers;183 }184 185 sub set_indexing_text {186 my $self = shift (@_);187 my ($indexing_text) = @_;188 189 $self->{'indexing_text'} = $indexing_text;190 }191 192 sub get_indexing_text {193 my $self = shift (@_);194 195 return $self->{'indexing_text'};196 }197 198 sub set_store_text {199 my 
$self = shift (@_);200 my ($store_text) = @_;201 202 $self->{'store_text'} = $store_text;203 }204 205 sub get_doc_list {206 my $self = shift(@_);207 208 return @{$self->{'doclist'}};209 }210 59 211 60 sub set_indexfieldmap { … … 235 84 } 236 85 237 sub process { 238 my $self = shift (@_); 239 my $method = $self->{'mode'}; 240 241 $self->$method(@_); 242 } 243 244 # use 'Paged' if document has no more than 2 levels 245 # and each section at second level has a number for 246 # Title metadata 247 # also use Paged if gsdlthistype metadata is set to Paged 248 sub get_document_type { 249 my $self = shift (@_); 250 my ($doc_obj) = @_; 251 252 my $thistype = "VList"; 253 my $childtype = "VList"; 254 my $title; 255 my @tmp = (); 256 257 my $section = $doc_obj->get_top_section (); 258 259 my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype"); 260 if (defined $gsdlthistype) { 261 if ($gsdlthistype eq "Paged") { 262 $childtype = "Paged"; 263 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) { 264 $thistype = "Paged"; 265 } else { 266 $thistype = "Invisible"; 267 } 268 269 return ($thistype, $childtype); 270 } elsif ($gsdlthistype eq "Hierarchy") { 271 return ($thistype, $childtype); # use VList, VList 272 } 273 } 274 my $first = 1; 275 while (defined $section) { 276 @tmp = split /\./, $section; 277 if (scalar(@tmp) > 1) { 278 return ($thistype, $childtype); 279 } 280 if (!$first) { 281 $title = $doc_obj->get_metadata_element ($section, "Title"); 282 if (!defined $title || $title !~ /^\d+$/) { 283 return ($thistype, $childtype); 284 } 285 } 286 $first = 0; 287 $section = $doc_obj->get_next_section($section); 288 } 289 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) { 290 $thistype = "Paged"; 291 } else { 292 $thistype = "Invisible"; 293 } 294 $childtype = "Paged"; 295 return ($thistype, $childtype); 296 } 297 298 sub assoc_files { 299 my $self = shift (@_); 300 my ($doc_obj, $archivedir) = @_; 301 my ($afile); 302 303 foreach my 
$assoc_file (@{$doc_obj->get_assoc_files()}) { 304 # if assoc file starts with a slash, we put it relative to the assoc 305 # dir, otherwise it is relative to the HASH... directory 306 if ($assoc_file->[1] =~ m@^[/\\]@) { 307 $afile = &util::filename_cat($self->{'assocdir'},$assoc_file->[1]); 308 } else { 309 $afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]); 310 } 311 &util::hard_link ($assoc_file->[0], $afile); 312 } 313 } 314 315 sub infodb { 316 my $self = shift (@_); 317 my ($doc_obj, $filename) = @_; 318 my $handle = $self->{'output_handle'}; 319 320 my $doctype = $doc_obj->get_doc_type(); 321 322 # only output this document if it is one to be indexed 323 return if ($doctype ne "indexed_doc"); 324 86 87 sub get_gdbm_level { 88 my $self = shift (@_); 89 325 90 #if a Section level index is not built, the gdbm file should be at doc 326 91 #level not Section 327 my $docs_only = 1;328 92 if ($self->{'levels'}->{'section'}) { 329 $docs_only = 0; 330 } 331 332 my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/; 333 $archivedir = "" unless defined $archivedir; 334 $archivedir =~ s/\\/\//g; 335 $archivedir =~ s/^\/+//; 336 $archivedir =~ s/\/+$//; 337 338 # resolve the final filenames of the files associated with this document 339 $self->assoc_files ($doc_obj, $archivedir); 340 341 #GRB: moved 1/06/2004 from GRB01062004 342 #add this document to the browse structure 343 push(@{$self->{'doclist'}},$doc_obj->get_OID()) 344 unless ($doctype eq "classification"); 345 346 # classify this document 347 &classify::classify_doc ($self->{'classifiers'}, $doc_obj); 348 #GRB: end of moved block 349 350 # this is another document 351 $self->{'num_docs'} += 1 unless ($doctype eq "classification"); 352 353 # is this a paged or a hierarchical document 354 my ($thistype, $childtype) = $self->get_document_type ($doc_obj); 355 356 my $section = $doc_obj->get_top_section (); 357 my $doc_OID = $doc_obj->get_OID(); 358 my $first = 1; 359 my $url = ""; 
360 while (defined $section) { 361 # update a few statistics 362 $self->{'num_bytes'} += $doc_obj->get_text_length ($section); 363 $self->{'num_sections'} += 1 unless ($doctype eq "classification"); 364 365 # output the section name 366 if ($section eq "") { print $handle "[$doc_OID]\n"; } 367 else { print $handle "[$doc_OID.$section]\n"; } 368 369 # output the fact that this document is a document (unless doctype 370 # has been set to something else from within a plugin 371 my $dtype = $doc_obj->get_metadata_element ($section, "doctype"); 372 if (!defined $dtype || $dtype !~ /\w/) { 373 print $handle "<doctype>doc\n"; 374 } 375 376 # output whether this node contains text 377 if ($doc_obj->get_text_length($section) > 0) { 378 print $handle "<hastxt>1\n"; 379 } else { 380 print $handle "<hastxt>0\n"; 381 } 382 383 # output all the section metadata 384 my $metadata = $doc_obj->get_all_metadata ($section); 385 foreach my $pair (@$metadata) { 386 my ($field, $value) = (@$pair); 387 388 if ($field ne "Identifier" && $field !~ /^gsdl/ && 389 defined $value && $value ne "") { 390 391 # escape problematic stuff 392 $value =~ s/\\/\\\\/g; 393 $value =~ s/\n/\\n/g; 394 $value =~ s/\r/\\r/g; 395 396 # special case for URL metadata 397 if ($field =~ /^URL$/i) { 398 $url .= "[$value]\n"; 399 if ($section eq "") {$url .= "<section>$doc_OID\n";} 400 else {$url .= "<section>$doc_OID.$section\n";} 401 $url .= '-' x 70 . 
"\n"; 402 } 403 404 if (!defined $self->{'dontgdbm'}->{$field}) { 405 print $handle "<$field>$value\n"; 406 } 407 } 408 } 409 410 # output archivedir if at top level 411 if ($section eq $doc_obj->get_top_section()) { 412 print $handle "<archivedir>$archivedir\n"; 413 } 414 415 # output document display type 416 if ($first) { 417 print $handle "<thistype>$thistype\n"; 418 } 419 420 if (!$docs_only) { 421 # output a list of children 422 my $children = $doc_obj->get_children ($section); 423 if (scalar(@$children) > 0) { 424 print $handle "<childtype>$childtype\n"; 425 print $handle "<contains>"; 426 my $firstchild = 1; 427 foreach my $child (@$children) { 428 print $handle ";" unless $firstchild; 429 $firstchild = 0; 430 if ($child =~ /^.*?\.(\d+)$/) { 431 print $handle "\".$1"; 432 } else { 433 print $handle "\".$child"; 434 } 435 # if ($child eq "") { print $handle "$doc_OID"; } 436 # elsif ($section eq "") { print $handle "$doc_OID.$child"; } 437 # else { print $handle "$doc_OID.$section.$child"; } 438 } 439 print $handle "\n"; 440 } 441 #output the matching doc number 442 print $handle "<docnum>$self->{'num_sections'}\n"; 443 444 } # if (!$docs_only) 445 else { #docs only, doc num is num_docs not num_sections 446 # output the matching document number 447 print $handle "<docnum>$self->{'num_docs'}\n"; 448 } 449 450 print $handle '-' x 70, "\n"; 451 452 453 # output a database entry for the document number 454 if ($docs_only) { 455 print $handle "[$self->{'num_docs'}]\n"; 456 print $handle "<section>$doc_OID\n"; 457 } 458 else { 459 print $handle "[$self->{'num_sections'}]\n"; 460 if ($section eq "") { print $handle "<section>$doc_OID\n"; } 461 else { print $handle "<section>$doc_OID.$section\n"; } 462 } 463 print $handle '-' x 70, "\n"; 464 465 # output entry for url 466 if ($url ne "") { 467 print $handle $url; 468 } 469 470 $first = 0; 471 $section = $doc_obj->get_next_section($section); 472 last if ($docs_only); # if no sections wanted, only gdbm the docs 473 } 
474 475 #GRB01062004: see code above moved from here 476 } 93 return "section"; 94 } 95 return "document"; 96 } 97 477 98 478 99 #sub find_paragraphs { … … 533 154 my $handle = $self->{'output_handle'}; 534 155 my $outhandle = $self->{'outhandle'}; 535 my $indexed_doc = 1;536 156 537 157 # only output this document if it is one to be indexed 538 158 return if ($doc_obj->get_doc_type() ne "indexed_doc"); 539 159 540 # see if this document belongs to this subcollection 541 foreach my $indexexp (@{$self->{'indexexparr'}}) { 542 $indexed_doc = 0; 543 my ($field, $exp, $options) = split /\//, $indexexp; 544 if (defined ($field) && defined ($exp)) { 545 my ($bool) = $field =~ /^(.)/; 546 $field =~ s/^.// if $bool eq '!'; 547 if ($field =~ /^filename$/i) { 548 $field = $doc_obj->get_source_filename(); 549 } else { 550 $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field); 551 } 552 next unless defined $field; 553 if ($bool eq '!') { 554 if ($options =~ /^i$/i) { 555 if ($field !~ /$exp/i) {$indexed_doc = 1; last;} 556 } else { 557 if ($field !~ /$exp/) {$indexed_doc = 1; last;} 558 } 559 } else { 560 if ($options =~ /^i$/i) { 561 if ($field =~ /$exp/i) {$indexed_doc = 1; last;} 562 } else { 563 if ($field =~ /$exp/) {$indexed_doc = 1; last;} 564 } 565 } 566 } 567 } 568 569 # if this doc is so far in the sub collection, and we have lang info, 570 # now we check the languages to see if it matches 571 if($indexed_doc && defined $self->{'lang_meta'}) { 572 $indexed_doc = 0; 573 my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'}); 574 if (defined $field) { 575 foreach my $lang (@{$self->{'langarr'}}) { 576 my ($bool) = $lang =~ /^(.)/; 577 if ($bool eq '!') { 578 $lang =~ s/^.//; 579 if ($field !~ /$lang/) { 580 $indexed_doc = 1; last; 581 } 582 } else { 583 if ($field =~ /$lang/) { 584 $indexed_doc = 1; last; 585 } 586 } 587 } 588 } 589 } 590 160 my $indexed_doc = $self->is_subcollection_doc($doc_obj); 161 591 
162 # this is another document 592 163 $self->{'num_docs'} += 1;
Note: See TracChangeset for help on using the changeset viewer.