Changeset 17564 for gsdl/trunk
- Timestamp:
- 2008-10-20T15:33:25+13:00 (16 years ago)
- Location:
- gsdl/trunk/perllib
- Files:
-
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
gsdl/trunk/perllib/basebuildproc.pm
r17111 r17564 144 144 $self->{'num_docs'} = 0; 145 145 $self->{'num_sections'} = 0; 146 $self->{'num_bytes'} = 0; 146 # reconstructed docs have no text, just metadata, so we need to 147 # remember how many bytes we had initially 148 $self->{'num_bytes'} = $self->{'starting_num_bytes'}; 147 149 148 150 $self->{'num_processed_bytes'} = 0; … … 425 427 # update a few statistics 426 428 $self->{'num_bytes'} += $doc_obj->get_text_length ($section); 429 print STDERR "num bytes added = ".$doc_obj->get_text_length ($section)."\n"; 427 430 $self->{'num_sections'} += 1 unless ($doctype eq "classification"); 428 431 -
gsdl/trunk/perllib/lucenebuilder.pm
r17286 r17564 42 42 43 43 use mgppbuilder; 44 use strict; no strict 'refs'; 44 use strict; 45 no strict 'refs'; 45 46 46 47 … … 168 169 $self->{'buildproc'}->set_index ($textindex); 169 170 $self->{'buildproc'}->set_indexing_text (0); 170 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});171 #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 171 172 $self->{'buildproc'}->set_levels ($levels); 172 173 $self->{'buildproc'}->set_db_level ($db_level); … … 339 340 $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language); 340 341 $self->{'buildproc'}->set_indexing_text (1); 341 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});342 #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 342 343 $self->{'buildproc'}->set_levels ($local_levels); 343 344 $self->{'buildproc'}->set_db_level($db_level); -
gsdl/trunk/perllib/mgppbuilder.pm
r17110 r17564 63 63 'wa'=>1); 64 64 65 # change this so a user can add their own ones in via a file or cfg66 #add AND, OR, NOT NEAR to this list - these cannot be used as field names67 #also add the level names (Doc, Sec, Para)68 our %static_indexfield_map = ('Title'=>'TI',69 'TI'=>1,70 'Subject'=>'SU',71 'SU'=>1,72 'Creator'=>'CR',73 'CR'=>1,74 'Organization'=>'ORG',75 'ORG'=>1,76 'Source'=>'SO',77 'SO'=>1,78 'Howto'=>'HT',79 'HT'=>1,80 'ItemTitle'=>'IT',81 'IT'=>1,82 'ProgNumber'=>'PN',83 'PN'=>1,84 'People'=>'PE',85 'PE'=>1,86 'Coverage'=>'CO',87 'CO'=>1,88 'allfields'=>'ZZ',89 'ZZ'=>1,90 'text'=>'TX',91 'TX'=>1,92 'AND'=>1,93 'OR'=>1,94 'NOT'=>1,95 'NEAR'=>1,96 'Doc'=>1,97 'Sec'=>1,98 'Para'=>1);99 65 100 66 my $maxdocsize = $basebuilder::maxdocsize; … … 106 72 $self = bless $self, $class; 107 73 108 $self->{'indexfieldmap'} = \%static_indexfield_map;74 #$self->{'indexfieldmap'} = \%static_indexfield_map; 109 75 110 76 # get the levels (Section, Paragraph) for indexing and compression … … 248 214 $self->{'buildproc'}->set_index ($textindex); 249 215 $self->{'buildproc'}->set_indexing_text (0); 250 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});216 #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 251 217 $self->{'buildproc'}->set_levels ($self->{'levels'}); 252 218 $self->{'buildproc'}->set_db_level ($db_level); … … 506 472 $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language); 507 473 $self->{'buildproc'}->set_indexing_text (1); 508 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});474 #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 509 475 $self->{'buildproc'}->set_levels ($self->{'levels'}); 510 476 $self->{'buildproc'}->set_db_level ($db_level); … … 797 763 my @indexmap = (); 798 764 765 print STDERR "in final field list\n"; 799 766 if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) { 800 767 # set the default mapping 801 $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});768 #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'}); 802 769 } 803 770 # we read the stuff in from the build.cfg file - if its there … … 822 789 823 790 if (defined $buildcfg->{'indexfieldmap'}) { 791 print STDERR "found index field map\n"; 824 792 foreach $field (@{$buildcfg->{'indexfieldmap'}}) { 825 793 push (@indexfieldmap, "$field"); -
gsdl/trunk/perllib/mgppbuildproc.pm
r17117 r17564 46 46 'paragraph'=>'Para'); 47 47 48 # change this so a user can add their own ones in via a file or cfg 49 #add AND, OR, NOT NEAR to this list - these cannot be used as field names 50 #also add the level names (Doc, Sec, Para) 51 our %static_indexfield_map = ('Title'=>'TI', 52 'TI'=>1, 53 'Subject'=>'SU', 54 'SU'=>1, 55 'Creator'=>'CR', 56 'CR'=>1, 57 'Organization'=>'ORG', 58 'ORG'=>1, 59 'Source'=>'SO', 60 'SO'=>1, 61 'Howto'=>'HT', 62 'HT'=>1, 63 'ItemTitle'=>'IT', 64 'IT'=>1, 65 'ProgNumber'=>'PN', 66 'PN'=>1, 67 'People'=>'PE', 68 'PE'=>1, 69 'Coverage'=>'CO', 70 'CO'=>1, 71 'allfields'=>'ZZ', 72 'ZZ'=>1, 73 'text'=>'TX', 74 'TX'=>1, 75 'AND'=>1, 76 'OR'=>1, 77 'NOT'=>1, 78 'NEAR'=>1, 79 'Doc'=>1, 80 'Sec'=>1, 81 'Para'=>1); 82 83 48 84 sub new { 49 85 my $class = shift @_; … … 62 98 63 99 64 sub set_indexfieldmap { 65 my $self = shift (@_); 66 my ($indexmap) = @_; 67 68 $self->{'indexfieldmap'} = $indexmap; 69 } 100 #sub set_indexfieldmap { 101 # my $self = shift (@_); 102 # my ($indexmap) = @_; 103 104 # $self->{'default_index_field_mapping'} = $indexmap; 105 #$self->{'indexfieldmap'} = $indexmap; 106 #} 70 107 71 108 sub get_indexfieldmap { … … 243 280 244 281 # metadata - output all metadata we know about except gsdl stuff 282 # each metadata is in a separate index field 245 283 elsif ($real_field eq "metadata") { 246 284 my $shortname = ""; … … 263 301 $self->{'indexfieldmap'}->{$shortname} = 1; 264 302 } 303 # should this line only be done if the following test is true? 265 304 $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n"; 266 305 if (!defined $self->{'indexfields'}->{$mfield}) { … … 274 313 # a comma separated list 275 314 my $shortname=""; 315 my $new_field = 0; # have we found a new field name? 316 276 317 if (defined $self->{'indexfieldmap'}->{$real_field}) { 277 318 $shortname = $self->{'indexfieldmap'}->{$real_field}; … … 279 320 else { 280 321 $shortname = $self->create_shortname($real_field); 281 $self->{'indexfieldmap'}->{$real_field} = $shortname; 282 $self->{'indexfieldmap'}->{$shortname} = 1; 322 $new_field = 1; # we want to record this shortname, but only if we have actually found some metadata values 283 323 } 284 # we only want one tag around the index 285 $new_text .= "$paratag<$shortname>"; 286 my @metadata_list = (); 324 my @metadata_list = (); # put any meta values in here 325 my $section_text = ""; # put any text in here 287 326 foreach my $submeta (split /,/, $real_field) { 288 if ($submeta eq "text") { 289 my $section_text = $doc_obj->get_text($section); 290 if ($self->{'indexing_text'}) { 291 if ($paratag ne "") { 292 # we fiddle around with splitting text into paragraphs 293 $new_text .= "</$shortname>$paratag<$shortname>\n"; 294 $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "</$shortname>$paratag<$shortname>"); 327 if ($submeta eq "text") { 328 # no point in indexing text more than once 329 if ($section_text eq "") { 330 $section_text = $doc_obj->get_text($section); 331 if ($self->{'indexing_text'}) { 332 if ($paratag ne "") { 333 # we fiddle around with splitting text into paragraphs 334 $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "</$shortname>$paratag<$shortname>"); 335 } 336 else { 337 $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, ""); 338 } 295 339 } 296 else {297 $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "");298 }299 $new_text .= "$section_text</$shortname><$shortname>\n";300 340 } 301 else {302 # leave html stuff in, and don't add Paragraph tags - never retrieve paras at the moment303 $new_text .= $section_text;304 }305 341 } 306 342 else { 343 # its a metadata element 307 344 my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)}; 308 345 if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) { … … 313 350 push (@metadata_list, @section_metadata); 314 351 } 352 } # for each field in index 353 354 355 # now we add the text and/or the metadata into new_text 356 if ($section_text ne "" || scalar(@metadata_list)) { 357 $new_text .= "$paratag<$shortname>"; 358 359 if ($section_text ne "") { 360 $new_text .= "$section_text "; 361 if ($paratag ne "" && scalar(@metadata_list)) { 362 $new_text .= "</$shortname>$paratag<$shortname>"; 363 } 364 } 365 foreach my $item (@metadata_list) { 366 $new_text .= "$item "; 367 } 368 $new_text .= "</$shortname>"; 369 370 if ($new_field) { 371 # we need to add to the list in indexfields 372 373 $self->{'indexfieldmap'}->{$real_field} = $shortname; 374 $self->{'indexfieldmap'}->{$shortname} = 1; 375 } 315 376 } 316 foreach my $item (@metadata_list) {317 #$new_text .= "$paratag<$shortname>$item</$shortname>\n";318 $new_text .= "$item ";319 }320 $new_text .= "</$shortname>";321 377 } 322 378 323 379 # filter the text 324 380 $new_text = $self->filter_text ($field, $new_text); … … 341 397 342 398 my ($realname) = @_; 343 #take the first two chars 399 # try our predefined static mapping 400 if (defined $static_indexfield_map{$realname}) { 401 return $static_indexfield_map{$realname}; 402 } 403 #try the first two chars 344 404 my $shortname; 345 405 if ($realname =~ /^[^\w]*(\w)[^\w]*(\w)/) { … … 354 414 #if already used, take the first and third letdigs and so on 355 415 my $count = 1; 356 while (defined $self->{'indexfieldmap'}->{$shortname} ) {416 while (defined $self->{'indexfieldmap'}->{$shortname} || defined $static_indexfield_map{$shortname}) { 357 417 if ($realname =~ /^[^\w]*(\w)([^\w]*\w){$count}[^\w]*(\w)/) { 358 418 $shortname = "$1$3";
Note:
See TracChangeset
for help on using the changeset viewer.