Changeset 12844 for trunk/gsdl/perllib/lucenebuildproc.pm
- Timestamp:
- 2006-09-25T14:17:10+12:00 (18 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/lucenebuildproc.pm
r12426 r12844 31 31 # Use same basic XML structure setup by mgppbuilder/mgppbuildproc 32 32 33 use mgppbuildproc; 33 use mgppbuildproc; 34 34 use ghtml; 35 35 use strict; … … 37 37 38 38 39 use IncrementalBuildUtils; 40 39 41 sub BEGIN { 40 42 @lucenebuildproc::ISA = ('mgppbuildproc'); … … 45 47 my $class = shift @_; 46 48 my $self = new mgppbuildproc (@_); 49 50 $self->{'numincdocs'} = 0; 47 51 48 52 return bless $self, $class; … … 86 90 87 91 my $docid=""; 88 if ($ldoc_level) { 89 if ($self->{'gdbm_level'} eq 'document') { 90 my $doc_sec_num = $self->{'num_docs'}; 92 if ($ldoc_level) 93 { 94 if ($self->{'gdbm_level'} eq 'document') 95 { 96 my $doc_sec_num = $self->{'num_docs'}; 91 97 $docid = "gs2:id=\"$doc_sec_num\""; 92 } else { 98 } 99 else 100 { 93 101 # default is section level 94 my $doc_sec_num = $self->{'num_sections'} +1;102 my $doc_sec_num = $self->{'num_sections'} + 1; 95 103 $docid = "gs2:id=\"$doc_sec_num\""; 96 97 }104 } 105 } 98 106 my $documenttag = "<$doc_level $gs2ns file=\"$file\" $docid >\n"; 99 107 my $documentendtag = "\n</$doc_level>\n"; 100 101 my ($sectiontag) = ""; 102 if ($lsec_level) { 108 109 my ($sectiontag) = ""; 110 if ($lsec_level) 111 { 103 112 $sectiontag = $mgppbuildproc::level_map{'section'}; 104 }113 } 105 114 my ($parastarttag) = ""; 106 115 my ($paraendtag) = ""; 107 if ($self->{'levels'}->{'paragraph'}) { 108 if ($self->{'strip_html'}) { 116 if ($self->{'levels'}->{'paragraph'}) 117 { 118 if ($self->{'strip_html'}) 119 { 109 120 $parastarttag = "<".$mgppbuildproc::level_map{'paragraph'}.">"; 110 121 $paraendtag = "</".$mgppbuildproc::level_map{'paragraph'}.">"; 111 } else { 122 } 123 else 124 { 112 125 print $outhandle "Paragraph level can not be used with no_strip_html!. 
Not indexing Paragraphs.\n"; 113 114 }115 126 } 127 } 128 116 129 my $doc_section = 0; # just for this document 117 130 … … 120 133 # get the text for this document 121 134 my $section = $doc_obj->get_top_section(); 122 while (defined $section) { 135 while (defined $section) 136 { 123 137 # update a few statistics 124 138 $doc_section++; 125 $self->{'num_sections'} += 1; 126 127 if ($sectiontag ne "") { 128 my $secid = "gs2:id=\"".$self->{'num_sections'}."\""; 139 $self->{'num_sections'}++; 140 141 if ($sectiontag ne "") 142 { 143 my $secid = "gs2:id=\"".$self->{'num_sections'}."\""; 129 144 $text .= "\n<$sectiontag $secid >\n"; 130 145 } 131 146 132 147 # if we are doing subcollections, then some docs shouldn't be indexed. 133 # but we need to put the section tag placeholders in there so the 148 # but we need to put the section tag placeholders in there so the 134 149 # sections match up with gdbm db 135 150 my $indexed_section = $doc_obj->get_metadata_element($section, "gsdldoctype") || "indexed_section"; 136 151 if (!$indexed_doc || ($indexed_section ne "indexed_section" && $indexed_section ne "indexed_doc")) { 137 152 $text .= "\n</$sectiontag>\n" if ($sectiontag ne ""); 138 153 $section = $doc_obj->get_next_section($section); 139 154 next; 140 141 155 } 156 142 157 $self->{'num_bytes'} += $doc_obj->get_text_length ($section); 143 foreach my $field (split (/;/, $fields)) { 158 foreach my $field (split (/;/, $fields)) 159 { 144 160 # only deal with this field if it doesn't start with top or 145 161 # this is the first section … … 147 163 next if (($real_field =~ s/^top//) && ($doc_section != 1)); 148 164 149 my $new_text = ""; 150 151 # we get allfields by default - do nothing 152 if ($real_field eq "allfields") { 153 154 } 165 my $new_text = ""; 166 my $tmp_text = ""; 167 168 # If allfields is requested add all metadata fields and text as 169 # belonging to the ZZ field 170 if ($real_field eq "allfields") { 171 # Text first - no html nor paragraph tags 172 
$new_text .= "$parastarttag<ZZ index=\"1\">\n"; 173 $tmp_text = $self->preprocess_text($doc_obj->get_text ($section), 1, ""); 174 &ghtml::htmlsafe($tmp_text); 175 $new_text .= "$tmp_text</ZZ>$paraendtag\n"; 176 # Then Metadata 177 my $metadata = $doc_obj->get_all_metadata ($section); 178 foreach my $pair (@$metadata) { 179 my ($mfield, $mvalue) = (@$pair); 180 &ghtml::htmlsafe($mvalue); 181 # check fields here, maybe others dont want - change to use dontindex!! 182 if ($mfield ne "Identifier" 183 && $mfield !~ /^gsdl/ 184 && $mfield ne "classifytype" 185 && $mfield ne "assocfilepath" 186 && defined $mvalue && $mvalue ne "") { 187 $new_text .= "$parastarttag<ZZ index=\"1\">$mvalue</ZZ>$paraendtag\n"; 188 } 189 if (!defined $self->{'indexfields'}->{$mfield}) { 190 $self->{'indexfields'}->{$mfield} = 1; 191 } 192 } 193 } 155 194 # metadata - output all metadata we know about except gsdl stuff 156 elsif ($real_field eq "metadata" ) {195 elsif ($real_field eq "metadata" || $real_field eq "allfields") { 157 196 my $shortname = ""; 158 197 my $metadata = $doc_obj->get_all_metadata ($section); 159 198 foreach my $pair (@$metadata) { 160 199 my ($mfield, $mvalue) = (@$pair); 200 &ghtml::htmlsafe($mvalue); 161 201 # check fields here, maybe others dont want - change to use dontindex!! 
162 202 if ($mfield ne "Identifier" … … 165 205 && $mfield ne "assocfilepath" 166 206 && defined $mvalue && $mvalue ne "") { 167 207 168 208 if (defined $self->{'indexfieldmap'}->{$mfield}) { 169 209 $shortname = $self->{'indexfieldmap'}->{$mfield}; … … 173 213 $self->{'indexfieldmap'}->{$mfield} = $shortname; 174 214 $self->{'indexfieldmap'}->{$shortname} = 1; 175 } 215 } 176 216 $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n"; 177 217 if (!defined $self->{'indexfields'}->{$mfield}) { 178 218 $self->{'indexfields'}->{$mfield} = 1; 179 } 219 } 180 220 } 181 221 } 182 222 } 183 else { 223 else { 184 224 #individual metadata and or text specified - could be a comma separated list 185 225 my $shortname=""; … … 192 232 $self->{'indexfieldmap'}->{$shortname} = 1; 193 233 } 194 234 195 235 my @metadata_list = (); 196 236 foreach my $submeta (split /,/, $real_field) { … … 209 249 $new_text .= "$section_text</$shortname>$paraendtag\n"; 210 250 } 211 else { 212 # leave html stuff in, but escape the tags, and don't add Paragraph tags - never retrieve paras at the moment 213 &ghtml::htmlsafe($ section_text);214 $new_text .= $ section_text;251 else { # leave html stuff in, but escape the tags, and dont add Paragraph tags - never retrieve paras at the moment 252 $tmp_text .= $doc_obj->get_text ($section); 253 &ghtml::htmlsafe($tmp_text); 254 $new_text .= $tmp_text; 215 255 } 216 256 } … … 229 269 } 230 270 } 231 232 271 # filter the text 233 272 $self->filter_text ($field, $new_text); 234 273 $self->{'num_processed_bytes'} += length ($new_text); 274 235 275 $text .= "$new_text"; 236 276 } # foreach field 237 277 238 278 $text .= "\n</$sectiontag>\n" if ($sectiontag ne ""); 239 279 240 280 $section = $doc_obj->get_next_section($section); 241 281 } #while defined section 242 print $handle "$text\n$documentendtag"; 282 print $handle "$text\n$documentendtag"; 243 283 #print STDOUT "$text\n$documentendtag"; 244 284 } 245 285 286 # /** We make 
# /** Makes this builder pretend to be a document processor so we can get
#  *  information back from the plugins.
#  *
#  *  When the build mode reported by get_mode() is "incinfodb", every section
#  *  of the document is annotated with metadata (doctype, hastxt, archivedir,
#  *  thistype, childtype, contains, docnum) and the document is then handed
#  *  to IncrementalBuildUtils::addDocument(). In every other mode the call is
#  *  passed straight through to mgppbuildproc::process().
#  *
#  *  @param  $self    A reference to this Lucene builder
#  *  @param  $doc_obj A reference to a document object representing what was
#  *                   parsed by the GAPlug
#  *  @param  $file    The name of the file parsed as a string
#  *  @return whatever the delegated call returns (addDocument() in incinfodb
#  *          mode, mgppbuildproc::process() otherwise)
#  *
#  *  @author John Thompson, DL Consulting Ltd
#  */
sub process
{
    # NOTE: no "()" prototype here. Prototypes are ignored for method calls,
    # and an empty one would reject any function-style call with arguments.
    my $self = shift (@_);
    my ($doc_obj, $file) = @_;

    # If this is called from any stage other than an incremental infodb we
    # want to pass through to the superclass of build. @_ now holds only
    # ($doc_obj, $file) because $self was shifted off above.
    if ($self->get_mode() ne "incinfodb")
    {
        return $self->mgppbuildproc::process(@_);
    }

    print STDERR "*** Processing a document added using INCINFODB ***\n";

    # Directory part of $file: forward slashes only, no leading or trailing
    # slash, and "" when $file contains no directory separator at all.
    my ($archivedir) = $file =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
    $archivedir = "" unless defined $archivedir;
    $archivedir =~ s/\\/\//g;
    $archivedir =~ s/^\/+//;
    $archivedir =~ s/\/+$//;

    # Number of files.
    # NOTE(review): presumably get_assoc_files() returns an array reference;
    # count its elements in that case so the message shows a number rather
    # than a reference string. Any other return value is printed unchanged.
    my $assoc_files = $doc_obj->get_assoc_files();
    my $num_assoc_files = (ref($assoc_files) eq 'ARRAY')
        ? scalar(@$assoc_files)
        : $assoc_files;
    print STDERR "There are " . $num_assoc_files . " associated documents...\n";

    # resolve the final filenames of the files associated with this document
    $self->assoc_files ($doc_obj, $archivedir);

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    # Determine the actual docnum by checking if we've processed any
    # previous incrementally added documents. If so, carry on from there.
    # Otherwise we set the counter to be the same as the number of
    # sections encountered during the previous build
    if ($self->{'numincdocs'} == 0)
    {
        $self->{'numincdocs'} = $self->{'starting_num_sections'} + 1;
    }

    my $top_section = $doc_obj->get_top_section ();
    my $section = $top_section;
    print STDERR "+ top section: '$section'\n";

    # Currently unused; the call is retained in case get_OID() has side
    # effects - TODO confirm and remove.
    my $doc_OID = $doc_obj->get_OID();

    while (defined $section)
    {
        print STDERR "+ processing section: '$section'\n";

        # Attach all the other metadata to this document.
        # Output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin). The value
        # written is the literal "doc"; writing $dtype back would store the
        # very undefined/blank value that triggered this branch.
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/)
        {
            $doc_obj->add_utf8_metadata($section, "doctype", "doc");
        }

        # output whether this node contains text
        my $has_text = ($doc_obj->get_text_length($section) > 0) ? 1 : 0;
        $doc_obj->add_utf8_metadata($section, "hastxt", $has_text);

        # output archivedir if at top level
        if ($section eq $doc_obj->get_top_section())
        {
            $doc_obj->add_utf8_metadata($section, "archivedir", $archivedir);
            $doc_obj->add_utf8_metadata($section, "thistype", $thistype);
        }

        # output a list of children
        my $children = $doc_obj->get_children ($section);
        if (scalar(@$children) > 0)
        {
            $doc_obj->add_utf8_metadata($section, "childtype", $childtype);
            my @contains = ();
            foreach my $child (@$children)
            {
                # Each entry is a double quote followed by ".N", where N is
                # the trailing numeric component of the child identifier;
                # identifiers without one are appended whole.
                if ($child =~ /^.*?\.(\d+)$/)
                {
                    push (@contains, "\".$1");
                }
                else
                {
                    push (@contains, "\".$child");
                }
            }
            $doc_obj->add_utf8_metadata($section, "contains", join(";", @contains));
        }

        # output the matching doc number
        print STDERR "+ docnum=" . $self->{'numincdocs'} . "\n";
        $doc_obj->add_utf8_metadata($section, "docnum", $self->{'numincdocs'});

        $self->{'numincdocs'}++;
        $section = $doc_obj->get_next_section($section);

        # if no sections wanted, only gdbm the docs
        last if ($self->{'gdbm_level'} eq "document");
    }

    print STDERR "\n*** incrementally add metadata from document at: " . $file . "\n";
    return IncrementalBuildUtils::addDocument($self->{'collection'}, $doc_obj, $doc_obj->get_top_section());
}
# /** process() **/

# Returns the value stored under the 'num_docs' key of this builder.
sub get_num_docs {
    my $self = shift (@_);
    #print STDERR "get_num_docs(): $self->{'num_docs'}\n";
    return $self->{'num_docs'};
}

# Returns the value stored under the 'num_sections' key of this builder.
sub get_num_sections {
    my $self = shift (@_);
    #print STDERR "get_num_sections(): $self->{'num_sections'}\n";
    return $self->{'num_sections'};
}

# num_bytes is the actual number of bytes in the collection
# this is normally the same as what's processed during text compression
sub get_num_bytes {
    my $self = shift (@_);
    #print STDERR "get_num_bytes(): $self->{'num_bytes'}\n";
    return $self->{'num_bytes'};
}

1;
Note:
See TracChangeset
for help on using the changeset viewer.