Changeset 1852 for trunk/gsdl/perllib/mgppbuildproc.pm
- Timestamp:
- 2001-01-22T15:30:56+13:00 (23 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/mgppbuildproc.pm
r1772 r1852 1 1 ########################################################################### 2 2 # 3 # mg buildproc.pm --3 # mgppbuildproc.pm -- 4 4 # A component of the Greenstone digital library software 5 5 # from the New Zealand Digital Library Project at the … … 25 25 26 26 # This document processor outputs a document 27 # for mg to process27 # for mgpp to process 28 28 29 29 … … 67 67 $self->{'num_processed_bytes'} = 0; 68 68 $self->{'outhandle'} = $outhandle; 69 $self->{'dontindex'} = {}; 70 $self->{'indexfieldmap'} = {}; 69 71 70 72 $self->{'indexing_text'} = 0; 71 73 $self->{'indexfields'} = {}; 74 $self->{'strip_html'}=1; 75 72 76 73 77 return bless $self, $class; … … 171 175 172 176 return $self->{'indexing_text'}; 177 } 178 179 sub set_indexfieldmap { 180 my $self = shift (@_); 181 my ($indexmap) = @_; 182 183 $self->{'indexfieldmap'} = $indexmap; 184 } 185 186 sub get_indexfieldmap { 187 my $self = shift (@_); 188 189 return $self->{'indexfieldmap'}; 190 } 191 192 sub set_levels { 193 my $self = shift (@_); 194 my ($levels) = @_; 195 196 $self->{'levels'} = $levels; 197 } 198 199 sub set_strip_html { 200 my $self = shift (@_); 201 my ($strip) = @_; 202 $self->{'strip_html'}=$strip; 173 203 } 174 204 … … 238 268 my ($doc_obj, $filename) = @_; 239 269 my $handle = $self->{'output_handle'}; 240 # $handle = "main::STDOUT";241 270 242 271 my $doctype = $doc_obj->get_doc_type(); … … 244 273 # only output this document if it is one to be indexed 245 274 return if ($doctype ne "indexed_doc"); 275 276 #if a Section level index is not built, the gdbm file should be at doc 277 #level not Section 278 my $docs_only = 1; 279 if ($self->{'levels'}->{'Section'}) { 280 $docs_only = 0; 281 } 246 282 247 283 my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/; … … 287 323 288 324 # output all the section metadata 289 #my $found_doctype = 0;290 325 my $metadata = $doc_obj->get_all_metadata ($section); 291 326 foreach $pair (@$metadata) { 292 327 my ($field, $value) = (@$pair); 293 328 294 #$found_doctype = 1 if $field eq "doctype";295 329 if ($field ne "Identifier" && $field !~ /^gsdl/ && 296 330 defined $value && $value ne "") { … … 315 349 } 316 350 317 # output the fact that this document is a document318 # (unless doctype was already output as part of319 # metadata)320 #if (!$found_doctype && !defined $self->{'dontgdbm'}->{'doctype'}) {321 # print $handle "<doctype>doc\n";322 #}323 324 325 326 351 # output archivedir if at top level 327 352 if ($section eq $doc_obj->get_top_section()) { … … 334 359 } 335 360 336 # output a list of children 337 my $children = $doc_obj->get_children ($section); 338 if (scalar(@$children) > 0) { 339 print $handle "<childtype>$childtype\n"; 340 print $handle "<contains>"; 341 my $firstchild = 1; 342 foreach $child (@$children) { 343 print $handle ";" unless $firstchild; 344 $firstchild = 0; 345 if ($child =~ /^.*?\.(\d+)$/) { 346 print $handle "\".$1"; 347 } else { 348 print $handle "\".$child"; 349 } 361 if (!$docs_only) { 362 # output a list of children 363 my $children = $doc_obj->get_children ($section); 364 if (scalar(@$children) > 0) { 365 print $handle "<childtype>$childtype\n"; 366 print $handle "<contains>"; 367 my $firstchild = 1; 368 foreach $child (@$children) { 369 print $handle ";" unless $firstchild; 370 $firstchild = 0; 371 if ($child =~ /^.*?\.(\d+)$/) { 372 print $handle "\".$1"; 373 } else { 374 print $handle "\".$child"; 375 } 350 376 # if ($child eq "") { print $handle "$doc_OID"; } 351 377 # elsif ($section eq "") { print $handle "$doc_OID.$child"; } 352 378 # else { print $handle "$doc_OID.$section.$child"; } 353 } 354 print $handle "\n"; 355 } 356 357 # output the matching document number 358 print $handle "<docnum>$self->{'num_sections'}\n"; 359 379 } 380 print $handle "\n"; 381 } 382 #output the matching doc number 383 print $handle "<docnum>$self->{'num_sections'}\n"; 384 385 } # if (!$docs_only) 386 else { #docs only, doc num is num_docs not num_sections 387 # output the matching document number 388 print $handle "<docnum>$self->{'num_docs'}\n"; 389 } 390 360 391 print $handle '-' x 70, "\n"; 361 392 362 393 363 394 # output a database entry for the document number 364 print $handle "[$self->{'num_sections'}]\n"; 365 if ($section eq "") { print $handle "<section>$doc_OID\n"; } 366 else { print $handle "<section>$doc_OID.$section\n"; } 395 if ($docs_only) { 396 print $handle "[$self->{'num_docs'}]\n"; 397 print $handle "<section>$doc_OID\n"; 398 } 399 else { 400 print $handle "[$self->{'num_sections'}]\n"; 401 if ($section eq "") { print $handle "<section>$doc_OID\n"; } 402 else { print $handle "<section>$doc_OID.$section\n"; } 403 } 367 404 print $handle '-' x 70, "\n"; 368 405 … … 374 411 $first = 0; 375 412 $section = $doc_obj->get_next_section($section); 413 last if ($docs_only); # if no sections wanted, only gdbm the docs 376 414 } 377 415 … … 384 422 $_[1] =~ s/(<p\b)/<Paragraph>$1/gi; 385 423 } 424 425 #this function strips the html tags from the doc if ($strip_html) and 426 # if ($para) replaces <p> with <Paragraph> tags. 427 # if both are false, the original text is returned 428 #assumes that <pre> and </pre> have no spaces, and removes all < and > inside 429 #these tags 430 sub preprocess_text { 431 my $self = shift (@_); 432 my ($text, $strip_html, $para) = @_; 433 434 my ($outtext) = ""; 435 if ($strip_html) { 436 while ($text =~ /<([^>]*)>/ && $text ne "") { 437 438 $tag = $1; 439 $outtext .= $`." "; #add everything before the matched tag 440 $text = $'; #everything after the matched tag 441 if ($para && $tag =~ /^\s*p\s/) { 442 $outtext .= "<Paragraph> "; 443 } 444 elsif ($tag =~ /^pre$/) { # a pre tag 445 $text =~ /<\/pre>/; # find the closing pre tag 446 my $tmp_text = $`; #everything before the closing pre tag 447 $text = $'; #everything after the </pre> 448 $tmp_text =~ s/[<>]//g; # remove all < and > 449 $outtext.= $tmp_text . " "; 450 } 451 } 452 453 $outtext .= $text; # add any remaining text 454 return $outtext; 455 } #if strip_html 456 457 if ($para) { 458 $text =~ s/(<p\b)/<Paragraph>$1/gi; 459 return $text; 460 } 461 return $text; 462 } 463 464 386 465 387 466 sub filter_text { … … 436 515 # get the parameters for the output 437 516 my ($fields) = $self->{'index'}; 438 #print STDERR "fields are $fields\n"; 439 $fields =~ s/\ball\b/Title,Creator,text/; # add in others here 440 517 518 my ($sectiontag) = ""; 519 if ($self->{'levels'}->{'Section'}) { 520 $sectiontag = "\n<Section>\n"; 521 } 522 my ($paratag) = ""; 523 if ($self->{'levels'}->{'Paragraph'}) { 524 $paratag = "<Paragraph>"; 525 } 441 526 my $doc_section = 0; # just for this document 442 527 my $text = ""; … … 455 540 $doc_section++; 456 541 $self->{'num_sections'} += 1; 457 $text .= "<Section>\n"; 542 $text .= $sectiontag; 543 458 544 if ($indexed_doc) { 459 545 $self->{'num_bytes'} += $doc_obj->get_text_length ($section); … … 464 550 if (!($real_field =~ s/^top//) || ($doc_section == 1)) { 465 551 my $new_text = ""; 552 my $tmp_text = ""; 466 553 if ($real_field eq "text") { 467 #print STDERR "in text bit"; 468 #$new_text = "<Paragraph>"; 469 $new_text .= $doc_obj->get_text ($section); 470 #$self->find_paragraphs($new_text); 554 if ($self->{'indexing_text'}) { #tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed 555 $new_text .= "<TX>\n"; 556 $tmp_text .= $doc_obj->get_text ($section); 557 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, $self->{'levels'}->{'Paragraph'}); 558 559 $new_text .= "$tmp_text</TX>\n"; 560 if (!defined $self->{'indexfields'}->{'TextOnly'}) { 561 $self->{'indexfields'}->{'TextOnly'} = 1; 562 } 563 } 564 else { # leave html stuff in, and dont add Paragraph tags - never retrieve paras at the moment 565 $new_text .= $doc_obj->get_text ($section); 566 #if ($self->{'levels'}->{'Paragraph'}) { 567 #$self->find_paragraphs($new_text); 568 #} 569 } 471 570 } else { # metadata field 472 571 if ($real_field eq "metadata") { # insert all metadata 473 474 #print STDERR "in metadata bit\n";572 #except gsdl stuff 573 my $shortname = ""; 475 574 my $metadata = $doc_obj->get_all_metadata ($section); 476 575 foreach $pair (@$metadata) { 477 576 my ($mfield, $mvalue) = (@$pair); 478 #print STDERR "$mfield, $mvalue\n"; 479 # check fields here, maybe others dont want 577 # check fields here, maybe others dont want - change to use dontindex!! 480 578 if ($mfield ne "Identifier" && $mfield ne "classifytype" && 481 579 $mfield !~ /^gsdl/ && defined $mvalue && $mvalue ne "") { 482 483 $new_text .= "<$mfield>$mvalue</$mfield>\n"; 484 #print STDERR "metadata=$mfield:$mvalue"; 485 if (!defined $self->{'indexfields'}->{$mfield}) { 486 $self->{'indexfields'}->{$mfield} = 1; 487 } 580 581 if (defined $self->{'indexfieldmap'}->{$mfield}) { 582 $shortname = $self->{'indexfieldmap'}->{$mfield}; 583 } 584 else { 585 $shortname = $self->create_shortname($mfield); 586 $self->{'indexfieldmap'}->{$mfield} = $shortname; 587 $self->{'indexfieldmap'}->{$shortname} = 1; 588 } 589 $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n"; 590 if (!defined $self->{'indexfields'}->{$mfield}) { 591 $self->{'indexfields'}->{$mfield} = 1; 592 } 488 593 } 489 594 } 490 595 491 596 } 492 597 else { #individual metadata specified 598 my $shortname=""; 493 599 if (!defined $self->{'indexfields'}->{$real_field}) { 494 600 $self->{'indexfields'}->{$real_field} = 1; 495 } 601 } 602 if (defined $self->{'indexfieldmap'}->{$real_field}) { 603 $shortname = $self->{'indexfieldmap'}->{$real_field}; 604 } 605 else { 606 $shortname = $self->create_shortname($real_field); 607 $self->{'indexfieldmap'}->{$real_field} = $shortname; 608 $self->{'indexfieldmap'}->{$shortname} = 1; 609 } 496 610 foreach $item (@{$doc_obj->get_metadata ($section, $real_field)}) { 497 $new_text .= " <$real_field>$item</$real_field>\n";611 $new_text .= "$paratag<$shortname>$item</$shortname>\n"; 498 612 } 499 613 } … … 508 622 $new_text =~ /[\(\)\{\}]/) { 509 623 } 510 624 $self->{'num_processed_bytes'} += length ($new_text); 511 625 $text .= "$new_text"; 512 626 } … … 519 633 } 520 634 635 sub create_shortname { 636 $self = shift(@_); 637 638 my ($realname) = @_; 639 #take the first two chars 640 my ($shortname) = $realname =~ /^(\w\w)/; 641 $shortname =~ tr/a-z/A-Z/; 642 643 #if already used, take the first and third letters and so on 644 $count = 1; 645 while (defined $self->{'indexfieldmap'}->{$shortname}) { 646 if ($realname =~ /^(\w).{$count}(\w)/) { 647 $shortname = "$1$2"; 648 $count++; 649 $shortname =~ tr/a-z/A-Z/; 650 651 } 652 else { 653 $realname =~ s/^.//; 654 $count = 0; 655 } 656 } 657 658 return $shortname; 659 } 660 521 661 1; 522 662
Note:
See TracChangeset
for help on using the changeset viewer.