Changeset 4769
- Timestamp: 2003-06-23T15:01:38+12:00 (21 years ago)
- File: 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/mgppbuildproc.pm
r3834 r4769 79 79 $self->{'dontindex'} = {}; 80 80 $self->{'indexfieldmap'} = {}; 81 $self->{'indexfields'} = {}; 81 $self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index 82 82 $self->{'strip_html'}=1; 83 83 … … 444 444 } 445 445 446 sub find_paragraphs {447 $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;448 }446 #sub find_paragraphs { 447 # $_[1] =~ s/(<p\b)/<Paragraph>$1/gi; 448 #} 449 449 450 450 #this function strips the html tags from the doc if ($strip_html) and … … 456 456 my $self = shift (@_); 457 457 my ($text, $strip_html, $para) = @_; 458 459 458 my ($outtext) = ""; 460 459 if ($strip_html) { … … 464 463 $outtext .= $`." "; #add everything before the matched tag 465 464 $text = $'; #everything after the matched tag 466 if ($para && $tag =~ /^\s*p\s/ ) {467 $outtext .= "<Paragraph> ";465 if ($para && $tag =~ /^\s*p\s/i) { 466 $outtext .= $para; 468 467 } 469 468 elsif ($tag =~ /^pre$/) { # a pre tag … … 480 479 } #if strip_html 481 480 482 if ($para) {483 $text =~ s/(<p\b)/<Paragraph>$1/gi;484 return $text;485 }481 #if ($para) { 482 #$text =~ s/(<p\b)/$para$1/gi; 483 #return $text; 484 # } 486 485 return $text; 487 486 } … … 501 500 my ($doc_obj) = @_; 502 501 my $handle = $self->{'output_handle'}; 502 my $outhandle = $self->{'outhandle'}; 503 503 my $indexed_doc = 1; 504 504 … … 541 541 my ($fields) = $self->{'index'}; 542 542 543 my ($documenttag) = ""; 544 my($documentendtag) = ""; 545 #if ($self->{'levels'}->{'Document'}) { 546 $documenttag = "\n<Document>\n"; 547 $documentendtag = "</Document>\n"; 548 #} 543 549 my ($sectiontag) = ""; 544 550 if ($self->{'levels'}->{'Section'}) { … … 547 553 my ($paratag) = ""; 548 554 if ($self->{'levels'}->{'Paragraph'}) { 549 $paratag = "<Paragraph>"; 555 if ($self->{'strip_html'}) { 556 $paratag = "<Paragraph>"; 557 } else { 558 print $outhandle "Paragraph level can not be used with no_strip_html!. 
Not indexing Paragraphs.\n"; 559 } 550 560 } 551 561 my $doc_section = 0; # just for this document 552 my $text = "<Document>\n"; 562 563 my $text = $documenttag; 553 564 554 565 # get the text for this document … … 558 569 $doc_section++; 559 570 $self->{'num_sections'} += 1; 560 $text .= $sectiontag;571 $text .= "$sectiontag"; 561 572 562 573 if ($indexed_doc) { 574 if ($self->{'indexing_text'}) { 575 $text .= "$paratag"; # only add para tags for indexing 576 # note that we assume that metadata will not be asked for for the compressed text, so we add para tags without checking for indexing_text 577 } 563 578 $self->{'num_bytes'} += $doc_obj->get_text_length ($section); 564 579 foreach my $field (split (/,/, $fields)) { … … 571 586 if ($real_field eq "text") { 572 587 if ($self->{'indexing_text'}) { #tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed 573 $new_text .= " <TX>\n";574 $tmp_text .= $doc_obj->get_text ($section) if $self->{'store_text'};575 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, $self->{'levels'}->{'Paragraph'});576 588 $new_text .= "$paratag<TX>\n"; 589 $tmp_text .= $doc_obj->get_text ($section); 590 $tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</TX>$paratag<TX>"); 591 577 592 $new_text .= "$tmp_text</TX>\n"; 578 if (!defined $self->{'indexfields'}->{'TextOnly'}) {579 $self->{'indexfields'}->{'TextOnly'} = 1;580 }593 #if (!defined $self->{'indexfields'}->{'TextOnly'}) { 594 #$self->{'indexfields'}->{'TextOnly'} = 1; 595 #} 581 596 } 582 597 else { # leave html stuff in, and dont add Paragraph tags - never retrieve paras at the moment 583 598 $new_text .= $doc_obj->get_text ($section) if $self->{'store_text'}; 584 #if ($self->{'levels'}->{'Paragraph'}) {585 #$self->find_paragraphs($new_text);586 #}587 599 } 588 600 } else { # metadata field 589 if ($real_field eq "metadata") { # insert all metadata 601 if ($real_field eq "allfields") { #ignore 602 } 603 elsif 
($real_field eq "metadata") { # insert all metadata 590 604 #except gsdl stuff 591 605 my $shortname = ""; … … 617 631 else { #individual metadata specified 618 632 my $shortname=""; 619 if (!defined $self->{'indexfields'}->{$real_field}) {620 $self->{'indexfields'}->{$real_field} = 1;621 }633 #if (!defined $self->{'indexfields'}->{$real_field}) { 634 #$self->{'indexfields'}->{$real_field} = 1; 635 #} 622 636 if (defined $self->{'indexfieldmap'}->{$real_field}) { 623 637 $shortname = $self->{'indexfieldmap'}->{$real_field}; … … 646 660 $section = $doc_obj->get_next_section($section); 647 661 } #while defined section 648 print $handle "$text\n</Document>\n"; 662 print $handle "$text\n$documentendtag"; 663 649 664 } 650 665
Note: See TracChangeset for help on using the changeset viewer.