Changeset 17568 for gsdl


Ignore:
Timestamp:
2008-10-22T13:27:39+13:00 (16 years ago)
Author:
kjdon
Message:

Recoded the text method so that it more closely matches the mgpp one. The ZZ (allfields) field now only contains content that has already been indexed, not all metadata. The 'metadata' index no longer reindexes metadata that has already been indexed by a specified field. indexfieldmap now only contains entries for fields that have actually been indexed.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/lucenebuildproc.pm

    r17287 r17568  
    8686    my $ldoc_level = $levels->{'document'};
    8787    my $lsec_level = $levels->{'section'};
    88     my $lpar_level = $levels->{'paragraph'};
     88    #my $lpar_level = $levels->{'paragraph'};
    8989
    9090    my $gs2_id = "";
     
    107107    my $sec_tag_name = "";
    108108    if ($lsec_level)
    109       {
     109    {
    110110    $sec_tag_name = $mgppbuildproc::level_map{'section'};
    111       }
    112     my ($parastarttag) = "";
    113     my ($paraendtag) = "";
    114     if ($self->{'levels'}->{'paragraph'})
    115       {
    116     if ($self->{'strip_html'})
    117           {
    118         $parastarttag = "<".$mgppbuildproc::level_map{'paragraph'}.">";
    119         $paraendtag = "</".$mgppbuildproc::level_map{'paragraph'}.">";
    120           }
    121         else
    122           {
    123         print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
    124           }
    125       }
     111    }
    126112
    127113    my $doc_section = 0; # just for this document
     
    132118    my $section = $doc_obj->get_top_section();
    133119    while (defined $section)
    134       {
     120    {
    135121    # update a few statistics
    136122    $doc_section++;
     
    155141
    156142    $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
    157     foreach my $field (split (/;/, $fields))
    158           {
     143
     144    # has the user added a 'metadata' index?
     145    my $all_metadata_specified = 0;
     146    # which fields have already been indexed? (same as fields, but in a map)
     147    my $specified_fields = {};
     148   
     149    # do we have an allfields index??
     150    my $allfields_index = 0;
     151    # collect up all the text for it in here
     152    my $allfields_text = "";
     153    foreach my $field (split (/;/, $fields)) {
     154        if ($field eq "allfields") {
     155        $allfields_index = 1;
     156        } elsif ($field eq "metadata") {
     157        $all_metadata_specified = 1;
     158        }
     159    }
     160   
     161    foreach my $field (split (/;/, $fields)) {
     162       
    159163        # only deal with this field if it doesn't start with top or
    160164        # this is the first section
    161165        my $real_field = $field;
    162166        next if (($real_field =~ s/^top//) && ($doc_section != 1));
    163 
    164         my $new_text = "";
    165         my $tmp_text = "";
    166 
    167         # If allfields is requested add all metadata fields and text as
    168             # belonging to the ZZ field
    169         if ($real_field eq "allfields") {
    170               # Text first - no html nor paragraph tags
    171               $new_text .= "$parastarttag<ZZ index=\"1\">\n";
    172               $tmp_text = $self->preprocess_text($doc_obj->get_text ($section), 1, "");
    173               &ghtml::htmlsafe($tmp_text);
    174               $new_text .= "$tmp_text</ZZ>$paraendtag\n";
    175               # Then Metadata
    176               my $metadata = $doc_obj->get_all_metadata ($section);
    177               foreach my $pair (@$metadata) {
    178                 my ($mfield, $mvalue) = (@$pair);
    179                 &ghtml::htmlsafe($mvalue);
    180                 # check fields here, maybe others dont want - change to use dontindex!!
    181                 if ($mfield ne "Identifier"
    182                     && $mfield !~ /^gsdl/
    183                     && $mfield ne "classifytype"
    184                     && $mfield ne "assocfilepath"
    185                     && defined $mvalue && $mvalue ne "") {
    186                   $new_text .= "$parastarttag<ZZ index=\"1\">$mvalue</ZZ>$paraendtag\n";
    187                 }
    188                 if (!defined $self->{'indexfields'}->{$mfield}) {
    189                   $self->{'indexfields'}->{$mfield} = 1;
    190                 }
    191               }
     167       
     168        # process these two later
     169        next if ($real_field eq "allfields" || $real_field eq "metadata");
     170       
     171        #individual metadata and or text specified - could be a comma separated list
     172        $specified_fields->{$real_field} = 1;
     173        my $shortname="";
     174        my $new_field = 0; # have we found a new field name?
     175        if (defined $self->{'indexfieldmap'}->{$real_field}) {
     176        $shortname = $self->{'indexfieldmap'}->{$real_field};
    192177        }
    193         # metadata - output all metadata we know about except gsdl stuff
    194         elsif ($real_field eq "metadata" || $real_field eq "allfields") {
    195         my $shortname = "";
    196         my $metadata = $doc_obj->get_all_metadata ($section);
    197         foreach my $pair (@$metadata) {
    198             my ($mfield, $mvalue) = (@$pair);
    199                     &ghtml::htmlsafe($mvalue);
    200             # check fields here, maybe others dont want - change to use dontindex!!
    201             if ($mfield ne "Identifier"
    202             && $mfield !~ /^gsdl/
    203             && $mfield ne "classifytype"
    204             && $mfield ne "assocfilepath"
    205             && defined $mvalue && $mvalue ne "") {
    206 
    207             if (defined $self->{'indexfieldmap'}->{$mfield}) {
    208                 $shortname = $self->{'indexfieldmap'}->{$mfield};
     178        else {
     179        $shortname = $self->create_shortname($real_field);
     180        $new_field = 1;
     181        }
     182
     183        my @metadata_list = (); # put any metadata values in here
     184        my $section_text = ""; # put the text in here
     185        foreach my $submeta (split /,/, $real_field) {
     186        if ($submeta eq "text") {
     187            # no point in indexing text more than once
     188            if ($section_text eq "") {
     189            $section_text = $doc_obj->get_text($section);
     190            if ($self->{'indexing_text'}) {
     191                # we always strip html
     192                $section_text = $self->preprocess_text($section_text, 1, "");
    209193            }
    210             else {
    211                 $shortname = $self->create_shortname($mfield);
    212                 $self->{'indexfieldmap'}->{$mfield} = $shortname;
    213                 $self->{'indexfieldmap'}->{$shortname} = 1;
    214             }
    215             $new_text .= "$parastarttag<$shortname index=\"1\">$mvalue</$shortname>$paraendtag\n";
    216             if (!defined $self->{'indexfields'}->{$mfield}) {
    217                 $self->{'indexfields'}->{$mfield} = 1;
     194            else {
     195                # leave html stuff in, but escape the tags
     196                &ghtml::htmlsafe($section_text);
    218197            }
    219198            }
    220199        }
    221         }
    222         else {
    223         #individual metadata and or text specified - could be a comma separated list
    224         my $shortname="";
    225         if (defined $self->{'indexfieldmap'}->{$real_field}) {
    226             $shortname = $self->{'indexfieldmap'}->{$real_field};
    227         }
    228200        else {
    229             $shortname = $self->create_shortname($real_field);
     201            # its a metadata element
     202            my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
     203            if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
     204            if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
     205                push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
     206            }
     207            }
     208            push (@metadata_list, @section_metadata);
     209        }
     210        } # for each field in this one index
     211       
     212        # now we add the text and/or metadata into new_text
     213        if ($section_text ne "" || scalar(@metadata_list)) {
     214        my $new_text = "";
     215       
     216        if ($section_text ne "") {
     217            $new_text .= "$section_text ";
     218        }
     219       
     220        foreach my $item (@metadata_list) {
     221            &ghtml::htmlsafe($item);
     222            $new_text .= "$item ";
     223        }
     224
     225        if ($allfields_index) {
     226            $allfields_text .= $new_text;
     227        }
     228
     229        $new_text = "<$shortname index=\"1\">$new_text</$shortname>";
     230        # filter the text
     231        $new_text = $self->filter_text ($field, $new_text);
     232        $self->{'num_processed_bytes'} += length ($new_text);
     233       
     234        $text .= "$new_text";
     235
     236        if ($new_field) {
     237            # we need to add to the list in indexfields
     238           
    230239            $self->{'indexfieldmap'}->{$real_field} = $shortname;
    231240            $self->{'indexfieldmap'}->{$shortname} = 1;
    232241        }
    233 
    234         my @metadata_list = ();
    235         foreach my $submeta (split /,/, $real_field) {
    236             if ($submeta eq "text") {
    237             my $section_text = $doc_obj->get_text($section);
    238             if ($self->{'indexing_text'}) {
    239                             # tag the text with <Text>...</Text>, add the <Paragraph> tags and always strip out HTML
    240                 $new_text .= "$parastarttag<$shortname index=\"1\">\n";
    241                 if ($parastarttag ne "") {
    242                 $section_text = $self->preprocess_text($section_text, 1, "</$shortname>$paraendtag$parastarttag<$shortname index=\"1\">");
    243                 }
    244                 else {
    245                 # we don't want to individually tag each paragraph if not doing para indexing
    246                 $section_text = $self->preprocess_text($section_text, 1, "");
    247                 }
    248                 $new_text .= "$section_text</$shortname>$paraendtag\n";
    249             }
    250             else { # leave html stuff in, but escape the tags, and dont add Paragraph tags - never retrieve paras at the moment
    251                 $tmp_text .= $doc_obj->get_text ($section);
    252                 &ghtml::htmlsafe($tmp_text);
    253                 $new_text .= $tmp_text;
    254             }
    255             }
    256             else {
    257             my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
    258             if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
    259                 if ($self->{'sections_index_document_metadata'} eq "always" || ( scalar(@section_metadata) == 0 && $self->{'sections_index_document_metadata'} eq "unless_section_metadata_exists")) {
    260                 push (@section_metadata, @{$doc_obj->get_metadata ($doc_obj->get_top_section(), $submeta)});
    261                 }
    262             }
    263             push (@metadata_list, @section_metadata);
    264             }
    265         }
    266         foreach my $item (@metadata_list) {
    267             &ghtml::htmlsafe($item);
    268             $new_text .= "$parastarttag<$shortname index=\"1\">$item</$shortname>$paraendtag\n";
    269         }
     242       
     243        }
     244       
     245    } # foreach field
     246
     247
     248    if ($all_metadata_specified) {
     249       
     250        my $new_text = "";
     251        my $shortname = "";
     252        my $metadata = $doc_obj->get_all_metadata ($section);
     253        foreach my $pair (@$metadata) {
     254        my ($mfield, $mvalue) = (@$pair);
     255        # no value
     256        next unless defined $mvalue && $mvalue ne "";
     257        # we have already indexed this
     258        next if defined ($specified_fields->{$mfield});
     259        # check fields here, maybe others dont want - change to use dontindex!!
     260        next if ($mfield eq "Identifier" || $mfield eq "classifytype" || $mfield eq "assocfilepath");
     261        next if ($mfield =~ /^gsdl/);
     262       
     263        &ghtml::htmlsafe($mvalue);
     264       
     265        if (defined $self->{'indexfieldmap'}->{$mfield}) {
     266            $shortname = $self->{'indexfieldmap'}->{$mfield};
     267        }
     268        else {
     269            $shortname = $self->create_shortname($mfield);
     270            $self->{'indexfieldmap'}->{$mfield} = $shortname;
     271            $self->{'indexfieldmap'}->{$shortname} = 1;
     272        }     
     273        $new_text .= "<$shortname index=\"1\">$mvalue</$shortname>\n";
     274        if ($allfields_index) {
     275            $allfields_text .= "$mvalue ";
     276        }
     277
     278        if (!defined $self->{'indexfields'}->{$mfield}) {
     279            $self->{'indexfields'}->{$mfield} = 1;
     280        }                   
     281       
    270282        }
    271283        # filter the text
    272         $new_text = $self->filter_text ($field, $new_text);
     284        $new_text = $self->filter_text ("metadata", $new_text);
     285       
    273286        $self->{'num_processed_bytes'} += length ($new_text);
    274 
    275287        $text .= "$new_text";
    276     } # foreach field
    277 
     288
     289       
     290    }
     291
     292    if ($allfields_index) {
     293        # add the index name mapping
     294        $self->{'indexfieldmap'}->{"allfields"} = "ZZ";
     295        $self->{'indexfieldmap'}->{"ZZ"} = 1;
     296       
     297        my $new_text = "<ZZ index=\"1\">$allfields_text</ZZ>\n";
     298        # filter the text
     299        $new_text = $self->filter_text ("allfields", $new_text);
     300       
     301        $self->{'num_processed_bytes'} += length ($new_text);
     302        $text .= "$new_text";
     303    }
     304   
    278305    $text .= "\n</$sec_tag_name>\n" if ($sec_tag_name ne "");
    279306
Note: See TracChangeset for help on using the changeset viewer.