Changeset 17564 for gsdl/trunk


Ignore:
Timestamp:
2008-10-20T15:33:25+13:00 (16 years ago)
Author:
kjdon
Message:

Fixed up some stuff to do with indexfieldmap. Still working on it, but I want to commit what I've done so far.

Location:
gsdl/trunk/perllib
Files:
4 edited

Legend:

Unmodified
Added
Removed
  • gsdl/trunk/perllib/basebuildproc.pm

    r17111 r17564  
    144144    $self->{'num_docs'}      = 0;
    145145    $self->{'num_sections'}  = 0;
    146     $self->{'num_bytes'}     = 0;
     146    # reconstructed docs have no text, just metadata, so we need to
     147    # remember how many bytes we had initially
     148    $self->{'num_bytes'}     = $self->{'starting_num_bytes'};
    147149   
    148150    $self->{'num_processed_bytes'} = 0;
     
    425427    # update a few statistics
    426428    $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
     429    print STDERR "num bytes added = ".$doc_obj->get_text_length ($section)."\n";
    427430    $self->{'num_sections'} += 1 unless ($doctype eq "classification");
    428431
  • gsdl/trunk/perllib/lucenebuilder.pm

    r17286 r17564  
    4242
    4343use mgppbuilder;
    44 use strict; no strict 'refs';
     44use strict;
     45no strict 'refs';
    4546
    4647
     
    168169    $self->{'buildproc'}->set_index ($textindex);
    169170    $self->{'buildproc'}->set_indexing_text (0);
    170     $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
     171    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    171172    $self->{'buildproc'}->set_levels ($levels);
    172173    $self->{'buildproc'}->set_db_level ($db_level);
     
    339340    $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);
    340341    $self->{'buildproc'}->set_indexing_text (1);
    341     $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
     342    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    342343    $self->{'buildproc'}->set_levels ($local_levels);
    343344    $self->{'buildproc'}->set_db_level($db_level);
  • gsdl/trunk/perllib/mgppbuilder.pm

    r17110 r17564  
    6363               'wa'=>1);
    6464
    65 # change this so a user can add their own ones in via a file or cfg
    66 #add AND, OR, NOT NEAR to this list - these cannot be used as field names
    67 #also add the level names (Doc, Sec, Para)
    68 our %static_indexfield_map = ('Title'=>'TI',
    69               'TI'=>1,
    70               'Subject'=>'SU',
    71               'SU'=>1,
    72               'Creator'=>'CR',
    73               'CR'=>1,
    74               'Organization'=>'ORG',
    75               'ORG'=>1,
    76               'Source'=>'SO',
    77               'SO'=>1,
    78               'Howto'=>'HT',
    79               'HT'=>1,
    80               'ItemTitle'=>'IT',
    81               'IT'=>1,
    82               'ProgNumber'=>'PN',
    83               'PN'=>1,
    84               'People'=>'PE',
    85               'PE'=>1,
    86               'Coverage'=>'CO',
    87               'CO'=>1,
    88               'allfields'=>'ZZ',
    89               'ZZ'=>1,
    90               'text'=>'TX',
    91               'TX'=>1,
    92               'AND'=>1,
    93               'OR'=>1,
    94               'NOT'=>1,
    95               'NEAR'=>1,
    96               'Doc'=>1,
    97               'Sec'=>1,
    98               'Para'=>1);
    9965
    10066my $maxdocsize = $basebuilder::maxdocsize;
     
    10672    $self = bless $self, $class;
    10773
    108     $self->{'indexfieldmap'} = \%static_indexfield_map;
     74    #$self->{'indexfieldmap'} = \%static_indexfield_map;
    10975
    11076    # get the levels (Section, Paragraph) for indexing and compression
     
    248214    $self->{'buildproc'}->set_index ($textindex);
    249215    $self->{'buildproc'}->set_indexing_text (0);
    250     $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
     216    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    251217    $self->{'buildproc'}->set_levels ($self->{'levels'});                     
    252218    $self->{'buildproc'}->set_db_level ($db_level);                       
     
    506472    $self->{'buildproc'}->set_index_languages ($language_metadata, $langarr) if (defined $language);
    507473    $self->{'buildproc'}->set_indexing_text (1);
    508     $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
     474    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    509475    $self->{'buildproc'}->set_levels ($self->{'levels'});
    510476    $self->{'buildproc'}->set_db_level ($db_level);   
     
    797763    my @indexmap = ();
    798764
     765    print STDERR "in final field list\n";
    799766    if (scalar(keys %{$self->{'buildproc'}->{'indexfieldmap'}}) == 0) {
    800767    # set the default mapping
    801     $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
     768    #$self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    802769    }
    803770    # we read the stuff in from the build.cfg file - if its there
     
    822789
    823790    if (defined $buildcfg->{'indexfieldmap'}) {
     791    print STDERR "found index field map\n";
    824792    foreach $field (@{$buildcfg->{'indexfieldmap'}}) {
    825793        push (@indexfieldmap, "$field");
  • gsdl/trunk/perllib/mgppbuildproc.pm

    r17117 r17564  
    4646          'paragraph'=>'Para');
    4747
     48# change this so a user can add their own ones in via a file or cfg
     49#add AND, OR, NOT NEAR to this list - these cannot be used as field names
     50#also add the level names (Doc, Sec, Para)
     51our %static_indexfield_map = ('Title'=>'TI',
     52              'TI'=>1,
     53              'Subject'=>'SU',
     54              'SU'=>1,
     55              'Creator'=>'CR',
     56              'CR'=>1,
     57              'Organization'=>'ORG',
     58              'ORG'=>1,
     59              'Source'=>'SO',
     60              'SO'=>1,
     61              'Howto'=>'HT',
     62              'HT'=>1,
     63              'ItemTitle'=>'IT',
     64              'IT'=>1,
     65              'ProgNumber'=>'PN',
     66              'PN'=>1,
     67              'People'=>'PE',
     68              'PE'=>1,
     69              'Coverage'=>'CO',
     70              'CO'=>1,
     71              'allfields'=>'ZZ',
     72              'ZZ'=>1,
     73              'text'=>'TX',
     74              'TX'=>1,
     75              'AND'=>1,
     76              'OR'=>1,
     77              'NOT'=>1,
     78              'NEAR'=>1,
     79              'Doc'=>1,
     80              'Sec'=>1,
     81              'Para'=>1);
     82
     83
    4884sub new {
    4985    my $class = shift @_;
     
    6298
    6399
    64 sub set_indexfieldmap {
    65     my $self = shift (@_);
    66     my ($indexmap) = @_;
    67 
    68     $self->{'indexfieldmap'} = $indexmap;
    69 }
     100#sub set_indexfieldmap {
     101#    my $self = shift (@_);
     102#    my ($indexmap) = @_;
     103
     104#    $self->{'default_index_field_mapping'} = $indexmap;
     105    #$self->{'indexfieldmap'} = $indexmap;
     106#}
    70107
    71108sub get_indexfieldmap {
     
    243280       
    244281        # metadata - output all metadata we know about except gsdl stuff
     282        # each metadata is in a separate index field
    245283        elsif ($real_field eq "metadata") {
    246284        my $shortname = "";
     
    263301                $self->{'indexfieldmap'}->{$shortname} = 1;
    264302            }     
     303            # should this line only be done if the following test is true?
    265304            $new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
    266305            if (!defined $self->{'indexfields'}->{$mfield}) {
     
    274313        # a comma separated list
    275314        my $shortname="";
     315        my $new_field = 0; # have we found a new field name?
     316
    276317        if (defined $self->{'indexfieldmap'}->{$real_field}) {
    277318            $shortname = $self->{'indexfieldmap'}->{$real_field};
     
    279320        else {
    280321            $shortname = $self->create_shortname($real_field);
    281             $self->{'indexfieldmap'}->{$real_field} = $shortname;
    282             $self->{'indexfieldmap'}->{$shortname} = 1;
     322            $new_field = 1; # we want to record this shortname, but only if we have actually found some metadata values
    283323        }
    284         # we only want one tag around the index
    285         $new_text .= "$paratag<$shortname>";
    286         my @metadata_list = ();
     324        my @metadata_list = (); # put any meta values in here
     325        my $section_text = ""; # put any text in here
    287326        foreach my $submeta (split /,/, $real_field) {
    288             if ($submeta eq "text") {
    289             my $section_text = $doc_obj->get_text($section);
    290             if ($self->{'indexing_text'}) {
    291                 if ($paratag ne "") {
    292                 # we fiddle around with splitting text into paragraphs
    293                 $new_text .= "</$shortname>$paratag<$shortname>\n";
    294                 $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "</$shortname>$paratag<$shortname>");
     327            if ($submeta eq "text") {
     328            # no point in indexing text more than once
     329            if ($section_text eq "") {
     330                $section_text = $doc_obj->get_text($section);
     331                if ($self->{'indexing_text'}) {
     332                if ($paratag ne "") {
     333                    # we fiddle around with splitting text into paragraphs
     334                    $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "</$shortname>$paratag<$shortname>");
     335                }
     336                else {
     337                    $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "");
     338                }
    295339                }
    296                 else {
    297                 $section_text = $self->preprocess_text($section_text, $self->{'strip_html'}, "");
    298                 }
    299                 $new_text .= "$section_text</$shortname><$shortname>\n";
    300340            }
    301             else {
    302                             # leave html stuff in, and don't add Paragraph tags - never retrieve paras at the moment
    303                 $new_text .= $section_text;
    304             }
    305341            }
    306342            else {
     343            # its a metadata element
    307344            my @section_metadata = @{$doc_obj->get_metadata ($section, $submeta)};
    308345            if ($section ne $doc_obj->get_top_section() && $self->{'indexing_text'} && defined ($self->{'sections_index_document_metadata'})) {
     
    313350            push (@metadata_list, @section_metadata);
    314351            }
     352        } # for each field in index
     353
     354
     355        # now we add the text and/or the metadata into new_text
     356        if ($section_text ne "" || scalar(@metadata_list)) {
     357            $new_text .= "$paratag<$shortname>";
     358           
     359            if ($section_text ne "") {
     360            $new_text .= "$section_text ";
     361            if ($paratag ne "" && scalar(@metadata_list)) {
     362                $new_text .= "</$shortname>$paratag<$shortname>";
     363            }
     364            }
     365            foreach my $item (@metadata_list) {
     366            $new_text .= "$item ";
     367            }
     368            $new_text .= "</$shortname>";
     369           
     370            if ($new_field) {
     371            # we need to add to the list in indexfields
     372
     373            $self->{'indexfieldmap'}->{$real_field} = $shortname;
     374            $self->{'indexfieldmap'}->{$shortname} = 1;
     375            }
    315376        }
    316         foreach my $item (@metadata_list) {
    317             #$new_text .= "$paratag<$shortname>$item</$shortname>\n";
    318             $new_text .= "$item ";
    319         }
    320         $new_text .= "</$shortname>";
    321377        }
    322        
     378
    323379        # filter the text
    324380        $new_text = $self->filter_text ($field, $new_text);
     
    341397   
    342398    my ($realname) = @_;
    343     #take the first two chars
     399    # try our predefined static mapping
     400    if (defined $static_indexfield_map{$realname}) {
     401    return $static_indexfield_map{$realname};
     402    }
     403    #try the first two chars
    344404    my $shortname;
    345405    if ($realname =~ /^[^\w]*(\w)[^\w]*(\w)/) {
     
    354414    #if already used, take the first and third letdigs and so on
    355415    my $count = 1;
    356     while (defined $self->{'indexfieldmap'}->{$shortname}) {
     416    while (defined $self->{'indexfieldmap'}->{$shortname} || defined $static_indexfield_map{$shortname}) {
    357417    if ($realname =~ /^[^\w]*(\w)([^\w]*\w){$count}[^\w]*(\w)/) {
    358418        $shortname = "$1$3";
Note: See TracChangeset for help on using the changeset viewer.