Changeset 4811


Ignore:
Timestamp:
2003-06-25T15:57:49+12:00 (21 years ago)
Author:
kjdon
Message:

Levels are now specified using upper or lower case, e.g. Section or section. If levels aren't specified, document is used; otherwise only what is specified is used, e.g. "levels section" will give only the section level. We use Doc, Sec and Para when passing the levels to mgpp. The build.cfg file now contains indexlevels and textlevel entries - these give the actual names used by mgpp, and mean that the C++ code no longer has to assume them. Collection metadata can be specified for the levels; otherwise _textdocument_, _textsection_ and _textparagraph_ will be used.

Location:
trunk/gsdl/perllib
Files:
2 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/mgppbuilder.pm

    r4794 r4811  
    4848$maxdocsize = 12000;
    4949
     50%level_map = ('document'=>'Doc',
     51          'section'=>'Sec',
     52          'paragraph'=>'Para',
     53          'Doc'=>'_textdocument_',
     54          'Sec'=>'_textsection_',
     55          'Para'=>'_textparagraph_');
     56
     57#$doc_level = "Doc";
     58#$sec_level = "Sec";
     59#$para_level = "Para";
    5060
    5161%wanted_index_files = ('td'=>1,
     
    6474# change this so a user can add their own ones in via a file or cfg
    6575#add AND, OR, NOT NEAR to this list - these cannot be used as field names
     76#also add the level names (Doc, Sec, Para)
    6677%static_indexfield_map = ('Title'=>'TI',
    6778              'TI'=>1,
     
    89100              'OR'=>1,
    90101              'NOT'=>1,
    91               'NEAR'=>1);
     102              'NEAR'=>1,
     103              'Doc'=>1,
     104              'Sec'=>1,
     105              'Para'=>1);
    92106
    93107sub new {
     
    170184    # get the levels (Section, Paragraph) for indexing and compression
    171185    $self->{'levels'} = {};
     186    $self->{'levelorder'} = ();
    172187    if (defined $self->{'collect_cfg'}->{'levels'}) {
    173188        foreach $level ( @{$self->{'collect_cfg'}->{'levels'}} ){
     189        $level =~ tr/A-Z/a-z/;
    174190            $self->{'levels'}->{$level} = 1;
     191        push (@{$self->{'levelorder'}}, $level);
    175192        }
    176     } 
    177 
     193    } else { # default to document
     194    $self->{'levels'}->{'document'} = 1;
     195    push (@{$self->{'levelorder'}}, 'document');
     196    }
     197   
     198    $self->{'doc_level'} = "document";
     199    if (! $self->{'levels'}->{'document'}) {
     200    if ($self->{'levels'}->{'section'}) {
     201        $self->{'doc_level'} = "section";
     202    } else {
     203        die "you must have either document or section level specified!!\n";
     204    }
     205    }
     206    print $outhandle "doclevel = ". $self->{'doc_level'}."\n";
    178207    # get the list of plugins for this collection
    179208    my $plugins = [];
     
    274303
    275304
    276     # define the section names for mgpasses
     305    # define the section names and possibly the doc name for mgpasses
    277306    # the compressor doesn't need to know about paragraphs - never want to
    278307    # retrieve them
    279308    my $mgpp_passes_sections = "";
    280     if ($self->{'levels'}->{'Section'}) {
    281     $mgpp_passes_sections .= "-K Section ";
     309    my ($doc_level) = $self->{'doc_level'};
     310    $mgpp_passes_sections .= "-J " . %level_map->{$doc_level};
     311    foreach $level (keys %{$self->{'levels'}}) {
     312    if ($level ne $doc_level && $level ne "paragraph") {
     313        $mgpp_passes_sections .= "-K " . %level_map->{$level};
     314    }
    282315    }
    283316   
     
    543576
    544577    # define the section names for mgpasses
     578    # define the section names and possibly the doc name for mgpasses
    545579    my $mgpp_passes_sections = "";
    546     foreach $level (keys (%{$self->{'levels'}})) {
    547     if ($level eq "Section" || $level eq "Paragraph") {
    548         $mgpp_passes_sections .= "-K $level ";
     580    my ($doc_level) = $self->{'doc_level'};
     581    $mgpp_passes_sections .= "-J " . %level_map->{$doc_level} ." ";
     582   
     583    foreach $level (keys %{$self->{'levels'}}) {
     584    if ($level ne $doc_level) {
     585        $mgpp_passes_sections .= "-K " . %level_map->{$level}. " ";
    549586    }
    550587    }
     
    773810    } # foreach collmeta key
    774811    }
    775     #add the indexfieldmap macros to [collection]
     812    #add the index field macros to [collection]
    776813    # eg <TI>Title
    777814    #    <SU>Subject
     
    779816    $field_entry="";
    780817    foreach $longfield (@{$self->{'build_cfg'}->{'indexfields'}}){
    781     print $outhandle "doing long field $longfield\n";
    782818    $shortfield = $self->{'buildproc'}->{'indexfieldmap'}->{$longfield};
    783819    next if $shortfield eq 1;
     
    786822    my $collmeta = ".$longfield";
    787823    if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
    788         print $outhandle "coll meta $collmeta defined\n";
    789824        $metadata_entry = $self->create_language_db_map($collmeta, $shortfield);
    790825        $field_entry .= $metadata_entry;
     
    801836    print $handle $field_entry;
    802837   
     838    # now add the level names
     839    $level_entry = "";
     840    foreach $level (@{$self->{'collect_cfg'}->{'levels'}}) {
     841    my $collmeta = ".$level"; # based on the original specification
     842    $level =~ tr/A-Z/a-z/; # make it lower case
     843    my $levelid = %level_map->{$level}; # find the actual value we used in the index
     844    if ($collmetadefined && defined $self->{'collect_cfg'}->{'collectionmeta'}->{$collmeta}) {
     845        $metadata_entry = $self->create_language_db_map($collmeta, $levelid);
     846        $level_entry .= $metadata_entry;
     847    } else {
     848        # use the default macro
     849        $level_entry .= "<$levelid>" . %level_map->{$levelid} . "\n";
     850    }
     851    }
     852    print $handle $level_entry;
    803853    #end the collection entry
    804854    print $handle "\n" . ('-' x 70) . "\n";
     
    835885    my $metadata_entry = "";
    836886    my $default="";
    837     print $outhandle "crate for meta $metaname\n";
    838887    #iterate through the languages
    839888    foreach $lang (keys (%{$self->{'collect_cfg'}->{'collectionmeta'}->{$metaname}})) {
    840     print $outhandle "lang=$lang\n";
    841889    if ($first) {
    842890        $first=0;
    843891        #set the default default to the first entry
    844892        $default=$self->{'collect_cfg'}->{'collectionmeta'}->{$metaname}->{$lang};
    845         print $outhandle "defualt = $default\n";
    846893    }
    847894    if ($lang =~ /default/) {
     
    9831030    $build_cfg->{'buildtype'} = "mgpp"; #do we need this??
    9841031   
     1032    # store the level info
     1033    my @indexlevels = ();
     1034    foreach $l (@{$self->{'levelorder'}}) {
     1035    push (@indexlevels, %level_map->{$l});
     1036    }
     1037    $build_cfg->{'indexlevels'} = \@indexlevels;
     1038   
     1039    if ($self->{'levels'}->{'section'}) {
     1040    $build_cfg->{'textlevel'} = %level_map->{'section'};
     1041    } else {   
     1042    $build_cfg->{'textlevel'} = %level_map->{'document'};
     1043    }
    9851044    # store the number of documents and number of bytes
    9861045    $build_cfg->{'numdocs'} = $self->{'buildproc'}->get_num_docs();
     
    10121071    # write out the build information
    10131072    &cfgread::write_cfg_file("$self->{'build_dir'}/build.cfg", $build_cfg,
    1014                  '^(builddate|buildtype|numdocs|numbytes)$',
    1015                              '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields)$');
     1073                 '^(builddate|buildtype|numdocs|numbytes|textlevel)$',
     1074                             '^(indexmap|subcollectionmap|languagemap|indexfieldmap|notbuilt|indexfields|indexlevels)$');
    10161075   
    10171076}
  • trunk/gsdl/perllib/mgppbuildproc.pm

    r4769 r4811  
    4242}
    4343
     44#this must be the same as in mgppbuilder
     45%level_map = ('document'=>'Doc',
     46          'section'=>'Sec',
     47          'paragraph'=>'Para');
    4448
    4549sub new {
     
    298302    #level not Section
    299303    my $docs_only = 1;
    300     if ($self->{'levels'}->{'Section'}) {
     304    if ($self->{'levels'}->{'section'}) {
    301305    $docs_only = 0;
    302306    }
     
    543547    my ($documenttag) = "";
    544548    my($documentendtag) = "";
    545     #if ($self->{'levels'}->{'Document'}) {
    546     $documenttag = "\n<Document>\n";
    547     $documentendtag = "</Document>\n";
    548     #}
     549    if ($self->{'levels'}->{'document'}) {
     550    $documenttag = "\n<". %level_map->{'document'} . ">\n";
     551    $documentendtag = "\n</". %level_map->{'document'} . ">\n";
     552    }
    549553    my ($sectiontag) = "";
    550     if ($self->{'levels'}->{'Section'}) {
    551     $sectiontag = "\n<Section>\n";
     554    if ($self->{'levels'}->{'section'}) {
     555    $sectiontag = "\n<". %level_map->{'section'} . ">\n";
    552556    }
    553557    my ($paratag) = "";
    554     if ($self->{'levels'}->{'Paragraph'}) {
     558    if ($self->{'levels'}->{'paragraph'}) {
    555559    if ($self->{'strip_html'}) {
    556         $paratag = "<Paragraph>";
     560        $paratag = "<". %level_map->{'paragraph'} . ">";
    557561    } else {
    558562        print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
    559563    }
    560564    }
     565
    561566    my $doc_section = 0; # just for this document
    562567   
Note: See TracChangeset for help on using the changeset viewer.