Changeset 9919 for trunk/gsdl


Ignore:
Timestamp:
2005-05-20T12:24:06+12:00 (19 years ago)
Author:
kjdon
Message:

Made a base buildproc class and shifted most of the buildproc code into it. The subclasses mainly just need to implement the text method.

Location:
trunk/gsdl/perllib
Files:
1 added
4 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/lucenebuildproc.pm

    r9669 r9919  
    4646}
    4747
    48 sub set_gdbm_level {
    49     my $self = shift(@_);
    50     my ($level) = @_;
    51 
    52     $self->{'gdbm_level'} = $level;
    53 }
    5448sub preprocess_text {
    5549    my $self = shift (@_);
  • trunk/gsdl/perllib/mgbuildproc.pm

    r9669 r9919  
    3030package mgbuildproc;
    3131
    32 eval {require bytes};
    33 
    34 use classify;
    35 use doc;
    36 use docproc;
    37 use util;
     32use basebuildproc;
    3833
    3934BEGIN {
    40     @mgbuildproc::ISA = ('docproc');
     35    @mgbuildproc::ISA = ('basebuildproc');
    4136}
    4237
    43 
    4438sub new {
    45     my ($class, $collection, $source_dir, $build_dir,
    46     $verbosity, $outhandle) = @_;
    47     my $self = new docproc ();
    48 
    49     # outhandle is where all the debugging info goes
    50     # output_handle is where the output of the plugins is piped
    51     # to (i.e. mg, gdbm etc.)
    52     $outhandle = STDERR unless defined $outhandle;
    53 
    54     $self->{'collection'} = $collection;
    55     $self->{'source_dir'} = $source_dir;
    56     $self->{'build_dir'} = $build_dir;
    57     $self->{'verbosity'} = $verbosity;
    58     $self->{'classifiers'} = [];
    59     $self->{'mode'} = "text";
    60     $self->{'assocdir'} = $build_dir;
    61     $self->{'dontgdbm'} = {};
    62     $self->{'index'} = "section:text";
    63     $self->{'indexexparr'} = [];
    64     $self->{'output_handle'} = "STDOUT";
    65     $self->{'num_docs'} = 0;
    66     $self->{'num_sections'} = 0;
    67     $self->{'num_bytes'} = 0;
    68     $self->{'num_processed_bytes'} = 0;
    69     $self->{'store_text'} = 1;
    70     $self->{'outhandle'} = $outhandle;
    71    
    72     #used by browse interface
    73     $self->{'doclist'} = [];
    74    
    75     $self->{'indexing_text'} = 0;
    76 
     39    my $class = shift @_;
     40    my $self = new basebuildproc (@_);
    7741    return bless $self, $class;
    7842}
    7943
    80 sub reset {
    81     my $self = shift (@_);
    82    
    83     $self->{'num_docs'} = 0;
    84     $self->{'num_sections'} = 0;
    85     $self->{'num_processed_bytes'} = 0;
    86     $self->{'num_bytes'} = 0;
    87 }
    88 
    89 sub get_num_docs {
    90     my $self = shift (@_);
    91 
    92     return $self->{'num_docs'};
    93 }
    94 
    95 sub get_num_sections {
    96     my $self = shift (@_);
    97 
    98     return $self->{'num_sections'};
    99 }
    100 
    101 # num_bytes is the actual number of bytes in the collection
    102 # this is normally the same as what's processed during text compression
    103 sub get_num_bytes {
    104     my $self = shift (@_);
    105 
    106     return $self->{'num_bytes'};
    107 }
    108 
    109 # num_processed_bytes is the number of bytes actually passed
    110 # to mg for the current index
    111 sub get_num_processed_bytes {
    112     my $self = shift (@_);
    113 
    114     return $self->{'num_processed_bytes'};
    115 }
    116 
    117 sub set_output_handle {
    118     my $self = shift (@_);
    119     my ($handle) = @_;
    120 
    121     $self->{'output_handle'} = $handle;
    122 }
    123 
    124 sub set_mode {
    125     my $self = shift (@_);
    126     my ($mode) = @_;
    127 
    128     $self->{'mode'} = $mode;
    129 }
    130 
    131 sub set_assocdir {
    132     my $self = shift (@_);
    133     my ($assocdir) = @_;
    134 
    135     $self->{'assocdir'} = $assocdir;
    136 }
    137 
    138 sub set_dontgdbm {
    139     my $self = shift (@_);
    140     my ($dontgdbm) = @_;
    141 
    142     $self->{'dontgdbm'} = $dontgdbm;
    143 }
    144 
    145 sub set_index {
    146     my $self = shift (@_);
    147     my ($index, $indexexparr) = @_;
    148 
    149     $self->{'index'} = $index;
    150     $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
    151 }
    152 
    153 sub set_index_languages {
    154     my $self = shift (@_);
    155     my ($lang_meta, $langarr) = @_;
    156     $self->{'lang_meta'} = $lang_meta;
    157     $self->{'langarr'} = $langarr;
    158 }
    159 
    160 sub get_index {
    161     my $self = shift (@_);
    162 
    163     return $self->{'index'};
    164 }
    165 
    166 sub set_classifiers {
    167     my $self = shift (@_);
    168     my ($classifiers) = @_;
    169 
    170     $self->{'classifiers'} = $classifiers;
    171 }
    172 
    173 sub set_indexing_text {
    174     my $self = shift (@_);
    175     my ($indexing_text) = @_;
    176 
    177     $self->{'indexing_text'} = $indexing_text;
    178 }
    179 
    180 sub get_indexing_text {
    181     my $self = shift (@_);
    182 
    183     return $self->{'indexing_text'};
    184 }
    185 
    186 sub set_store_text {
    187     my $self = shift (@_);
    188     my ($store_text) = @_;
    189 
    190     $self->{'store_text'} = $store_text;
    191 }
    192 
    193 sub get_doc_list {
    194     my $self = shift(@_);
    195    
    196     return @{$self->{'doclist'}};
    197 }
    198 
    199 
    200 sub process {
    201     my $self = shift (@_);
    202     my $method = $self->{'mode'};
    203 
    204     $self->$method(@_);
    205 }
    206 
    207 # use 'Paged' if document has no more than 2 levels
    208 # and each section at second level has a number for
    209 # Title metadata
    210 #also use Paged if gsdlthistype metadata is set to Paged
    211 sub get_document_type {
    212     my $self = shift (@_);
    213     my ($doc_obj) = @_;
    214 
    215     my $thistype = "VList";
    216     my $childtype = "VList";
    217     my $title;
    218     my @tmp = ();
    219    
    220     my $section = $doc_obj->get_top_section ();
    221 
    222     my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
    223     if (defined $gsdlthistype) {
    224     if ($gsdlthistype eq "Paged") {
    225         $childtype = "Paged";
    226         if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
    227         $thistype = "Paged";
    228         } else {
    229         $thistype = "Invisible";
    230         }
    231        
    232         return ($thistype, $childtype);
    233     } elsif ($gsdlthistype eq "Hierarchy") {
    234         return ($thistype, $childtype); # use VList, VList
    235     }
    236     }
    237     my $first = 1;
    238     while (defined $section) {
    239     @tmp = split /\./, $section;
    240     if (scalar(@tmp) > 1) {
    241         return ($thistype, $childtype);
    242     }
    243     if (!$first) {
    244         $title = $doc_obj->get_metadata_element ($section, "Title");
    245         if (!defined $title || $title !~ /^\d+$/) {
    246         return ($thistype, $childtype);
    247         }
    248     }
    249     $first = 0;
    250     $section = $doc_obj->get_next_section($section);
    251     }
    252     if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
    253     $thistype = "Paged";
    254     } else {
    255     $thistype = "Invisible";
    256     }
    257     $childtype = "Paged";
    258     return ($thistype, $childtype);
    259 }
    260 
    261 sub assoc_files {
    262     my $self = shift (@_);
    263     my ($doc_obj, $archivedir) = @_;
    264     my ($afile);
    265    
    266     foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
    267     # if assoc file starts with a slash, we put it relative to the assoc
    268     # dir, otherwise it is relative to the HASH... directory
    269     if ($assoc_file->[1] =~ m@^[/\\]@) {
    270         $afile = &util::filename_cat($self->{'assocdir'},$assoc_file->[1]);
    271     } else {
    272         $afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
    273     }
    274     &util::hard_link ($assoc_file->[0], $afile);
    275     }
    276 }
    277 
    278 sub infodb {
    279     my $self = shift (@_);
    280     my ($doc_obj, $filename) = @_;
    281     my $handle = $self->{'output_handle'};
    282 #    $handle = "main::STDOUT";
    283 
    284     my $doctype = $doc_obj->get_doc_type();
    285 
    286     # only output this document if it is one to be indexed
    287     return if ($doctype ne "indexed_doc");
    288 
    289     my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
    290     $archivedir = "" unless defined $archivedir;
    291     $archivedir =~ s/\\/\//g;
    292     $archivedir =~ s/^\/+//;
    293     $archivedir =~ s/\/+$//;
    294 
    295     # resolve the final filenames of the files associated with this document
    296     $self->assoc_files ($doc_obj, $archivedir);
    297 
    298     #GRB: moved 1/06/2004 from GRB01062004
    299     #add this document to the browse structure
    300     push(@{$self->{'doclist'}},$doc_obj->get_OID())
    301     unless ($doctype eq "classification");
    302 
    303     # classify this document
    304     &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
    305     #GRB: end of moved block
    306 
    307     # this is another document
    308     $self->{'num_docs'} += 1 unless ($doctype eq "classification");
    309 
    310     # is this a paged or a hierarchical document
    311     my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
    312 
    313     my $section = $doc_obj->get_top_section ();
    314     my $doc_OID = $doc_obj->get_OID();
    315     my $first = 1;
    316     my $url = "";
    317     while (defined $section) {
    318     # update a few statistics
    319     $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
    320     $self->{'num_sections'} += 1 unless ($doctype eq "classification");
    321 
    322     # output the section name
    323     if ($section eq "") { print $handle "[$doc_OID]\n"; }
    324     else { print $handle "[$doc_OID.$section]\n"; }
    325 
    326     # output the fact that this document is a document (unless doctype
    327     # has been set to something else from within a plugin
    328     my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
    329     if (!defined $dtype || $dtype !~ /\w/) {
    330         print $handle "<doctype>doc\n";
    331     }
    332 
    333     # output whether this node contains text
    334     if ($doc_obj->get_text_length($section) > 0) {
    335         print $handle "<hastxt>1\n";
    336     } else {
    337         print $handle "<hastxt>0\n";
    338     }
    339 
    340     # output all the section metadata
    341     my $metadata = $doc_obj->get_all_metadata ($section);
    342     foreach my $pair (@$metadata) {
    343         my ($field, $value) = (@$pair);
    344 
    345         if ($field ne "Identifier" && $field !~ /^gsdl/ &&
    346         defined $value && $value ne "") {
    347 
    348         # escape problematic stuff
    349         $value =~ s/\\/\\\\/g;
    350         $value =~ s/\n/\\n/g;
    351         $value =~ s/\r/\\r/g;
    352         if ($value =~ /-{70,}/) {
    353             # if value contains 70 or more hyphens in a row we need
    354             # to escape them to prevent txt2db from treating them
    355             # as a separator
    356             $value =~ s/-/&\#045;/gi;
    357         }
    358 
    359         # special case for URL metadata
    360         if ($field =~ /^URL$/i) {
    361                     $url .= "[$value]\n";
    362                     if ($section eq "") {$url .= "<section>$doc_OID\n";}
    363                     else {$url .= "<section>$doc_OID.$section\n";}
    364                     $url .= '-' x 70 . "\n";
    365         }
    366 
    367         if (!defined $self->{'dontgdbm'}->{$field}) {
    368             print $handle "<$field>$value\n";
    369         }
    370         }
    371     }
    372    
    373     # output archivedir if at top level
    374     if ($section eq $doc_obj->get_top_section()) {
    375         print $handle "<archivedir>$archivedir\n";
    376     }
    377    
    378     # output document display type
    379     if ($first) {
    380         print $handle "<thistype>$thistype\n";
    381     }
    382 
    383     # output a list of children
    384     my $children = $doc_obj->get_children ($section);
    385     if (scalar(@$children) > 0) {
    386         print $handle "<childtype>$childtype\n";
    387         print $handle "<contains>";
    388         my $firstchild = 1;
    389         foreach my $child (@$children) {
    390         print $handle ";" unless $firstchild;
    391         $firstchild = 0;
    392         if ($child =~ /^.*?\.(\d+)$/) {
    393             print $handle "\".$1";
    394         } else {
    395             print $handle "\".$child";
    396         }
    397 #       if ($child eq "") { print $handle "$doc_OID"; }
    398 #       elsif ($section eq "") { print $handle "$doc_OID.$child"; }
    399 #       else { print $handle "$doc_OID.$section.$child"; }
    400         }
    401         print $handle "\n";
    402     }
    403 
    404     # output the matching document number
    405     print $handle "<docnum>$self->{'num_sections'}\n";
    406 
    407     print $handle '-' x 70, "\n";
    408 
    409    
    410     # output a database entry for the document number
    411     print $handle "[$self->{'num_sections'}]\n";
    412     if ($section eq "") { print $handle "<section>$doc_OID\n"; }
    413     else { print $handle "<section>$doc_OID.$section\n"; }
    414     print $handle '-' x 70, "\n";
    415    
    416         # output entry for url
    417         if ($url ne "") {
    418             print $handle $url;
    419         }
    420 
    421     $first = 0;
    422     $section = $doc_obj->get_next_section($section);
    423     }
    424 
    425     #GRB01062004: see code above moved from here
    426 }
    42744
    42845sub find_paragraphs {
     
    44259    my ($doc_obj) = @_;
    44360    my $handle = $self->{'output_handle'};
    444     my $indexed_doc = 1;
    445 
     61   
    44662    # only output this document if it is one to be indexed
    44763    return if ($doc_obj->get_doc_type() ne "indexed_doc");
    448 
     64   
    44965    # see if this document belongs to this subcollection
    450     foreach my $indexexp (@{$self->{'indexexparr'}}) {
    451     $indexed_doc = 0;
    452     my ($field, $exp, $options) = split /\//, $indexexp;
    453     if (defined ($field) && defined ($exp)) {
    454         my ($bool) = $field =~ /^(.)/;
    455         $field =~ s/^.// if $bool eq '!';
    456         if ($field =~ /^filename$/i) {
    457         $field = $doc_obj->get_source_filename();
    458         } else {
    459         $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
    460         }
    461         next unless defined $field;
    462         if ($bool eq '!') {
    463         if ($options =~ /^i$/i) {
    464             if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
    465         } else {
    466             if ($field !~ /$exp/) {$indexed_doc = 1; last;}
    467         }
    468         } else {
    469         if ($options =~ /^i$/i) {
    470             if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
    471         } else {
    472             if ($field =~ /$exp/) {$indexed_doc = 1; last;}
    473         }
    474         }
    475     }
    476     }
    477     # if this doc is so far in the sub collection, and we have lang info,
    478     # now we check the languages to see if it matches
    479     if($indexed_doc && defined $self->{'lang_meta'}) {
    480     $indexed_doc = 0;
    481     my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
    482     if (defined $field) {
    483         foreach my $lang (@{$self->{'langarr'}}) {
    484         my ($bool) = $lang =~ /^(.)/;
    485         if ($bool eq '!') {
    486             $lang =~ s/^.//;
    487             if ($field !~ /$lang/) {
    488             $indexed_doc = 1; last;
    489             }
    490         } else {
    491             if ($field =~ /$lang/) {
    492             $indexed_doc = 1; last;
    493             }
    494         }
    495         }
    496     }
    497     }
     66    my $indexed_doc = $self->is_subcollection_doc($doc_obj);
    49867
    49968    # this is another document
  • trunk/gsdl/perllib/mgppbuilder.pm

    r9853 r9919  
    211211    }
    212212    }
     213
    213214    print $outhandle "doclevel = ". $self->{'doc_level'}."\n";
    214215    # get the list of plugins for this collection
     
    350351    $handle = mgppbuilder::PIPEOUT;
    351352    }
     353
     354    # gdbm_level
     355    my $gdbm_level = "document";
     356    if ($self->{'levels'}->{'section'}) {
     357    $gdbm_level = "section";
     358    }
     359   
    352360    $self->{'buildproc'}->set_output_handle ($handle);
    353361    $self->{'buildproc'}->set_mode ('text');
     
    361369    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    362370    $self->{'buildproc'}->set_levels ($self->{'levels'});                     
     371    $self->{'buildproc'}->set_gdbm_level ($gdbm_level);                       
    363372    $self->{'buildproc'}->reset();
    364373    &plugin::begin($self->{'pluginfo'}, $self->{'source_dir'},
     
    693702    $handle = mgppbuilder::PIPEOUT;
    694703    }
    695    
     704       
     705    # gdbm_level
     706    my $gdbm_level = "document";
     707    if ($self->{'levels'}->{'section'}) {
     708    $gdbm_level = "section";
     709    }
     710
    696711    # set up the document processor
    697712    $self->{'buildproc'}->set_output_handle ($handle);
     
    702717    $self->{'buildproc'}->set_store_text(1);
    703718    $self->{'buildproc'}->set_indexfieldmap ($self->{'indexfieldmap'});
    704     $self->{'buildproc'}->set_levels ($self->{'levels'});                       
     719    $self->{'buildproc'}->set_levels ($self->{'levels'});
     720    $self->{'buildproc'}->set_gdbm_level ($gdbm_level);   
     721   
    705722    $self->{'buildproc'}->reset();
    706723    &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'},
  • trunk/gsdl/perllib/mgppbuildproc.pm

    r9669 r9919  
    3030package mgppbuildproc;
    3131
    32 eval {require bytes};
    33 
    34 use classify;
    35 use doc;
    36 use docproc;
    37 use util;
     32use basebuildproc;
    3833
    3934
    4035BEGIN {
    41     @mgppbuildproc::ISA = ('docproc');
     36    @mgppbuildproc::ISA = ('basebuildproc');
    4237}
    4338
    4439#this must be the same as in mgppbuilder
    4540our %level_map = ('document'=>'Doc',
    46           'section'=>'Sec',
    47           'paragraph'=>'Para');
     41          'section'=>'Sec',
     42          'paragraph'=>'Para');
    4843
    4944sub new {
    50     my ($class, $collection, $source_dir, $build_dir,
    51     $verbosity, $outhandle) = @_;
    52     my $self = new docproc ();
    53 
    54     # outhandle is where all the debugging info goes
    55     # output_handle is where the output of the plugins is piped
    56     # to (i.e. mg, gdbm etc.)
    57     $outhandle = STDERR unless defined $outhandle;
    58 
    59     $self->{'collection'} = $collection;
    60     $self->{'source_dir'} = $source_dir;
    61     $self->{'build_dir'} = $build_dir;
    62     $self->{'verbosity'} = $verbosity;
    63     $self->{'classifiers'} = [];
    64     $self->{'mode'} = "text";
    65     $self->{'assocdir'} = $build_dir;
    66     $self->{'dontgdbm'} = {};
     45    my $class = shift @_;
     46    my $self = new basebuildproc (@_);
     47
     48    # use a different index specification to the default
    6749    $self->{'index'} = "text";
    68     $self->{'indexexparr'} = [];
    69     $self->{'output_handle'} = "STDOUT";
    70     $self->{'num_docs'} = 0;
    71     $self->{'num_sections'} = 0;
    72     $self->{'num_bytes'} = 0;
    73     $self->{'num_processed_bytes'} = 0;
    74     $self->{'store_text'} = 1;
    75     $self->{'outhandle'} = $outhandle;
    76 
    77     #used by browse interface
    78     $self->{'doclist'} = [];
    79 
    80     $self->{'indexing_text'} = 0;
    81 
    82     #new ones for mgpp
     50
    8351    $self->{'dontindex'} = {};
    8452    $self->{'indexfieldmap'} = {};
    8553    $self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index
    8654    $self->{'strip_html'}=1;
    87 
    88 
     55   
    8956    return bless $self, $class;
    9057}
    9158
    92 sub reset {
    93     my $self = shift (@_);
    94    
    95     $self->{'num_docs'} = 0;
    96     $self->{'num_sections'} = 0;
    97     $self->{'num_processed_bytes'} = 0;
    98     $self->{'num_bytes'} = 0;
    99 }
    100 
    101 sub get_num_docs {
    102     my $self = shift (@_);
    103 
    104     return $self->{'num_docs'};
    105 }
    106 
    107 sub get_num_sections {
    108     my $self = shift (@_);
    109 
    110     return $self->{'num_sections'};
    111 }
    112 
    113 # num_bytes is the actual number of bytes in the collection
    114 # this is normally the same as what's processed during text compression
    115 sub get_num_bytes {
    116     my $self = shift (@_);
    117 
    118     return $self->{'num_bytes'};
    119 }
    120 
    121 # num_processed_bytes is the number of bytes actually passed
    122 # to mgpp for the current index
    123 sub get_num_processed_bytes {
    124     my $self = shift (@_);
    125 
    126     return $self->{'num_processed_bytes'};
    127 }
    128 
    129 sub set_output_handle {
    130     my $self = shift (@_);
    131     my ($handle) = @_;
    132 
    133     $self->{'output_handle'} = $handle;
    134 }
    135 
    136 sub set_mode {
    137     my $self = shift (@_);
    138     my ($mode) = @_;
    139 
    140     $self->{'mode'} = $mode;
    141 }
    142 
    143 sub set_assocdir {
    144     my $self = shift (@_);
    145     my ($assocdir) = @_;
    146 
    147     $self->{'assocdir'} = $assocdir;
    148 }
    149 
    150 sub set_dontgdbm {
    151     my $self = shift (@_);
    152     my ($dontgdbm) = @_;
    153 
    154     $self->{'dontgdbm'} = $dontgdbm;
    155 }
    156 
    157 sub set_index {
    158     my $self = shift (@_);
    159     my ($index, $indexexparr) = @_;
    160 
    161     $self->{'index'} = $index;
    162     $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
    163 }
    164 
    165 sub set_index_languages {
    166     my $self = shift (@_);
    167     my ($lang_meta, $langarr) = @_;
    168     $self->{'lang_meta'} = $lang_meta;
    169     $self->{'langarr'} = $langarr;
    170 }
    171 
    172 sub get_index {
    173     my $self = shift (@_);
    174 
    175     return $self->{'index'};
    176 }
    177 
    178 sub set_classifiers {
    179     my $self = shift (@_);
    180     my ($classifiers) = @_;
    181 
    182     $self->{'classifiers'} = $classifiers;
    183 }
    184 
    185 sub set_indexing_text {
    186     my $self = shift (@_);
    187     my ($indexing_text) = @_;
    188 
    189     $self->{'indexing_text'} = $indexing_text;
    190 }
    191 
    192 sub get_indexing_text {
    193     my $self = shift (@_);
    194 
    195     return $self->{'indexing_text'};
    196 }
    197 
    198 sub set_store_text {
    199     my $self = shift (@_);
    200     my ($store_text) = @_;
    201 
    202     $self->{'store_text'} = $store_text;
    203 }
    204 
    205 sub get_doc_list {
    206     my $self = shift(@_);
    207    
    208     return @{$self->{'doclist'}};
    209 }
    21059
    21160sub set_indexfieldmap {
     
    23584}
    23685
    237 sub process {
    238     my $self = shift (@_);
    239     my $method = $self->{'mode'};
    240 
    241     $self->$method(@_);
    242 }
    243 
    244 # use 'Paged' if document has no more than 2 levels
    245 # and each section at second level has a number for
    246 # Title metadata
    247 # also use Paged if gsdlthistype metadata is set to Paged
    248 sub get_document_type {
    249     my $self = shift (@_);
    250     my ($doc_obj) = @_;
    251 
    252     my $thistype = "VList";
    253     my $childtype = "VList";
    254     my $title;
    255     my @tmp = ();
    256    
    257     my $section = $doc_obj->get_top_section ();
    258    
    259     my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
    260     if (defined $gsdlthistype) {
    261     if ($gsdlthistype eq "Paged") {
    262         $childtype = "Paged";
    263         if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
    264         $thistype = "Paged";
    265         } else {
    266         $thistype = "Invisible";
    267         }
    268        
    269         return ($thistype, $childtype);
    270     } elsif ($gsdlthistype eq "Hierarchy") {
    271         return ($thistype, $childtype); # use VList, VList
    272     }
    273     }
    274     my $first = 1;
    275     while (defined $section) {
    276     @tmp = split /\./, $section;
    277     if (scalar(@tmp) > 1) {
    278         return ($thistype, $childtype);
    279     }
    280     if (!$first) {
    281         $title = $doc_obj->get_metadata_element ($section, "Title");
    282         if (!defined $title || $title !~ /^\d+$/) {
    283         return ($thistype, $childtype);
    284         }
    285     }
    286     $first = 0;
    287     $section = $doc_obj->get_next_section($section);
    288     }
    289     if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
    290     $thistype = "Paged";
    291     } else {
    292     $thistype = "Invisible";
    293     }
    294     $childtype = "Paged";
    295     return ($thistype, $childtype);
    296 }
    297 
    298 sub assoc_files {
    299    my $self = shift (@_);
    300     my ($doc_obj, $archivedir) = @_;
    301     my ($afile);
    302    
    303     foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
    304     # if assoc file starts with a slash, we put it relative to the assoc
    305     # dir, otherwise it is relative to the HASH... directory
    306     if ($assoc_file->[1] =~ m@^[/\\]@) {
    307         $afile = &util::filename_cat($self->{'assocdir'},$assoc_file->[1]);
    308     } else {
    309         $afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
    310     }
    311     &util::hard_link ($assoc_file->[0], $afile);
    312     }
    313 }
    314 
    315 sub infodb {
    316     my $self = shift (@_);
    317     my ($doc_obj, $filename) = @_;
    318     my $handle = $self->{'output_handle'};
    319 
    320     my $doctype = $doc_obj->get_doc_type();
    321 
    322     # only output this document if it is one to be indexed
    323     return if ($doctype ne "indexed_doc");
    324 
     86
     87sub get_gdbm_level {
     88    my $self = shift (@_);
     89   
    32590    #if a Section level index is not built, the gdbm file should be at doc
    32691    #level not Section
    327     my $docs_only = 1;
    32892    if ($self->{'levels'}->{'section'}) {
    329     $docs_only = 0;
    330     }
    331 
    332     my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
    333     $archivedir = "" unless defined $archivedir;
    334     $archivedir =~ s/\\/\//g;
    335     $archivedir =~ s/^\/+//;
    336     $archivedir =~ s/\/+$//;
    337 
    338     # resolve the final filenames of the files associated with this document
    339     $self->assoc_files ($doc_obj, $archivedir);
    340 
    341     #GRB: moved 1/06/2004 from GRB01062004
    342     #add this document to the browse structure
    343     push(@{$self->{'doclist'}},$doc_obj->get_OID())
    344     unless ($doctype eq "classification");
    345 
    346     # classify this document
    347     &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
    348     #GRB: end of moved block
    349 
    350     # this is another document
    351     $self->{'num_docs'} += 1 unless ($doctype eq "classification");
    352 
    353     # is this a paged or a hierarchical document
    354     my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
    355 
    356     my $section = $doc_obj->get_top_section ();
    357     my $doc_OID = $doc_obj->get_OID();
    358     my $first = 1;
    359     my $url = "";
    360     while (defined $section) {
    361     # update a few statistics
    362     $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
    363     $self->{'num_sections'} += 1 unless ($doctype eq "classification");
    364 
    365     # output the section name
    366     if ($section eq "") { print $handle "[$doc_OID]\n"; }
    367     else { print $handle "[$doc_OID.$section]\n"; }
    368 
    369     # output the fact that this document is a document (unless doctype
    370     # has been set to something else from within a plugin
    371     my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
    372     if (!defined $dtype || $dtype !~ /\w/) {
    373         print $handle "<doctype>doc\n";
    374     }
    375 
    376     # output whether this node contains text
    377     if ($doc_obj->get_text_length($section) > 0) {
    378         print $handle "<hastxt>1\n";
    379     } else {
    380         print $handle "<hastxt>0\n";
    381     }
    382 
    383     # output all the section metadata
    384     my $metadata = $doc_obj->get_all_metadata ($section);
    385     foreach my $pair (@$metadata) {
    386         my ($field, $value) = (@$pair);
    387 
    388         if ($field ne "Identifier" && $field !~ /^gsdl/ &&
    389         defined $value && $value ne "") {       
    390 
    391         # escape problematic stuff
    392         $value =~ s/\\/\\\\/g;
    393         $value =~ s/\n/\\n/g;
    394         $value =~ s/\r/\\r/g;
    395 
    396         # special case for URL metadata
    397         if ($field =~ /^URL$/i) {
    398                     $url .= "[$value]\n";
    399                     if ($section eq "") {$url .= "<section>$doc_OID\n";}
    400                     else {$url .= "<section>$doc_OID.$section\n";}
    401                     $url .= '-' x 70 . "\n";
    402         }
    403 
    404         if (!defined $self->{'dontgdbm'}->{$field}) {
    405             print $handle "<$field>$value\n";
    406         }
    407         }
    408     }
    409 
    410     # output archivedir if at top level
    411     if ($section eq $doc_obj->get_top_section()) {
    412         print $handle "<archivedir>$archivedir\n";
    413     }
    414 
    415     # output document display type
    416     if ($first) {
    417         print $handle "<thistype>$thistype\n";
    418     }
    419 
    420     if (!$docs_only) {
    421         # output a list of children
    422         my $children = $doc_obj->get_children ($section);
    423         if (scalar(@$children) > 0) {
    424         print $handle "<childtype>$childtype\n";
    425         print $handle "<contains>";
    426         my $firstchild = 1;
    427         foreach my $child (@$children) {
    428             print $handle ";" unless $firstchild;
    429             $firstchild = 0;
    430             if ($child =~ /^.*?\.(\d+)$/) {
    431             print $handle "\".$1";
    432             } else {
    433             print $handle "\".$child";
    434             }
    435 #       if ($child eq "") { print $handle "$doc_OID"; }
    436 #       elsif ($section eq "") { print $handle "$doc_OID.$child"; }
    437 #       else { print $handle "$doc_OID.$section.$child"; }
    438         }
    439         print $handle "\n";
    440         }
    441         #output the matching doc number
    442         print $handle "<docnum>$self->{'num_sections'}\n";
    443        
    444     } # if (!$docs_only)
    445     else { #docs only, doc num is num_docs not num_sections
    446         # output the matching document number
    447         print $handle "<docnum>$self->{'num_docs'}\n";
    448     }
    449    
    450     print $handle '-' x 70, "\n";
    451 
    452    
    453     # output a database entry for the document number
    454     if ($docs_only) {
    455         print $handle "[$self->{'num_docs'}]\n";
    456         print $handle "<section>$doc_OID\n";
    457     }
    458     else {
    459         print $handle "[$self->{'num_sections'}]\n";
    460         if ($section eq "") { print $handle "<section>$doc_OID\n"; }
    461         else { print $handle "<section>$doc_OID.$section\n"; }
    462     }
    463     print $handle '-' x 70, "\n";
    464    
    465         # output entry for url
    466         if ($url ne "") {
    467             print $handle $url;
    468         }
    469 
    470     $first = 0;
    471     $section = $doc_obj->get_next_section($section);
    472     last if ($docs_only); # if no sections wanted, only gdbm the docs
    473     }
    474 
    475     #GRB01062004: see code above moved from here
    476 }
     93    return "section";
     94    }
     95    return "document";
     96}
     97
    47798
    47899#sub find_paragraphs {
     
    533154    my $handle = $self->{'output_handle'};
    534155    my $outhandle = $self->{'outhandle'};
    535     my $indexed_doc = 1;
    536156
    537157    # only output this document if it is one to be indexed
    538158    return if ($doc_obj->get_doc_type() ne "indexed_doc");
    539159
    540     # see if this document belongs to this subcollection
    541     foreach my $indexexp (@{$self->{'indexexparr'}}) {
    542     $indexed_doc = 0;
    543     my ($field, $exp, $options) = split /\//, $indexexp;
    544     if (defined ($field) && defined ($exp)) {
    545         my ($bool) = $field =~ /^(.)/;
    546         $field =~ s/^.// if $bool eq '!';
    547         if ($field =~ /^filename$/i) {
    548         $field = $doc_obj->get_source_filename();
    549         } else {
    550         $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
    551         }
    552         next unless defined $field;
    553         if ($bool eq '!') {
    554         if ($options =~ /^i$/i) {
    555             if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
    556         } else {
    557             if ($field !~ /$exp/) {$indexed_doc = 1; last;}
    558         }
    559         } else {
    560         if ($options =~ /^i$/i) {
    561             if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
    562         } else {
    563             if ($field =~ /$exp/) {$indexed_doc = 1; last;}
    564         }
    565         }
    566     }
    567     }
    568 
    569     # if this doc is so far in the sub collection, and we have lang info,
    570     # now we check the languages to see if it matches
    571     if($indexed_doc && defined $self->{'lang_meta'}) {
    572     $indexed_doc = 0;
    573     my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
    574     if (defined $field) {
    575         foreach my $lang (@{$self->{'langarr'}}) {
    576         my ($bool) = $lang =~ /^(.)/;
    577         if ($bool eq '!') {
    578             $lang =~ s/^.//;
    579             if ($field !~ /$lang/) {
    580             $indexed_doc = 1; last;
    581             }
    582         } else {
    583             if ($field =~ /$lang/) {
    584             $indexed_doc = 1; last;
    585             }
    586         }
    587         }
    588     }
    589     }
    590 
     160    my $indexed_doc = $self->is_subcollection_doc($doc_obj);
     161   
    591162    # this is another document
    592163    $self->{'num_docs'} += 1;
Note: See TracChangeset for help on using the changeset viewer.