Ignore:
Timestamp:
1999-06-30T15:35:07+12:00 (25 years ago)
Author:
sjboddie
Message:
  • removed old infodb stuff
  • changed the way classifiers work
  • added maxdocs and allclassifications options
  • no longer get doctype from collect.cfg but instead set it directly in plugins that don't use the default
File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/gsdl/perllib/mgbuildproc.pm

    r292 r315  
    107107}
    108108
    109 sub newinfodb {
     109sub infodb {
    110110    my $self = shift (@_);
    111111    my ($doc_obj, $filename) = @_;
    112112    my $handle = $self->{'output_handle'};
    113113#    $handle = "main::STDOUT";
    114 
    115     # this was used in the old version
    116     return if ($doc_obj eq 'classifications');
    117114
    118115    my $doctype = $doc_obj->get_doc_type();
     
    209206    &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
    210207
    211 }
    212 
    213 
    214 
    215 sub infodb {
    216     my $self = shift (@_);
    217     my ($doc_obj, $filename) = @_;
    218     my $handle = $self->{'output_handle'};
    219 
    220     if ($doc_obj eq 'classifications') {
    221     # output classifications if all books have been processed
    222     foreach $key (keys %$saved_classifications) {
    223         $saved_classifications->{$key}->{'contains'} = undef if
    224         $saved_classifications->{$key}->{'contains'} eq "";
    225         $saved_classifications->{$key}->{'parent'} = undef if
    226         (!defined $saved_classifications->{$key}->{'parent'}) ||
    227             ($saved_classifications->{$key}->{'parent'} eq "");
    228         $self->write_to_gdbm ($handle, $key, $saved_classifications->{$key}->{'title'},
    229                   undef, undef, undef, undef,
    230                   $saved_classifications->{$key}->{'contains'}, undef,
    231                   $saved_classifications->{$key}->{'parent'}, undef, undef);
    232     }
    233     return;
    234     }
    235 
    236     my $doctype = $doc_obj->get_doc_type();
    237 
    238     # only output this document if it is one to be indexed
    239     return if ($doctype ne "indexed_doc" &&
    240            $doctype ne "classification");
    241 
    242     # found classification document
    243     $saved_classifications = {} if ($doctype eq "classification");
    244     $sectionmap = {};
    245 
    246     # this is another document
    247     $self->{'num_docs'} += 1 unless ($doctype eq "classification");
    248 
    249     my $section = $doc_obj->get_top_section();
    250     while (defined $section) {
    251     # update a few statistics
    252     $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
    253     $self->{'num_sections'} += 1 unless ($doctype eq "classification");
    254 
    255     my $title = $doc_obj->get_metadata_element($section, "Title");
    256     my $source = $doc_obj->get_metadata_element($section, "Source");
    257     my $date = $doc_obj->get_metadata_element($section, "Date");
    258     my $jobnumber = $doc_obj->get_source_filename();
    259 
    260     my $mapped_section = $self->map_section($doctype, $section);
    261     if ($doctype eq "classification") {
    262         $mapped_section = $self->char_classification($mapped_section);
    263     } else {
    264         $mapped_section = "B.$self->{'num_docs'}.$mapped_section";
    265     }
    266     $mapped_section =~ s/\.+$//;
    267 
    268     my ($parent, $classification, $creator);
    269    
    270     $classification = $self->get_classifications($doc_obj, $section, $mapped_section)
    271         unless $doctype eq "classification";
    272 
    273     if ($section ne $doc_obj->get_top_section()) {
    274         $parent = $self->map_section($doctype, $doc_obj->get_parent_section($section));
    275         if ($doctype eq "classification") {
    276         $parent = $self->char_classification($parent);
    277         } else {
    278         $parent = "B.$self->{'num_docs'}.$parent";
    279         }
    280         $parent =~ s/\.+$//;
    281     } else {
    282         $creator = $doc_obj->get_metadata_element($section, "Creator");
    283 
    284         # need filename so we know what directory to look in for associated files
    285         $filename =~ s/^\/?(.*?\.dir).*$/$1/ if (defined $filename);
    286     }
    287 
    288     if ($doc_obj->is_leaf_section($section)) {
    289         if ($doctype eq "classification") {
    290         if (defined $saved_classifications->{$mapped_section}) {
    291             print STDERR "mgbuildproc:warning: classification $mapped_section " .
    292             "declared multiple times\n";
    293         } else {
    294             $saved_classifications->{$mapped_section}  = {'title' => $title, 'contains' => "",
    295                                   'parent' => $parent};
    296         }
    297         } else {
    298         $self->write_to_gdbm($handle, $mapped_section, $title, $creator, $source, $date, $jobnumber, undef,
    299                      $self->{'num_sections'}, $parent, $classification, $filename);
    300         }
    301     } else {
    302 
    303         # add the introductory section if it exists
    304         my $contains = "";
    305         if ($doc_obj->get_text_length($section) > 0) {
    306         $contains .= "B.$self->{'num_docs'}." . $self->map_section ($doctype, "$section.0");
    307         }
    308 
    309         # add the rest of the children
    310         my @children = @{$doc_obj->get_children($section)};
    311         if ($doctype eq "classification") {
    312         map {$_ = $self->char_classification($_);} @children;
    313         if (defined $saved_classifications->{$mapped_section}) {
    314             print STDERR "mgbuildproc:warning: classification $mapped_section " .
    315             "declared multiple times\n";
    316         } else {
    317             $saved_classifications->{$mapped_section}  = {'title' => $title, 'contains' => ""};
    318             $saved_classifications->{$mapped_section}->{'contains'} = join ":", @children;
    319             $saved_classifications->{$mapped_section}->{'parent'} = $parent;
    320         }
    321         } else {
    322         map {$_ = "B.$self->{'num_docs'}." . $self->map_section($doctype, $_);} @children;
    323         $contains .= ":" if $contains ne "";
    324         $contains .= join ":", @children;
    325         $self->write_to_gdbm ($handle, $mapped_section, $title, $creator, $source,
    326                       $date, $jobnumber, $contains, $self->{'num_sections'},
    327                       $parent, $classification, $filename);
    328 
    329         if ($doc_obj->get_text_length($section) > 0) {
    330             my $intromapsection = "B.$self->{'num_docs'}." .
    331             $self->map_section($doctype, "$section.0");
    332             $self->write_to_gdbm ($handle, $intromapsection, "<i>(introductory text)</i>", $creator,
    333                       $source, $date, $jobnumber, undef, $self->{'num_sections'},
    334                       $mapped_section, $classification, $filename);
    335         }
    336         }
    337     }
    338     $section = $doc_obj->get_next_section($section);
    339     }
    340    
    341     # update classification list with those books that
    342     # were processed before classification list
    343     if ($doctype eq "classification" && defined $temp_classifications) {
    344     foreach $key (keys(%$temp_classifications)) {
    345         if (!defined $saved_classifications->{$key}) {
    346         print STDERR "mgbuildproc:$temp_classifications->{$key} belong to " .
    347             "undefined classification $key\n";
    348         }
    349         $saved_classifications->{$key}->{'contains'} = $temp_classifications->{$key}->{'contains'};
    350     }
    351     $temp_classifications = undef;
    352     }
    353 }
    354 
    355 
    356 sub write_to_gdbm {
    357     my $self = shift (@_);
    358     my ($handle, $section, $title, $creator, $source, $date,
    359     $jobnumber, $contains, $docnum, $parent, $classification, $OID) = @_;
    360 
    361     print $handle "[$section]\n";
    362     print $handle "<t>$title\n" if (defined $title && $title ne "");
    363     print $handle "<a>$creator\n" if (defined $creator && $creator ne "");
    364     print $handle "<s>$source\n" if (defined $source && $source ne "");
    365     print $handle "<i>$date\n" if (defined $date && $date ne "");
    366     print $handle "<j>$jobnumber\n" if (defined $jobnumber && $jobnumber ne "");
    367     print $handle "<c>$contains\n" if (defined $contains && $contains ne "");
    368     print $handle "<d>$docnum\n" if (defined $docnum && $docnum ne "");
    369     print $handle "<p>$parent\n" if (defined $parent && $parent ne "");
    370     print $handle "<x>$classification\n" if (defined $classification && $classification ne "");
    371     print $handle "<o>$OID\n" if defined $OID;
    372     print $handle '-' x 70, "\n";
    373    
    374     if (defined $docnum) {
    375     print $handle "[$docnum]\n";
    376     print $handle "<x>$section\n";
    377     print $handle '-' x 70, "\n";
    378     }
    379 }
    380 
    381 sub get_classifications {
    382     my $self = shift (@_);
    383     my ($doc_obj, $section, $mapped_section) = @_;
    384 
    385     my ($classificationsref);
    386     if (defined $saved_classifications) {
    387     # classification list has been processed
    388     $classificationsref = $saved_classifications;
    389     } else {
    390     # classification list has yet to be processed, save
    391     # books in temp_classifications until list is processed
    392     # and they can be moved into saved_classifications
    393     $temp_classifications = {} unless defined $temp_classifications;
    394     $classificationsref = $temp_classifications;
    395     }
    396 
    397     my $classifications = $doc_obj->get_metadata($section, "Subject");
    398 
    399     # need to save which books belong in each classification
    400     # to output later
    401     foreach $classification (@$classifications) {
    402 
    403     if (!defined $classificationsref->{$classification}) {
    404         $classificationsref->{$classification}->{'parent'} = $doc_obj->get_parent_section($section);
    405         $classificationsref->{$classification}->{'contains'} = "";
    406     }
    407     $classificationsref->{$classification}->{'contains'} .= ":" unless
    408         $classificationsref->{$classification}->{'contains'} eq "";
    409     $classificationsref->{$classification}->{'contains'} .= $mapped_section;
    410     }
    411 
    412     return (join ":", @$classifications);
    413 }
    414 
    415 sub map_section {
    416     my $self = shift (@_);
    417     my ($doctype, $section) = @_;
    418 
    419     # classifications should never need to be mapped
    420     return $section if $doctype eq "classification";
    421 
    422     return "" unless (defined $section) && ($section ne "");
    423 
    424     $sectionmap = {} unless defined $sectionmap;
    425 
    426 
    427 
    428     # get the section into a standard format
    429     $section =~ s/^\.+|\.+$//g;
    430 
    431     # return the mapped section if it has been seen before
    432     if (defined $sectionmap->{$section}) {
    433     return $sectionmap->{$section};
    434     }
    435 
    436     # find out the parent section
    437     my ($parentsection, $num);
    438     if ($section =~ /^(.+)\.(\d+)$/) {
    439     $parentsection = $1;
    440     $num = $2;
    441     } elsif ($section =~ /^(\d+)$/) {
    442     $parentsection = "";
    443     $num = $1;
    444     } else {
    445     print STDERR "mgbuildproc:map_section - misformed section $section\n";
    446     }
    447     $parentsection = "" unless defined $parentsection;
    448 
    449     if ($parentsection eq "") {
    450     $num ++ if $num == 1 and defined $sectionmap->{'0'};
    451     } else {
    452     $num ++ if $num == 1 and defined $sectionmap->{"$parentsection.0"};
    453     }
    454 
    455     # find out the mapped parent section
    456     my $mappedparentsection = $self->map_section ($doctype, $parentsection);
    457 
    458     # find the next unused child section
    459     my $previousnum = $num - 1;
    460     my $previoussection = "";
    461     if ($parentsection eq "") {
    462     $previoussection = $previousnum;
    463     $previoussection = 0 if ($section == 1) && (defined $sectionmap->{'0'});
    464     } else{
    465     $previoussection = "$parentsection.$previousnum";
    466     $previoussection = "$parentsection.0" if ($section =~ /\.1$/) && (defined $sectionmap->{"$parentsection.0"});
    467     }
    468     while (($previousnum > 0) && (!defined $sectionmap->{$previoussection})) {
    469     $previousnum--;
    470     $previoussection = "$parentsection.$previousnum";
    471     $previoussection = $previousnum if $parentsection eq "";
    472     }
    473 
    474     # there has been no children under this parent, this section will be number 1
    475     if ($previousnum <= 0) {
    476     if ($mappedparentsection eq "") {
    477         $sectionmap->{$section} = "1";
    478     } else {
    479         $sectionmap->{$section} = "$mappedparentsection.1";
    480     }
    481 
    482     } else {
    483     # get the previous mapped number
    484     my $previousmapnum = 0;
    485     if ($sectionmap->{$previoussection} =~ /(^|\.)?(\d+)$/) {
    486         $previousmapnum = $2;
    487     }
    488    
    489     # increment it to get this mapped child
    490     my $mappednum = $previousmapnum+1;
    491 
    492     if ($mappedparentsection eq "") {
    493         $sectionmap->{$section} = $mappednum;
    494     } else {
    495         $sectionmap->{$section} = "$mappedparentsection.$mappednum";
    496     }
    497     }
    498 
    499     return $sectionmap->{$section};
    500208}
    501209
     
    591299            if ($self->{'indexing_text'} &&
    592300            $new_text =~ /[\(\)\{\}]/) {
    593             print "arrgh: $new_text\n";
    594301            }
    595302
Note: See TracChangeset for help on using the changeset viewer.