Changeset 315 for trunk/gsdl/perllib/mgbuildproc.pm
- Timestamp:
- 1999-06-30T15:35:07+12:00 (25 years ago)
- File:
-
- 1 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/mgbuildproc.pm
r292 r315 107 107 } 108 108 109 sub newinfodb {109 sub infodb { 110 110 my $self = shift (@_); 111 111 my ($doc_obj, $filename) = @_; 112 112 my $handle = $self->{'output_handle'}; 113 113 # $handle = "main::STDOUT"; 114 115 # this was used in the old version116 return if ($doc_obj eq 'classifications');117 114 118 115 my $doctype = $doc_obj->get_doc_type(); … … 209 206 &classify::classify_doc ($self->{'classifiers'}, $doc_obj); 210 207 211 }212 213 214 215 sub infodb {216 my $self = shift (@_);217 my ($doc_obj, $filename) = @_;218 my $handle = $self->{'output_handle'};219 220 if ($doc_obj eq 'classifications') {221 # output classifications if all books have been processed222 foreach $key (keys %$saved_classifications) {223 $saved_classifications->{$key}->{'contains'} = undef if224 $saved_classifications->{$key}->{'contains'} eq "";225 $saved_classifications->{$key}->{'parent'} = undef if226 (!defined $saved_classifications->{$key}->{'parent'}) ||227 ($saved_classifications->{$key}->{'parent'} eq "");228 $self->write_to_gdbm ($handle, $key, $saved_classifications->{$key}->{'title'},229 undef, undef, undef, undef,230 $saved_classifications->{$key}->{'contains'}, undef,231 $saved_classifications->{$key}->{'parent'}, undef, undef);232 }233 return;234 }235 236 my $doctype = $doc_obj->get_doc_type();237 238 # only output this document if it is one to be indexed239 return if ($doctype ne "indexed_doc" &&240 $doctype ne "classification");241 242 # found classification document243 $saved_classifications = {} if ($doctype eq "classification");244 $sectionmap = {};245 246 # this is another document247 $self->{'num_docs'} += 1 unless ($doctype eq "classification");248 249 my $section = $doc_obj->get_top_section();250 while (defined $section) {251 # update a few statistics252 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);253 $self->{'num_sections'} += 1 unless ($doctype eq "classification");254 255 my $title = $doc_obj->get_metadata_element($section, "Title");256 my $source = $doc_obj->get_metadata_element($section, "Source");257 my $date = $doc_obj->get_metadata_element($section, "Date");258 my $jobnumber = $doc_obj->get_source_filename();259 260 my $mapped_section = $self->map_section($doctype, $section);261 if ($doctype eq "classification") {262 $mapped_section = $self->char_classification($mapped_section);263 } else {264 $mapped_section = "B.$self->{'num_docs'}.$mapped_section";265 }266 $mapped_section =~ s/\.+$//;267 268 my ($parent, $classification, $creator);269 270 $classification = $self->get_classifications($doc_obj, $section, $mapped_section)271 unless $doctype eq "classification";272 273 if ($section ne $doc_obj->get_top_section()) {274 $parent = $self->map_section($doctype, $doc_obj->get_parent_section($section));275 if ($doctype eq "classification") {276 $parent = $self->char_classification($parent);277 } else {278 $parent = "B.$self->{'num_docs'}.$parent";279 }280 $parent =~ s/\.+$//;281 } else {282 $creator = $doc_obj->get_metadata_element($section, "Creator");283 284 # need filename so we know what directory to look in for associated files285 $filename =~ s/^\/?(.*?\.dir).*$/$1/ if (defined $filename);286 }287 288 if ($doc_obj->is_leaf_section($section)) {289 if ($doctype eq "classification") {290 if (defined $saved_classifications->{$mapped_section}) {291 print STDERR "mgbuildproc:warning: classification $mapped_section " .292 "declared multiple times\n";293 } else {294 $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => "",295 'parent' => $parent};296 }297 } else {298 $self->write_to_gdbm($handle, $mapped_section, $title, $creator, $source, $date, $jobnumber, undef,299 $self->{'num_sections'}, $parent, $classification, $filename);300 }301 } else {302 303 # add the introductory section if it exists304 my $contains = "";305 if ($doc_obj->get_text_length($section) > 0) {306 $contains .= "B.$self->{'num_docs'}." . $self->map_section ($doctype, "$section.0");307 }308 309 # add the rest of the children310 my @children = @{$doc_obj->get_children($section)};311 if ($doctype eq "classification") {312 map {$_ = $self->char_classification($_);} @children;313 if (defined $saved_classifications->{$mapped_section}) {314 print STDERR "mgbuildproc:warning: classification $mapped_section " .315 "declared multiple times\n";316 } else {317 $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => ""};318 $saved_classifications->{$mapped_section}->{'contains'} = join ":", @children;319 $saved_classifications->{$mapped_section}->{'parent'} = $parent;320 }321 } else {322 map {$_ = "B.$self->{'num_docs'}." . $self->map_section($doctype, $_);} @children;323 $contains .= ":" if $contains ne "";324 $contains .= join ":", @children;325 $self->write_to_gdbm ($handle, $mapped_section, $title, $creator, $source,326 $date, $jobnumber, $contains, $self->{'num_sections'},327 $parent, $classification, $filename);328 329 if ($doc_obj->get_text_length($section) > 0) {330 my $intromapsection = "B.$self->{'num_docs'}." .331 $self->map_section($doctype, "$section.0");332 $self->write_to_gdbm ($handle, $intromapsection, "<i>(introductory text)</i>", $creator,333 $source, $date, $jobnumber, undef, $self->{'num_sections'},334 $mapped_section, $classification, $filename);335 }336 }337 }338 $section = $doc_obj->get_next_section($section);339 }340 341 # update classification list with those books that342 # were processed before classification list343 if ($doctype eq "classification" && defined $temp_classifications) {344 foreach $key (keys(%$temp_classifications)) {345 if (!defined $saved_classifications->{$key}) {346 print STDERR "mgbuildproc:$temp_classifications->{$key} belong to " .347 "undefined classification $key\n";348 }349 $saved_classifications->{$key}->{'contains'} = $temp_classifications->{$key}->{'contains'};350 }351 $temp_classifications = undef;352 }353 }354 355 356 sub write_to_gdbm {357 my $self = shift (@_);358 my ($handle, $section, $title, $creator, $source, $date,359 $jobnumber, $contains, $docnum, $parent, $classification, $OID) = @_;360 361 print $handle "[$section]\n";362 print $handle "<t>$title\n" if (defined $title && $title ne "");363 print $handle "<a>$creator\n" if (defined $creator && $creator ne "");364 print $handle "<s>$source\n" if (defined $source && $source ne "");365 print $handle "<i>$date\n" if (defined $date && $date ne "");366 print $handle "<j>$jobnumber\n" if (defined $jobnumber && $jobnumber ne "");367 print $handle "<c>$contains\n" if (defined $contains && $contains ne "");368 print $handle "<d>$docnum\n" if (defined $docnum && $docnum ne "");369 print $handle "<p>$parent\n" if (defined $parent && $parent ne "");370 print $handle "<x>$classification\n" if (defined $classification && $classification ne "");371 print $handle "<o>$OID\n" if defined $OID;372 print $handle '-' x 70, "\n";373 374 if (defined $docnum) {375 print $handle "[$docnum]\n";376 print $handle "<x>$section\n";377 print $handle '-' x 70, "\n";378 }379 }380 381 sub get_classifications {382 my $self = shift (@_);383 my ($doc_obj, $section, $mapped_section) = @_;384 385 my ($classificationsref);386 if (defined $saved_classifications) {387 # classification list has been processed388 $classificationsref = $saved_classifications;389 } else {390 # classification list has yet to be processed, save391 # books in temp_classifications until list is processed392 # and they can be moved into saved_classifications393 $temp_classifications = {} unless defined $temp_classifications;394 $classificationsref = $temp_classifications;395 }396 397 my $classifications = $doc_obj->get_metadata($section, "Subject");398 399 # need to save which books belong in each classification400 # to output later401 foreach $classification (@$classifications) {402 403 if (!defined $classificationsref->{$classification}) {404 $classificationsref->{$classification}->{'parent'} = $doc_obj->get_parent_section($section);405 $classificationsref->{$classification}->{'contains'} = "";406 }407 $classificationsref->{$classification}->{'contains'} .= ":" unless408 $classificationsref->{$classification}->{'contains'} eq "";409 $classificationsref->{$classification}->{'contains'} .= $mapped_section;410 }411 412 return (join ":", @$classifications);413 }414 415 sub map_section {416 my $self = shift (@_);417 my ($doctype, $section) = @_;418 419 # classifications should never need to be mapped420 return $section if $doctype eq "classification";421 422 return "" unless (defined $section) && ($section ne "");423 424 $sectionmap = {} unless defined $sectionmap;425 426 427 428 # get the section into a standard format429 $section =~ s/^\.+|\.+$//g;430 431 # return the mapped section if it has been seen before432 if (defined $sectionmap->{$section}) {433 return $sectionmap->{$section};434 }435 436 # find out the parent section437 my ($parentsection, $num);438 if ($section =~ /^(.+)\.(\d+)$/) {439 $parentsection = $1;440 $num = $2;441 } elsif ($section =~ /^(\d+)$/) {442 $parentsection = "";443 $num = $1;444 } else {445 print STDERR "mgbuildproc:map_section - misformed section $section\n";446 }447 $parentsection = "" unless defined $parentsection;448 449 if ($parentsection eq "") {450 $num ++ if $num == 1 and defined $sectionmap->{'0'};451 } else {452 $num ++ if $num == 1 and defined $sectionmap->{"$parentsection.0"};453 }454 455 # find out the mapped parent section456 my $mappedparentsection = $self->map_section ($doctype, $parentsection);457 458 # find the next unused child section459 my $previousnum = $num - 1;460 my $previoussection = "";461 if ($parentsection eq "") {462 $previoussection = $previousnum;463 $previoussection = 0 if ($section == 1) && (defined $sectionmap->{'0'});464 } else{465 $previoussection = "$parentsection.$previousnum";466 $previoussection = "$parentsection.0" if ($section =~ /\.1$/) && (defined $sectionmap->{"$parentsection.0"});467 }468 while (($previousnum > 0) && (!defined $sectionmap->{$previoussection})) {469 $previousnum--;470 $previoussection = "$parentsection.$previousnum";471 $previoussection = $previousnum if $parentsection eq "";472 }473 474 # there has been no children under this parent, this section will be number 1475 if ($previousnum <= 0) {476 if ($mappedparentsection eq "") {477 $sectionmap->{$section} = "1";478 } else {479 $sectionmap->{$section} = "$mappedparentsection.1";480 }481 482 } else {483 # get the previous mapped number484 my $previousmapnum = 0;485 if ($sectionmap->{$previoussection} =~ /(^|\.)?(\d+)$/) {486 $previousmapnum = $2;487 }488 489 # increment it to get this mapped child490 my $mappednum = $previousmapnum+1;491 492 if ($mappedparentsection eq "") {493 $sectionmap->{$section} = $mappednum;494 } else {495 $sectionmap->{$section} = "$mappedparentsection.$mappednum";496 }497 }498 499 return $sectionmap->{$section};500 208 } 501 209 … … 591 299 if ($self->{'indexing_text'} && 592 300 $new_text =~ /[\(\)\{\}]/) { 593 print "arrgh: $new_text\n";594 301 } 595 302
Note:
See TracChangeset
for help on using the changeset viewer.