Changeset 315 for trunk/gsdl
- Timestamp:
- 1999-06-30T15:35:07+12:00 (25 years ago)
- Location:
- trunk/gsdl/perllib
- Files:
-
- 4 edited
Legend:
- Unmodified
- Added
- Removed
-
trunk/gsdl/perllib/classify.pm
r232 r315 6 6 7 7 8 $next_classify_num = "0";8 $next_classify_num = 1; 9 9 10 10 # load_classifier will load one classifier. $classinfo is an … … 39 39 40 40 # init_classifiers resets all the classifiers and readys them to process 41 # the documents. They are each given a unique classification OID41 # the documents. 42 42 sub init_classifiers { 43 43 my ($classifiers) = @_; … … 45 45 foreach $classobj (@$classifiers) { 46 46 $classobj->init(); 47 $classobj->set_OID ("CL$next_classify_num");48 $next_classify_num++;49 47 } 50 48 } … … 62 60 # to the gdbm 63 61 sub output_classify_info { 64 my ($classifiers, $handle ) = @_;62 my ($classifiers, $handle, $allclassifications) = @_; 65 63 # $handle = "main::STDOUT"; 66 64 67 my $contains_str = ""; 68 my $first = 1; 69 70 # output each of the classifications 65 # create a classification containing all the info 66 my $classifyinfo = {'classifyOID'=>'browse', 67 'contains'=>[]}; 68 69 # get each of the classifications 71 70 foreach $classobj (@$classifiers) { 72 $classobj->output_classify_info($handle);73 $ contains_str .= ";" unless $first;74 $ first = 0;75 $contains_str .= $classobj->get_OID();71 my $tempinfo = $classobj->get_classify_info(); 72 $tempinfo->{'classifyOID'} = "CL$next_classify_num"; 73 $next_classify_num++; 74 push (@{$classifyinfo->{'contains'}}, $tempinfo); 76 75 } 77 76 78 # output information about each of the classifications 79 print $handle "[browse]\n"; 80 print $handle "<doctype>classify\n"; 81 print $handle "<hastxt>0\n"; 82 print $handle "<contains>$contains_str\n" 83 unless $contains_str eq ""; 84 print $handle '-' x 70, "\n"; 77 &print_classify_info ($handle, $classifyinfo, "", $allclassifications); 85 78 } 86 79 80 sub print_classify_info { 81 my ($handle, $classifyinfo, $OID, $allclassifications) = @_; 82 $OID =~ s/^\.+//; # just for good luck 83 84 # book information is printed elsewhere 85 return if (defined ($classifyinfo->{'OID'})); 86 87 # don't want empty classifications 88 if ($allclassifications || &clean_contents ($classifyinfo)) { 89 90 $OID = $classifyinfo->{'classifyOID'} if defined ($classifyinfo->{'classifyOID'}); 91 92 my $outputtext = "[$OID]\n"; 93 $outputtext .= "<doctype>classify\n"; 94 $outputtext .= "<hastxt>0\n"; 95 $outputtext .= "<classifytype>$classifyinfo->{'classifytype'}\n" if defined $classifyinfo->{'classifytype'}; 96 $outputtext .= "<Title>$classifyinfo->{'Title'}\n" if defined $classifyinfo->{'Title'}; 97 98 $outputtext .= "<contains>"; 99 100 my $next_subOID = 1; 101 my $first = 1; 102 foreach $tempinfo (@{$classifyinfo->{'contains'}}) { 103 # empty contents were made undefined by clean_contents() 104 next unless defined $tempinfo; 105 106 $outputtext .= ";" unless $first; 107 $first = 0; 108 109 if (defined ($tempinfo->{'classifyOID'})) { 110 $outputtext .= $tempinfo->{'classifyOID'}; 111 &print_classify_info ($handle, $tempinfo, $tempinfo->{'classifyOID'}, 112 $allclassifications); 113 } elsif (defined ($tempinfo->{'OID'})) { 114 $outputtext .= $tempinfo->{'OID'}; 115 # note: we don't want to print the contents of the books 116 } else { 117 $outputtext .= "\".$next_subOID"; 118 &print_classify_info ($handle, $tempinfo, "$OID.$next_subOID", 119 $allclassifications); 120 $next_subOID++; 121 } 122 } 123 $outputtext .= "\n"; 124 $outputtext .= '-' x 70 . "\n"; 125 126 print $handle $outputtext; 127 } 128 } 129 130 sub clean_contents { 131 my ($classifyinfo) = @_; 132 my $has_content = 0; 133 134 foreach $content (@{$classifyinfo->{'contains'}}) { 135 if (defined $content->{'OID'}) { 136 # found a book 137 $has_content = 1; 138 } elsif (&clean_contents ($content)) { 139 # there's a book somewhere below 140 $has_content = 1; 141 } else { 142 # section contains no books so we want to remove 143 # it from its parents contents 144 $content = undef; 145 } 146 } 147 return $has_content; 148 } 87 149 88 150 1; -
trunk/gsdl/perllib/mgbuilder.pm
r292 r315 25 25 26 26 sub new { 27 my ($class, $collection, $source_dir, $build_dir, $verbosity, $newgdbm) = @_; 27 my ($class, $collection, $source_dir, $build_dir, 28 $verbosity, $maxdocs, $allclassifications) = @_; 28 29 29 30 # create an mgbuilder object … … 32 33 'build_dir'=>$build_dir, 33 34 'verbosity'=>$verbosity, 34 'newgdbm'=>$newgdbm}, $class; 35 'maxdocs'=>$maxdocs, 36 'allclassifications'=>$allclassifications 37 }, $class; 35 38 36 39 … … 79 82 80 83 # load all the classifiers 81 if ($self->{'newgdbm'}) { 82 $self->{'classifiers'} = []; 83 if (open (COLCFG, $colcfgname)) { 84 while (defined ($line = &cfgread::read_cfg_line('mgbuilder::COLCFG'))) { 85 if (scalar(@$line) >= 2) { 86 my $key = shift (@$line); 87 if ($key eq "classify") { 88 my $classinfo = &classify::load_classifier($line); 89 push (@{$self->{'classifiers'}}, $classinfo) 90 if defined $classinfo; 91 } 84 $self->{'classifiers'} = []; 85 if (open (COLCFG, $colcfgname)) { 86 while (defined ($line = &cfgread::read_cfg_line('mgbuilder::COLCFG'))) { 87 if (scalar(@$line) >= 2) { 88 my $key = shift (@$line); 89 if ($key eq "classify") { 90 my $classinfo = &classify::load_classifier($line); 91 push (@{$self->{'classifiers'}}, $classinfo) 92 if defined $classinfo; 92 93 } 93 94 } 94 close (COLCFG);95 } else {96 print STDERR "mgbuilder::new couldn't read the cfg file $colcfgname\n"; 97 print STDERR " no classifiers were loaded\n";98 }95 } 96 close (COLCFG); 97 } else { 98 print STDERR "mgbuilder::new couldn't read the cfg file $colcfgname\n"; 99 print STDERR " no classifiers were loaded\n"; 99 100 } 100 101 … … 102 103 # been set in the collect.cfg then the receptionist currently defaults to displaying 103 104 # documents as 'Book' 104 if ($self->{'newgdbm'}) { 105 if (open (COLCFG, $colcfgname)) { 106 while (defined ($line = &cfgread::read_cfg_line('mgbuilder::COLCFG'))) { 107 if (scalar(@$line) == 2) { 108 my $key = shift (@$line); 109 if ($key eq "doctype") { 110 $self->{'classifytype'} = shift (@$line); 111 } 105 if (open (COLCFG, $colcfgname)) { 106 while (defined ($line = &cfgread::read_cfg_line('mgbuilder::COLCFG'))) { 107 if (scalar(@$line) == 2) { 108 my $key = shift (@$line); 109 if ($key eq "doctype") { 110 $self->{'classifytype'} = shift (@$line); 112 111 } 113 112 } 114 close (COLCFG);115 }113 } 114 close (COLCFG); 116 115 } 117 116 … … 130 129 131 130 eval("\$self->{'buildproc'} = new $buildproctype(\$collection, " . 132 "\$source_dir, \$build_dir, \$verbosity , \$newgdbm)");131 "\$source_dir, \$build_dir, \$verbosity)"); 133 132 die "$@" if $@; 134 133 … … 183 182 $self->{'buildproc'}->reset(); 184 183 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 185 "", {}, $self->{'buildproc'} );184 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}); 186 185 close (PIPEOUT); 187 186 … … 206 205 $self->{'buildproc'}->reset(); 207 206 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 208 "", {}, $self->{'buildproc'} );207 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}); 209 208 close (PIPEOUT); 210 209 } … … 411 410 $self->{'buildproc'}->reset(); 412 411 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 413 "", {}, $self->{'buildproc'} );412 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}); 414 413 close (PIPEOUT); 415 414 … … 429 428 $self->{'buildproc'}->reset(); 430 429 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 431 "", {}, $self->{'buildproc'} );430 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}); 432 431 close (PIPEOUT); 433 432 … … 490 489 491 490 # init all the classifiers 492 if ($self->{'newgdbm'}) { 493 &classify::init_classifiers ($self->{'classifiers'}); 494 } 491 &classify::init_classifiers ($self->{'classifiers'}); 495 492 496 493 # set up the document processor 497 494 $self->{'buildproc'}->set_output_handle ('mgbuilder::PIPEOUT'); 498 if ($self->{'newgdbm'}) { 499 $self->{'buildproc'}->set_mode ('newinfodb'); 500 $self->{'buildproc'}->set_classifiers ($self->{'classifiers'}); 501 } else { 502 $self->{'buildproc'}->set_mode ('infodb'); 503 } 495 $self->{'buildproc'}->set_mode ('infodb'); 496 $self->{'buildproc'}->set_classifiers ($self->{'classifiers'}); 504 497 $self->{'buildproc'}->set_indexing_text (0); 505 498 … … 511 504 $self->{'buildproc'}->reset(); 512 505 513 my $metadata = {};514 if (defined $self->{'classifytype'} && $self->{'classifytype'} =~ /\w/) {515 $metadata->{'classifytype'} = $self->{'classifytype'};516 }517 506 &plugin::read ($self->{'pluginfo'}, $self->{'source_dir'}, 518 "", $metadata, $self->{'buildproc'});507 "", {}, $self->{'buildproc'}, $self->{'maxdocs'}); 519 508 520 509 # output classification information 521 if ($self->{'newgdbm'}) { 522 &classify::output_classify_info ($self->{'classifiers'}, 'mgbuilder::PIPEOUT'); 523 } 510 &classify::output_classify_info ($self->{'classifiers'}, 'mgbuilder::PIPEOUT', 511 $self->{'allclassifications'}); 524 512 525 513 close (PIPEOUT); -
trunk/gsdl/perllib/mgbuildproc.pm
r292 r315 107 107 } 108 108 109 sub newinfodb {109 sub infodb { 110 110 my $self = shift (@_); 111 111 my ($doc_obj, $filename) = @_; 112 112 my $handle = $self->{'output_handle'}; 113 113 # $handle = "main::STDOUT"; 114 115 # this was used in the old version116 return if ($doc_obj eq 'classifications');117 114 118 115 my $doctype = $doc_obj->get_doc_type(); … … 209 206 &classify::classify_doc ($self->{'classifiers'}, $doc_obj); 210 207 211 }212 213 214 215 sub infodb {216 my $self = shift (@_);217 my ($doc_obj, $filename) = @_;218 my $handle = $self->{'output_handle'};219 220 if ($doc_obj eq 'classifications') {221 # output classifications if all books have been processed222 foreach $key (keys %$saved_classifications) {223 $saved_classifications->{$key}->{'contains'} = undef if224 $saved_classifications->{$key}->{'contains'} eq "";225 $saved_classifications->{$key}->{'parent'} = undef if226 (!defined $saved_classifications->{$key}->{'parent'}) ||227 ($saved_classifications->{$key}->{'parent'} eq "");228 $self->write_to_gdbm ($handle, $key, $saved_classifications->{$key}->{'title'},229 undef, undef, undef, undef,230 $saved_classifications->{$key}->{'contains'}, undef,231 $saved_classifications->{$key}->{'parent'}, undef, undef);232 }233 return;234 }235 236 my $doctype = $doc_obj->get_doc_type();237 238 # only output this document if it is one to be indexed239 return if ($doctype ne "indexed_doc" &&240 $doctype ne "classification");241 242 # found classification document243 $saved_classifications = {} if ($doctype eq "classification");244 $sectionmap = {};245 246 # this is another document247 $self->{'num_docs'} += 1 unless ($doctype eq "classification");248 249 my $section = $doc_obj->get_top_section();250 while (defined $section) {251 # update a few statistics252 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);253 $self->{'num_sections'} += 1 unless ($doctype eq "classification");254 255 my $title = $doc_obj->get_metadata_element($section, "Title");256 my $source = $doc_obj->get_metadata_element($section, "Source");257 my $date = $doc_obj->get_metadata_element($section, "Date");258 my $jobnumber = $doc_obj->get_source_filename();259 260 my $mapped_section = $self->map_section($doctype, $section);261 if ($doctype eq "classification") {262 $mapped_section = $self->char_classification($mapped_section);263 } else {264 $mapped_section = "B.$self->{'num_docs'}.$mapped_section";265 }266 $mapped_section =~ s/\.+$//;267 268 my ($parent, $classification, $creator);269 270 $classification = $self->get_classifications($doc_obj, $section, $mapped_section)271 unless $doctype eq "classification";272 273 if ($section ne $doc_obj->get_top_section()) {274 $parent = $self->map_section($doctype, $doc_obj->get_parent_section($section));275 if ($doctype eq "classification") {276 $parent = $self->char_classification($parent);277 } else {278 $parent = "B.$self->{'num_docs'}.$parent";279 }280 $parent =~ s/\.+$//;281 } else {282 $creator = $doc_obj->get_metadata_element($section, "Creator");283 284 # need filename so we know what directory to look in for associated files285 $filename =~ s/^\/?(.*?\.dir).*$/$1/ if (defined $filename);286 }287 288 if ($doc_obj->is_leaf_section($section)) {289 if ($doctype eq "classification") {290 if (defined $saved_classifications->{$mapped_section}) {291 print STDERR "mgbuildproc:warning: classification $mapped_section " .292 "declared multiple times\n";293 } else {294 $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => "",295 'parent' => $parent};296 }297 } else {298 $self->write_to_gdbm($handle, $mapped_section, $title, $creator, $source, $date, $jobnumber, undef,299 $self->{'num_sections'}, $parent, $classification, $filename);300 }301 } else {302 303 # add the introductory section if it exists304 my $contains = "";305 if ($doc_obj->get_text_length($section) > 0) {306 $contains .= "B.$self->{'num_docs'}." . $self->map_section ($doctype, "$section.0");307 }308 309 # add the rest of the children310 my @children = @{$doc_obj->get_children($section)};311 if ($doctype eq "classification") {312 map {$_ = $self->char_classification($_);} @children;313 if (defined $saved_classifications->{$mapped_section}) {314 print STDERR "mgbuildproc:warning: classification $mapped_section " .315 "declared multiple times\n";316 } else {317 $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => ""};318 $saved_classifications->{$mapped_section}->{'contains'} = join ":", @children;319 $saved_classifications->{$mapped_section}->{'parent'} = $parent;320 }321 } else {322 map {$_ = "B.$self->{'num_docs'}." . $self->map_section($doctype, $_);} @children;323 $contains .= ":" if $contains ne "";324 $contains .= join ":", @children;325 $self->write_to_gdbm ($handle, $mapped_section, $title, $creator, $source,326 $date, $jobnumber, $contains, $self->{'num_sections'},327 $parent, $classification, $filename);328 329 if ($doc_obj->get_text_length($section) > 0) {330 my $intromapsection = "B.$self->{'num_docs'}." .331 $self->map_section($doctype, "$section.0");332 $self->write_to_gdbm ($handle, $intromapsection, "<i>(introductory text)</i>", $creator,333 $source, $date, $jobnumber, undef, $self->{'num_sections'},334 $mapped_section, $classification, $filename);335 }336 }337 }338 $section = $doc_obj->get_next_section($section);339 }340 341 # update classification list with those books that342 # were processed before classification list343 if ($doctype eq "classification" && defined $temp_classifications) {344 foreach $key (keys(%$temp_classifications)) {345 if (!defined $saved_classifications->{$key}) {346 print STDERR "mgbuildproc:$temp_classifications->{$key} belong to " .347 "undefined classification $key\n";348 }349 $saved_classifications->{$key}->{'contains'} = $temp_classifications->{$key}->{'contains'};350 }351 $temp_classifications = undef;352 }353 }354 355 356 sub write_to_gdbm {357 my $self = shift (@_);358 my ($handle, $section, $title, $creator, $source, $date,359 $jobnumber, $contains, $docnum, $parent, $classification, $OID) = @_;360 361 print $handle "[$section]\n";362 print $handle "<t>$title\n" if (defined $title && $title ne "");363 print $handle "<a>$creator\n" if (defined $creator && $creator ne "");364 print $handle "<s>$source\n" if (defined $source && $source ne "");365 print $handle "<i>$date\n" if (defined $date && $date ne "");366 print $handle "<j>$jobnumber\n" if (defined $jobnumber && $jobnumber ne "");367 print $handle "<c>$contains\n" if (defined $contains && $contains ne "");368 print $handle "<d>$docnum\n" if (defined $docnum && $docnum ne "");369 print $handle "<p>$parent\n" if (defined $parent && $parent ne "");370 print $handle "<x>$classification\n" if (defined $classification && $classification ne "");371 print $handle "<o>$OID\n" if defined $OID;372 print $handle '-' x 70, "\n";373 374 if (defined $docnum) {375 print $handle "[$docnum]\n";376 print $handle "<x>$section\n";377 print $handle '-' x 70, "\n";378 }379 }380 381 sub get_classifications {382 my $self = shift (@_);383 my ($doc_obj, $section, $mapped_section) = @_;384 385 my ($classificationsref);386 if (defined $saved_classifications) {387 # classification list has been processed388 $classificationsref = $saved_classifications;389 } else {390 # classification list has yet to be processed, save391 # books in temp_classifications until list is processed392 # and they can be moved into saved_classifications393 $temp_classifications = {} unless defined $temp_classifications;394 $classificationsref = $temp_classifications;395 }396 397 my $classifications = $doc_obj->get_metadata($section, "Subject");398 399 # need to save which books belong in each classification400 # to output later401 foreach $classification (@$classifications) {402 403 if (!defined $classificationsref->{$classification}) {404 $classificationsref->{$classification}->{'parent'} = $doc_obj->get_parent_section($section);405 $classificationsref->{$classification}->{'contains'} = "";406 }407 $classificationsref->{$classification}->{'contains'} .= ":" unless408 $classificationsref->{$classification}->{'contains'} eq "";409 $classificationsref->{$classification}->{'contains'} .= $mapped_section;410 }411 412 return (join ":", @$classifications);413 }414 415 sub map_section {416 my $self = shift (@_);417 my ($doctype, $section) = @_;418 419 # classifications should never need to be mapped420 return $section if $doctype eq "classification";421 422 return "" unless (defined $section) && ($section ne "");423 424 $sectionmap = {} unless defined $sectionmap;425 426 427 428 # get the section into a standard format429 $section =~ s/^\.+|\.+$//g;430 431 # return the mapped section if it has been seen before432 if (defined $sectionmap->{$section}) {433 return $sectionmap->{$section};434 }435 436 # find out the parent section437 my ($parentsection, $num);438 if ($section =~ /^(.+)\.(\d+)$/) {439 $parentsection = $1;440 $num = $2;441 } elsif ($section =~ /^(\d+)$/) {442 $parentsection = "";443 $num = $1;444 } else {445 print STDERR "mgbuildproc:map_section - misformed section $section\n";446 }447 $parentsection = "" unless defined $parentsection;448 449 if ($parentsection eq "") {450 $num ++ if $num == 1 and defined $sectionmap->{'0'};451 } else {452 $num ++ if $num == 1 and defined $sectionmap->{"$parentsection.0"};453 }454 455 # find out the mapped parent section456 my $mappedparentsection = $self->map_section ($doctype, $parentsection);457 458 # find the next unused child section459 my $previousnum = $num - 1;460 my $previoussection = "";461 if ($parentsection eq "") {462 $previoussection = $previousnum;463 $previoussection = 0 if ($section == 1) && (defined $sectionmap->{'0'});464 } else{465 $previoussection = "$parentsection.$previousnum";466 $previoussection = "$parentsection.0" if ($section =~ /\.1$/) && (defined $sectionmap->{"$parentsection.0"});467 }468 while (($previousnum > 0) && (!defined $sectionmap->{$previoussection})) {469 $previousnum--;470 $previoussection = "$parentsection.$previousnum";471 $previoussection = $previousnum if $parentsection eq "";472 }473 474 # there has been no children under this parent, this section will be number 1475 if ($previousnum <= 0) {476 if ($mappedparentsection eq "") {477 $sectionmap->{$section} = "1";478 } else {479 $sectionmap->{$section} = "$mappedparentsection.1";480 }481 482 } else {483 # get the previous mapped number484 my $previousmapnum = 0;485 if ($sectionmap->{$previoussection} =~ /(^|\.)?(\d+)$/) {486 $previousmapnum = $2;487 }488 489 # increment it to get this mapped child490 my $mappednum = $previousmapnum+1;491 492 if ($mappedparentsection eq "") {493 $sectionmap->{$section} = $mappednum;494 } else {495 $sectionmap->{$section} = "$mappedparentsection.$mappednum";496 }497 }498 499 return $sectionmap->{$section};500 208 } 501 209 … … 591 299 if ($self->{'indexing_text'} && 592 300 $new_text =~ /[\(\)\{\}]/) { 593 print "arrgh: $new_text\n";594 301 } 595 302 -
trunk/gsdl/perllib/plugin.pm
r171 r315 32 32 33 33 sub read { 34 my ($pluginfo, $base_dir, $file, $metadata, $processor) = @_; 34 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_; 35 36 my $rv = 0; 35 37 36 38 # pass this file by each of the plugins in turn until one 37 39 # is found which will process it 38 40 foreach $plugobj (@$pluginfo) { 39 if ($plugobj->read($pluginfo, $base_dir, $file, 40 $metadata, $processor)) { 41 return; 42 } 41 $rv = $plugobj->read($pluginfo, $base_dir, $file, 42 $metadata, $processor, $maxdocs); 43 return $rv if defined $rv; 43 44 } 44 45 45 46 if ($processor->{'verbosity'} >= 2) { 46 47 print STDERR "WARNING - no plugin could process " . 47 48 &util::filename_cat($base_dir,$file) . "\n"; 48 49 } 50 return 0; 49 51 } 50 52
Note:
See TracChangeset
for help on using the changeset viewer.