source: trunk/gsdl/perllib/mgbuildproc.pm@ 292

Last change on this file since 292 was 292, checked in by rjmcnab, 25 years ago

Added ability to post-process text before it is fed to be indexed or to be
compressed.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 18.5 KB
Line 
1# This document processor outputs a document
2# for mg to process
3
4
5package mgbuildproc;
6
7use classify;
8use doc;
9use docproc;
10use util;
11
12
13BEGIN {
14 @ISA = ('docproc');
15}
16
17
18sub new {
19 my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;
20 my $self = new docproc ();
21
22 $self->{'collection'} = $collection;
23 $self->{'source_dir'} = $source_dir;
24 $self->{'build_dir'} = $build_dir;
25 $self->{'verbosity'} = $verbosity;
26 $self->{'classifiers'} = [];
27 $self->{'mode'} = "text";
28 $self->{'index'} = "section:text";
29 $self->{'indexexparr'} = [];
30 $self->{'output_handle'} = "STDOUT";
31 $self->{'num_docs'} = 0;
32 $self->{'num_sections'} = 0;
33 $self->{'num_bytes'} = 0;
34
35 $self->{'indexing_text'} = 0;
36
37 return bless $self, $class;
38}
39
40sub reset {
41 my $self = shift (@_);
42
43 $self->{'num_docs'} = 0;
44 $self->{'num_sections'} = 0;
45 $self->{'num_bytes'} = 0;
46}
47
48sub get_num_docs {
49 my $self = shift (@_);
50
51 return $self->{'num_docs'};
52}
53
54sub get_num_sections {
55 my $self = shift (@_);
56
57 return $self->{'num_sections'};
58}
59
60sub get_num_bytes {
61 my $self = shift (@_);
62
63 return $self->{'num_bytes'};
64}
65
66sub set_output_handle {
67 my $self = shift (@_);
68 my ($handle) = @_;
69
70 $self->{'output_handle'} = $handle;
71}
72
73sub set_mode {
74 my $self = shift (@_);
75 my ($mode) = @_;
76
77 $self->{'mode'} = $mode;
78}
79
80sub set_index {
81 my $self = shift (@_);
82 my ($index, $indexexparr) = @_;
83
84 $self->{'index'} = $index;
85 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
86}
87
88sub set_classifiers {
89 my $self = shift (@_);
90 my ($classifiers) = @_;
91
92 $self->{'classifiers'} = $classifiers;
93}
94
95sub set_indexing_text {
96 my $self = shift (@_);
97 my ($indexing_text) = @_;
98
99 $self->{'indexing_text'} = $indexing_text;
100}
101
102sub process {
103 my $self = shift (@_);
104 my $method = $self->{'mode'};
105
106 $self->$method(@_);
107}
108
109sub newinfodb {
110 my $self = shift (@_);
111 my ($doc_obj, $filename) = @_;
112 my $handle = $self->{'output_handle'};
113# $handle = "main::STDOUT";
114
115 # this was used in the old version
116 return if ($doc_obj eq 'classifications');
117
118 my $doctype = $doc_obj->get_doc_type();
119
120 # only output this document if it is one to be indexed
121 return if ($doctype ne "indexed_doc");
122
123 # this is another document
124 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
125
126 my $section = $doc_obj->get_top_section ();
127 my $doc_OID = $doc_obj->get_OID();
128 while (defined $section) {
129 # update a few statistics
130 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
131 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
132
133 # output the section name
134 if ($section eq "") { print $handle "[$doc_OID]\n"; }
135 else { print $handle "[$doc_OID.$section]\n"; }
136
137 # output the fact that this document is a document
138 print $handle "<doctype>doc\n";
139
140 # output whether this node contains text
141 if ($doc_obj->get_text_length($section) > 0) {
142 print $handle "<hastxt>1\n";
143 } else {
144 print $handle "<hastxt>0\n";
145 }
146
147 # output all the section metadata
148 my $metadata = $doc_obj->get_all_metadata ($section);
149 foreach $pair (@$metadata) {
150 my ($field, $value) = (@$pair);
151
152 if ($field ne "Identifier" && $field !~ /^gsdl/ &&
153 defined $value && $value ne "") {
154 # escape problematic stuff
155 $value =~ s/\\/\\\\/g;
156 $value =~ s/\n/\\n/g;
157 $value =~ s/\r/\\r/g;
158
159 print $handle "<$field>$value\n";
160 }
161 }
162
163 # output archivedir if at top level
164 if ($section eq $doc_obj->get_top_section()) {
165 my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
166 $archivedir = "" unless defined $archivedir;
167 $archivedir =~ s/^(\/|\\)*//;
168 $archivedir =~ s/(\/|\\)*$//;
169 print $handle "<archivedir>$archivedir\n";
170 }
171
172 # output a list of children
173 my $children = $doc_obj->get_children ($section);
174 if (scalar(@$children) > 0) {
175 print $handle "<contains>";
176 my $firstchild = 1;
177 foreach $child (@$children) {
178 print $handle ";" unless $firstchild;
179 $firstchild = 0;
180 if ($child =~ /^.*?\.(\d+)$/) {
181 print $handle "\".$1";
182 } else {
183 print $handle "\".$child";
184 }
185# if ($child eq "") { print $handle "$doc_OID"; }
186# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
187# else { print $handle "$doc_OID.$section.$child"; }
188 }
189 print $handle "\n";
190 }
191
192 # output the matching document number
193 print $handle "<docnum>$self->{'num_sections'}\n";
194
195 print $handle '-' x 70, "\n";
196
197
198 # output a database entry for the document number
199 print $handle "[$self->{'num_sections'}]\n";
200 if ($section eq "") { print $handle "<section>$doc_OID\n"; }
201 else { print $handle "<section>$doc_OID.$section\n"; }
202 print $handle '-' x 70, "\n";
203
204
205 $section = $doc_obj->get_next_section($section);
206 }
207
208 # classify this document
209 &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
210
211}
212
213
214
215sub infodb {
216 my $self = shift (@_);
217 my ($doc_obj, $filename) = @_;
218 my $handle = $self->{'output_handle'};
219
220 if ($doc_obj eq 'classifications') {
221 # output classifications if all books have been processed
222 foreach $key (keys %$saved_classifications) {
223 $saved_classifications->{$key}->{'contains'} = undef if
224 $saved_classifications->{$key}->{'contains'} eq "";
225 $saved_classifications->{$key}->{'parent'} = undef if
226 (!defined $saved_classifications->{$key}->{'parent'}) ||
227 ($saved_classifications->{$key}->{'parent'} eq "");
228 $self->write_to_gdbm ($handle, $key, $saved_classifications->{$key}->{'title'},
229 undef, undef, undef, undef,
230 $saved_classifications->{$key}->{'contains'}, undef,
231 $saved_classifications->{$key}->{'parent'}, undef, undef);
232 }
233 return;
234 }
235
236 my $doctype = $doc_obj->get_doc_type();
237
238 # only output this document if it is one to be indexed
239 return if ($doctype ne "indexed_doc" &&
240 $doctype ne "classification");
241
242 # found classification document
243 $saved_classifications = {} if ($doctype eq "classification");
244 $sectionmap = {};
245
246 # this is another document
247 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
248
249 my $section = $doc_obj->get_top_section();
250 while (defined $section) {
251 # update a few statistics
252 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
253 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
254
255 my $title = $doc_obj->get_metadata_element($section, "Title");
256 my $source = $doc_obj->get_metadata_element($section, "Source");
257 my $date = $doc_obj->get_metadata_element($section, "Date");
258 my $jobnumber = $doc_obj->get_source_filename();
259
260 my $mapped_section = $self->map_section($doctype, $section);
261 if ($doctype eq "classification") {
262 $mapped_section = $self->char_classification($mapped_section);
263 } else {
264 $mapped_section = "B.$self->{'num_docs'}.$mapped_section";
265 }
266 $mapped_section =~ s/\.+$//;
267
268 my ($parent, $classification, $creator);
269
270 $classification = $self->get_classifications($doc_obj, $section, $mapped_section)
271 unless $doctype eq "classification";
272
273 if ($section ne $doc_obj->get_top_section()) {
274 $parent = $self->map_section($doctype, $doc_obj->get_parent_section($section));
275 if ($doctype eq "classification") {
276 $parent = $self->char_classification($parent);
277 } else {
278 $parent = "B.$self->{'num_docs'}.$parent";
279 }
280 $parent =~ s/\.+$//;
281 } else {
282 $creator = $doc_obj->get_metadata_element($section, "Creator");
283
284 # need filename so we know what directory to look in for associated files
285 $filename =~ s/^\/?(.*?\.dir).*$/$1/ if (defined $filename);
286 }
287
288 if ($doc_obj->is_leaf_section($section)) {
289 if ($doctype eq "classification") {
290 if (defined $saved_classifications->{$mapped_section}) {
291 print STDERR "mgbuildproc:warning: classification $mapped_section " .
292 "declared multiple times\n";
293 } else {
294 $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => "",
295 'parent' => $parent};
296 }
297 } else {
298 $self->write_to_gdbm($handle, $mapped_section, $title, $creator, $source, $date, $jobnumber, undef,
299 $self->{'num_sections'}, $parent, $classification, $filename);
300 }
301 } else {
302
303 # add the introductory section if it exists
304 my $contains = "";
305 if ($doc_obj->get_text_length($section) > 0) {
306 $contains .= "B.$self->{'num_docs'}." . $self->map_section ($doctype, "$section.0");
307 }
308
309 # add the rest of the children
310 my @children = @{$doc_obj->get_children($section)};
311 if ($doctype eq "classification") {
312 map {$_ = $self->char_classification($_);} @children;
313 if (defined $saved_classifications->{$mapped_section}) {
314 print STDERR "mgbuildproc:warning: classification $mapped_section " .
315 "declared multiple times\n";
316 } else {
317 $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => ""};
318 $saved_classifications->{$mapped_section}->{'contains'} = join ":", @children;
319 $saved_classifications->{$mapped_section}->{'parent'} = $parent;
320 }
321 } else {
322 map {$_ = "B.$self->{'num_docs'}." . $self->map_section($doctype, $_);} @children;
323 $contains .= ":" if $contains ne "";
324 $contains .= join ":", @children;
325 $self->write_to_gdbm ($handle, $mapped_section, $title, $creator, $source,
326 $date, $jobnumber, $contains, $self->{'num_sections'},
327 $parent, $classification, $filename);
328
329 if ($doc_obj->get_text_length($section) > 0) {
330 my $intromapsection = "B.$self->{'num_docs'}." .
331 $self->map_section($doctype, "$section.0");
332 $self->write_to_gdbm ($handle, $intromapsection, "<i>(introductory text)</i>", $creator,
333 $source, $date, $jobnumber, undef, $self->{'num_sections'},
334 $mapped_section, $classification, $filename);
335 }
336 }
337 }
338 $section = $doc_obj->get_next_section($section);
339 }
340
341 # update classification list with those books that
342 # were processed before classification list
343 if ($doctype eq "classification" && defined $temp_classifications) {
344 foreach $key (keys(%$temp_classifications)) {
345 if (!defined $saved_classifications->{$key}) {
346 print STDERR "mgbuildproc:$temp_classifications->{$key} belong to " .
347 "undefined classification $key\n";
348 }
349 $saved_classifications->{$key}->{'contains'} = $temp_classifications->{$key}->{'contains'};
350 }
351 $temp_classifications = undef;
352 }
353}
354
355
356sub write_to_gdbm {
357 my $self = shift (@_);
358 my ($handle, $section, $title, $creator, $source, $date,
359 $jobnumber, $contains, $docnum, $parent, $classification, $OID) = @_;
360
361 print $handle "[$section]\n";
362 print $handle "<t>$title\n" if (defined $title && $title ne "");
363 print $handle "<a>$creator\n" if (defined $creator && $creator ne "");
364 print $handle "<s>$source\n" if (defined $source && $source ne "");
365 print $handle "<i>$date\n" if (defined $date && $date ne "");
366 print $handle "<j>$jobnumber\n" if (defined $jobnumber && $jobnumber ne "");
367 print $handle "<c>$contains\n" if (defined $contains && $contains ne "");
368 print $handle "<d>$docnum\n" if (defined $docnum && $docnum ne "");
369 print $handle "<p>$parent\n" if (defined $parent && $parent ne "");
370 print $handle "<x>$classification\n" if (defined $classification && $classification ne "");
371 print $handle "<o>$OID\n" if defined $OID;
372 print $handle '-' x 70, "\n";
373
374 if (defined $docnum) {
375 print $handle "[$docnum]\n";
376 print $handle "<x>$section\n";
377 print $handle '-' x 70, "\n";
378 }
379}
380
381sub get_classifications {
382 my $self = shift (@_);
383 my ($doc_obj, $section, $mapped_section) = @_;
384
385 my ($classificationsref);
386 if (defined $saved_classifications) {
387 # classification list has been processed
388 $classificationsref = $saved_classifications;
389 } else {
390 # classification list has yet to be processed, save
391 # books in temp_classifications until list is processed
392 # and they can be moved into saved_classifications
393 $temp_classifications = {} unless defined $temp_classifications;
394 $classificationsref = $temp_classifications;
395 }
396
397 my $classifications = $doc_obj->get_metadata($section, "Subject");
398
399 # need to save which books belong in each classification
400 # to output later
401 foreach $classification (@$classifications) {
402
403 if (!defined $classificationsref->{$classification}) {
404 $classificationsref->{$classification}->{'parent'} = $doc_obj->get_parent_section($section);
405 $classificationsref->{$classification}->{'contains'} = "";
406 }
407 $classificationsref->{$classification}->{'contains'} .= ":" unless
408 $classificationsref->{$classification}->{'contains'} eq "";
409 $classificationsref->{$classification}->{'contains'} .= $mapped_section;
410 }
411
412 return (join ":", @$classifications);
413}
414
415sub map_section {
416 my $self = shift (@_);
417 my ($doctype, $section) = @_;
418
419 # classifications should never need to be mapped
420 return $section if $doctype eq "classification";
421
422 return "" unless (defined $section) && ($section ne "");
423
424 $sectionmap = {} unless defined $sectionmap;
425
426
427
428 # get the section into a standard format
429 $section =~ s/^\.+|\.+$//g;
430
431 # return the mapped section if it has been seen before
432 if (defined $sectionmap->{$section}) {
433 return $sectionmap->{$section};
434 }
435
436 # find out the parent section
437 my ($parentsection, $num);
438 if ($section =~ /^(.+)\.(\d+)$/) {
439 $parentsection = $1;
440 $num = $2;
441 } elsif ($section =~ /^(\d+)$/) {
442 $parentsection = "";
443 $num = $1;
444 } else {
445 print STDERR "mgbuildproc:map_section - misformed section $section\n";
446 }
447 $parentsection = "" unless defined $parentsection;
448
449 if ($parentsection eq "") {
450 $num ++ if $num == 1 and defined $sectionmap->{'0'};
451 } else {
452 $num ++ if $num == 1 and defined $sectionmap->{"$parentsection.0"};
453 }
454
455 # find out the mapped parent section
456 my $mappedparentsection = $self->map_section ($doctype, $parentsection);
457
458 # find the next unused child section
459 my $previousnum = $num - 1;
460 my $previoussection = "";
461 if ($parentsection eq "") {
462 $previoussection = $previousnum;
463 $previoussection = 0 if ($section == 1) && (defined $sectionmap->{'0'});
464 } else{
465 $previoussection = "$parentsection.$previousnum";
466 $previoussection = "$parentsection.0" if ($section =~ /\.1$/) && (defined $sectionmap->{"$parentsection.0"});
467 }
468 while (($previousnum > 0) && (!defined $sectionmap->{$previoussection})) {
469 $previousnum--;
470 $previoussection = "$parentsection.$previousnum";
471 $previoussection = $previousnum if $parentsection eq "";
472 }
473
474 # there has been no children under this parent, this section will be number 1
475 if ($previousnum <= 0) {
476 if ($mappedparentsection eq "") {
477 $sectionmap->{$section} = "1";
478 } else {
479 $sectionmap->{$section} = "$mappedparentsection.1";
480 }
481
482 } else {
483 # get the previous mapped number
484 my $previousmapnum = 0;
485 if ($sectionmap->{$previoussection} =~ /(^|\.)?(\d+)$/) {
486 $previousmapnum = $2;
487 }
488
489 # increment it to get this mapped child
490 my $mappednum = $previousmapnum+1;
491
492 if ($mappedparentsection eq "") {
493 $sectionmap->{$section} = $mappednum;
494 } else {
495 $sectionmap->{$section} = "$mappedparentsection.$mappednum";
496 }
497 }
498
499 return $sectionmap->{$section};
500}
501
502sub find_paragraphs {
503 $_[1] =~ s/(<p\b)/\cC$1/gi;
504}
505
506sub filter_text {
507 # $self->filter_text ($field, $new_text);
508 # don't want to do anything for this version, however,
509 # in a particular collection you might want to override
510 # this method to post-process certain fields depending on
511 # the field, or whether we are outputting it for indexing
512}
513
514sub text {
515 my $self = shift (@_);
516 my ($doc_obj) = @_;
517 my $handle = $self->{'output_handle'};
518 my $indexed_doc = 1;
519
520 # only output this document if it is one to be indexed
521 return if ($doc_obj->get_doc_type() ne "indexed_doc");
522
523 # see if this document belongs to this subcollection
524 foreach $indexexp (@{$self->{'indexexparr'}}) {
525 $indexed_doc = 0;
526 my ($field, $exp, $options) = split /\//, $indexexp;
527 if (defined ($field) && defined ($exp)) {
528 my ($bool) = $field =~ /^(.)/;
529 $field =~ s/^.// if $bool eq '!';
530 if ($field eq "filename") {
531 $field = $doc_obj->get_source_filename();
532 } else {
533 $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
534 }
535 next unless defined $field;
536 if ($bool eq '!') {
537 if ($options =~ /^i$/i) {
538 if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
539 } else {
540 if ($field !~ /$exp/) {$indexed_doc = 1; last;}
541 }
542 } else {
543 if ($options =~ /^i$/i) {
544 if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
545 } else {
546 if ($field =~ /$exp/) {$indexed_doc = 1; last;}
547 }
548 }
549 }
550 }
551
552 # this is another document
553 $self->{'num_docs'} += 1;
554
555 # get the parameters for the output
556 my ($level, $fields) = split (/:/, $self->{'index'});
557 $fields =~ s/\ball\b/Title,Creator,text/;
558 $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
559
560 my $doc_section = 0; # just for this document
561 my $text = "";
562 my $text_extra = "";
563
564 # get the text for this document
565 my $section = $doc_obj->get_top_section();
566 while (defined $section) {
567 # update a few statistics
568 $doc_section++;
569 $self->{'num_sections'} += 1;
570
571 if ($indexed_doc) {
572 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
573 foreach $field (split (/,/, $fields)) {
574 # only deal with this field if it doesn't start with top or
575 # this is the first section
576 my $real_field = $field;
577 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
578 my $new_text = "";
579 if ($real_field eq "text") {
580 $new_text = $doc_obj->get_text ($section);
581 $new_text =~ s/[\cB\cC]//g;
582 $self->find_paragraphs($new_text);
583
584 } else {
585 $new_text = join ("\cC", @{$doc_obj->get_metadata ($section, $real_field)});
586 }
587
588 # filter the text
589 $self->filter_text ($field, $new_text);
590
591 if ($self->{'indexing_text'} &&
592 $new_text =~ /[\(\)\{\}]/) {
593 print "arrgh: $new_text\n";
594 }
595
596 $text .= "$new_text\cC";
597 }
598 }
599 }
600
601 if ($level eq "document") { $text_extra .= "\cB"; }
602 else { $text .= "\cB"; }
603
604 $section = $doc_obj->get_next_section($section);
605 }
606
607 print $handle "$text$text_extra";
608}
609
610# converts leading number in classification back
611# to letter it represents
612# i.e 67.2.4 becomes C.2.4
613sub char_classification {
614 my $self = shift (@_);
615 my ($classification) = @_;
616
617 return $classification if $classification eq "";
618 my ($c) = $classification =~ /^\.?(\d+)/;
619 $c = chr($c);
620 $classification =~ s/^\d+/$c/;
621
622 return $classification;
623}
624
625# converts leading letter of a classification into its ascii equivalent
626# i.e C.2.4 becomes 67.2.4
627sub int_classification {
628 my $self = shift (@_);
629 my ($classification) = @_;
630 my $c = ord($classification);
631 $classification =~ s/^./$c/;
632
633 return $classification;
634}
6351;
636
Note: See TracBrowser for help on using the repository browser.