source: trunk/gsdl/perllib/mgbuildproc.pm@ 289

Last change on this file since 289 was 289, checked in by rjmcnab, 25 years ago

Included seperate method for finding 'paragraphs' so that it could
be overriden for different collections.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 17.9 KB
Line 
1# This document processor outputs a document
2# for mg to process
3
4
5package mgbuildproc;
6
7use classify;
8use doc;
9use docproc;
10use util;
11
12
13BEGIN {
14 @ISA = ('docproc');
15}
16
17
18sub new {
19 my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;
20 my $self = new docproc ();
21
22 $self->{'collection'} = $collection;
23 $self->{'source_dir'} = $source_dir;
24 $self->{'build_dir'} = $build_dir;
25 $self->{'verbosity'} = $verbosity;
26 $self->{'classifiers'} = [];
27 $self->{'mode'} = "text";
28 $self->{'index'} = "section:text";
29 $self->{'indexexparr'} = [];
30 $self->{'output_handle'} = "STDOUT";
31 $self->{'num_docs'} = 0;
32 $self->{'num_sections'} = 0;
33 $self->{'num_bytes'} = 0;
34
35 return bless $self, $class;
36}
37
38sub reset {
39 my $self = shift (@_);
40
41 $self->{'num_docs'} = 0;
42 $self->{'num_sections'} = 0;
43 $self->{'num_bytes'} = 0;
44}
45
46sub get_num_docs {
47 my $self = shift (@_);
48
49 return $self->{'num_docs'};
50}
51
52sub get_num_sections {
53 my $self = shift (@_);
54
55 return $self->{'num_sections'};
56}
57
58sub get_num_bytes {
59 my $self = shift (@_);
60
61 return $self->{'num_bytes'};
62}
63
64sub set_output_handle {
65 my $self = shift (@_);
66 my ($handle) = @_;
67
68 $self->{'output_handle'} = $handle;
69}
70
71sub set_mode {
72 my $self = shift (@_);
73 my ($mode) = @_;
74
75 $self->{'mode'} = $mode;
76}
77
78sub set_index {
79 my $self = shift (@_);
80 my ($index, $indexexparr) = @_;
81
82 $self->{'index'} = $index;
83 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
84}
85
86sub set_classifiers {
87 my $self = shift (@_);
88 my ($classifiers) = @_;
89
90 $self->{'classifiers'} = $classifiers;
91}
92
93sub process {
94 my $self = shift (@_);
95 my $method = $self->{'mode'};
96
97 $self->$method(@_);
98}
99
100sub newinfodb {
101 my $self = shift (@_);
102 my ($doc_obj, $filename) = @_;
103 my $handle = $self->{'output_handle'};
104# $handle = "main::STDOUT";
105
106 # this was used in the old version
107 return if ($doc_obj eq 'classifications');
108
109 my $doctype = $doc_obj->get_doc_type();
110
111 # only output this document if it is one to be indexed
112 return if ($doctype ne "indexed_doc");
113
114 # this is another document
115 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
116
117 my $section = $doc_obj->get_top_section ();
118 my $doc_OID = $doc_obj->get_OID();
119 while (defined $section) {
120 # update a few statistics
121 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
122 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
123
124 # output the section name
125 if ($section eq "") { print $handle "[$doc_OID]\n"; }
126 else { print $handle "[$doc_OID.$section]\n"; }
127
128 # output the fact that this document is a document
129 print $handle "<doctype>doc\n";
130
131 # output whether this node contains text
132 if ($doc_obj->get_text_length($section) > 0) {
133 print $handle "<hastxt>1\n";
134 } else {
135 print $handle "<hastxt>0\n";
136 }
137
138 # output all the section metadata
139 my $metadata = $doc_obj->get_all_metadata ($section);
140 foreach $pair (@$metadata) {
141 my ($field, $value) = (@$pair);
142
143 if ($field ne "Identifier" && $field !~ /^gsdl/ &&
144 defined $value && $value ne "") {
145 # escape problematic stuff
146 $value =~ s/\\/\\\\/g;
147 $value =~ s/\n/\\n/g;
148 $value =~ s/\r/\\r/g;
149
150 print $handle "<$field>$value\n";
151 }
152 }
153
154 # output archivedir if at top level
155 if ($section eq $doc_obj->get_top_section()) {
156 my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
157 $archivedir = "" unless defined $archivedir;
158 $archivedir =~ s/^(\/|\\)*//;
159 $archivedir =~ s/(\/|\\)*$//;
160 print $handle "<archivedir>$archivedir\n";
161 }
162
163 # output a list of children
164 my $children = $doc_obj->get_children ($section);
165 if (scalar(@$children) > 0) {
166 print $handle "<contains>";
167 my $firstchild = 1;
168 foreach $child (@$children) {
169 print $handle ";" unless $firstchild;
170 $firstchild = 0;
171 if ($child =~ /^.*?\.(\d+)$/) {
172 print $handle "\".$1";
173 } else {
174 print $handle "\".$child";
175 }
176# if ($child eq "") { print $handle "$doc_OID"; }
177# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
178# else { print $handle "$doc_OID.$section.$child"; }
179 }
180 print $handle "\n";
181 }
182
183 # output the matching document number
184 print $handle "<docnum>$self->{'num_sections'}\n";
185
186 print $handle '-' x 70, "\n";
187
188
189 # output a database entry for the document number
190 print $handle "[$self->{'num_sections'}]\n";
191 if ($section eq "") { print $handle "<section>$doc_OID\n"; }
192 else { print $handle "<section>$doc_OID.$section\n"; }
193 print $handle '-' x 70, "\n";
194
195
196 $section = $doc_obj->get_next_section($section);
197 }
198
199 # classify this document
200 &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
201
202}
203
204
205
206sub infodb {
207 my $self = shift (@_);
208 my ($doc_obj, $filename) = @_;
209 my $handle = $self->{'output_handle'};
210
211 if ($doc_obj eq 'classifications') {
212 # output classifications if all books have been processed
213 foreach $key (keys %$saved_classifications) {
214 $saved_classifications->{$key}->{'contains'} = undef if
215 $saved_classifications->{$key}->{'contains'} eq "";
216 $saved_classifications->{$key}->{'parent'} = undef if
217 (!defined $saved_classifications->{$key}->{'parent'}) ||
218 ($saved_classifications->{$key}->{'parent'} eq "");
219 $self->write_to_gdbm ($handle, $key, $saved_classifications->{$key}->{'title'},
220 undef, undef, undef, undef,
221 $saved_classifications->{$key}->{'contains'}, undef,
222 $saved_classifications->{$key}->{'parent'}, undef, undef);
223 }
224 return;
225 }
226
227 my $doctype = $doc_obj->get_doc_type();
228
229 # only output this document if it is one to be indexed
230 return if ($doctype ne "indexed_doc" &&
231 $doctype ne "classification");
232
233 # found classification document
234 $saved_classifications = {} if ($doctype eq "classification");
235 $sectionmap = {};
236
237 # this is another document
238 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
239
240 my $section = $doc_obj->get_top_section();
241 while (defined $section) {
242 # update a few statistics
243 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
244 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
245
246 my $title = $doc_obj->get_metadata_element($section, "Title");
247 my $source = $doc_obj->get_metadata_element($section, "Source");
248 my $date = $doc_obj->get_metadata_element($section, "Date");
249 my $jobnumber = $doc_obj->get_source_filename();
250
251 my $mapped_section = $self->map_section($doctype, $section);
252 if ($doctype eq "classification") {
253 $mapped_section = $self->char_classification($mapped_section);
254 } else {
255 $mapped_section = "B.$self->{'num_docs'}.$mapped_section";
256 }
257 $mapped_section =~ s/\.+$//;
258
259 my ($parent, $classification, $creator);
260
261 $classification = $self->get_classifications($doc_obj, $section, $mapped_section)
262 unless $doctype eq "classification";
263
264 if ($section ne $doc_obj->get_top_section()) {
265 $parent = $self->map_section($doctype, $doc_obj->get_parent_section($section));
266 if ($doctype eq "classification") {
267 $parent = $self->char_classification($parent);
268 } else {
269 $parent = "B.$self->{'num_docs'}.$parent";
270 }
271 $parent =~ s/\.+$//;
272 } else {
273 $creator = $doc_obj->get_metadata_element($section, "Creator");
274
275 # need filename so we know what directory to look in for associated files
276 $filename =~ s/^\/?(.*?\.dir).*$/$1/ if (defined $filename);
277 }
278
279 if ($doc_obj->is_leaf_section($section)) {
280 if ($doctype eq "classification") {
281 if (defined $saved_classifications->{$mapped_section}) {
282 print STDERR "mgbuildproc:warning: classification $mapped_section " .
283 "declared multiple times\n";
284 } else {
285 $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => "",
286 'parent' => $parent};
287 }
288 } else {
289 $self->write_to_gdbm($handle, $mapped_section, $title, $creator, $source, $date, $jobnumber, undef,
290 $self->{'num_sections'}, $parent, $classification, $filename);
291 }
292 } else {
293
294 # add the introductory section if it exists
295 my $contains = "";
296 if ($doc_obj->get_text_length($section) > 0) {
297 $contains .= "B.$self->{'num_docs'}." . $self->map_section ($doctype, "$section.0");
298 }
299
300 # add the rest of the children
301 my @children = @{$doc_obj->get_children($section)};
302 if ($doctype eq "classification") {
303 map {$_ = $self->char_classification($_);} @children;
304 if (defined $saved_classifications->{$mapped_section}) {
305 print STDERR "mgbuildproc:warning: classification $mapped_section " .
306 "declared multiple times\n";
307 } else {
308 $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => ""};
309 $saved_classifications->{$mapped_section}->{'contains'} = join ":", @children;
310 $saved_classifications->{$mapped_section}->{'parent'} = $parent;
311 }
312 } else {
313 map {$_ = "B.$self->{'num_docs'}." . $self->map_section($doctype, $_);} @children;
314 $contains .= ":" if $contains ne "";
315 $contains .= join ":", @children;
316 $self->write_to_gdbm ($handle, $mapped_section, $title, $creator, $source,
317 $date, $jobnumber, $contains, $self->{'num_sections'},
318 $parent, $classification, $filename);
319
320 if ($doc_obj->get_text_length($section) > 0) {
321 my $intromapsection = "B.$self->{'num_docs'}." .
322 $self->map_section($doctype, "$section.0");
323 $self->write_to_gdbm ($handle, $intromapsection, "<i>(introductory text)</i>", $creator,
324 $source, $date, $jobnumber, undef, $self->{'num_sections'},
325 $mapped_section, $classification, $filename);
326 }
327 }
328 }
329 $section = $doc_obj->get_next_section($section);
330 }
331
332 # update classification list with those books that
333 # were processed before classification list
334 if ($doctype eq "classification" && defined $temp_classifications) {
335 foreach $key (keys(%$temp_classifications)) {
336 if (!defined $saved_classifications->{$key}) {
337 print STDERR "mgbuildproc:$temp_classifications->{$key} belong to " .
338 "undefined classification $key\n";
339 }
340 $saved_classifications->{$key}->{'contains'} = $temp_classifications->{$key}->{'contains'};
341 }
342 $temp_classifications = undef;
343 }
344}
345
346
347sub write_to_gdbm {
348 my $self = shift (@_);
349 my ($handle, $section, $title, $creator, $source, $date,
350 $jobnumber, $contains, $docnum, $parent, $classification, $OID) = @_;
351
352 print $handle "[$section]\n";
353 print $handle "<t>$title\n" if (defined $title && $title ne "");
354 print $handle "<a>$creator\n" if (defined $creator && $creator ne "");
355 print $handle "<s>$source\n" if (defined $source && $source ne "");
356 print $handle "<i>$date\n" if (defined $date && $date ne "");
357 print $handle "<j>$jobnumber\n" if (defined $jobnumber && $jobnumber ne "");
358 print $handle "<c>$contains\n" if (defined $contains && $contains ne "");
359 print $handle "<d>$docnum\n" if (defined $docnum && $docnum ne "");
360 print $handle "<p>$parent\n" if (defined $parent && $parent ne "");
361 print $handle "<x>$classification\n" if (defined $classification && $classification ne "");
362 print $handle "<o>$OID\n" if defined $OID;
363 print $handle '-' x 70, "\n";
364
365 if (defined $docnum) {
366 print $handle "[$docnum]\n";
367 print $handle "<x>$section\n";
368 print $handle '-' x 70, "\n";
369 }
370}
371
372sub get_classifications {
373 my $self = shift (@_);
374 my ($doc_obj, $section, $mapped_section) = @_;
375
376 my ($classificationsref);
377 if (defined $saved_classifications) {
378 # classification list has been processed
379 $classificationsref = $saved_classifications;
380 } else {
381 # classification list has yet to be processed, save
382 # books in temp_classifications until list is processed
383 # and they can be moved into saved_classifications
384 $temp_classifications = {} unless defined $temp_classifications;
385 $classificationsref = $temp_classifications;
386 }
387
388 my $classifications = $doc_obj->get_metadata($section, "Subject");
389
390 # need to save which books belong in each classification
391 # to output later
392 foreach $classification (@$classifications) {
393
394 if (!defined $classificationsref->{$classification}) {
395 $classificationsref->{$classification}->{'parent'} = $doc_obj->get_parent_section($section);
396 $classificationsref->{$classification}->{'contains'} = "";
397 }
398 $classificationsref->{$classification}->{'contains'} .= ":" unless
399 $classificationsref->{$classification}->{'contains'} eq "";
400 $classificationsref->{$classification}->{'contains'} .= $mapped_section;
401 }
402
403 return (join ":", @$classifications);
404}
405
406sub map_section {
407 my $self = shift (@_);
408 my ($doctype, $section) = @_;
409
410 # classifications should never need to be mapped
411 return $section if $doctype eq "classification";
412
413 return "" unless (defined $section) && ($section ne "");
414
415 $sectionmap = {} unless defined $sectionmap;
416
417
418
419 # get the section into a standard format
420 $section =~ s/^\.+|\.+$//g;
421
422 # return the mapped section if it has been seen before
423 if (defined $sectionmap->{$section}) {
424 return $sectionmap->{$section};
425 }
426
427 # find out the parent section
428 my ($parentsection, $num);
429 if ($section =~ /^(.+)\.(\d+)$/) {
430 $parentsection = $1;
431 $num = $2;
432 } elsif ($section =~ /^(\d+)$/) {
433 $parentsection = "";
434 $num = $1;
435 } else {
436 print STDERR "mgbuildproc:map_section - misformed section $section\n";
437 }
438 $parentsection = "" unless defined $parentsection;
439
440 if ($parentsection eq "") {
441 $num ++ if $num == 1 and defined $sectionmap->{'0'};
442 } else {
443 $num ++ if $num == 1 and defined $sectionmap->{"$parentsection.0"};
444 }
445
446 # find out the mapped parent section
447 my $mappedparentsection = $self->map_section ($doctype, $parentsection);
448
449 # find the next unused child section
450 my $previousnum = $num - 1;
451 my $previoussection = "";
452 if ($parentsection eq "") {
453 $previoussection = $previousnum;
454 $previoussection = 0 if ($section == 1) && (defined $sectionmap->{'0'});
455 } else{
456 $previoussection = "$parentsection.$previousnum";
457 $previoussection = "$parentsection.0" if ($section =~ /\.1$/) && (defined $sectionmap->{"$parentsection.0"});
458 }
459 while (($previousnum > 0) && (!defined $sectionmap->{$previoussection})) {
460 $previousnum--;
461 $previoussection = "$parentsection.$previousnum";
462 $previoussection = $previousnum if $parentsection eq "";
463 }
464
465 # there has been no children under this parent, this section will be number 1
466 if ($previousnum <= 0) {
467 if ($mappedparentsection eq "") {
468 $sectionmap->{$section} = "1";
469 } else {
470 $sectionmap->{$section} = "$mappedparentsection.1";
471 }
472
473 } else {
474 # get the previous mapped number
475 my $previousmapnum = 0;
476 if ($sectionmap->{$previoussection} =~ /(^|\.)?(\d+)$/) {
477 $previousmapnum = $2;
478 }
479
480 # increment it to get this mapped child
481 my $mappednum = $previousmapnum+1;
482
483 if ($mappedparentsection eq "") {
484 $sectionmap->{$section} = $mappednum;
485 } else {
486 $sectionmap->{$section} = "$mappedparentsection.$mappednum";
487 }
488 }
489
490 return $sectionmap->{$section};
491}
492
493sub find_paragraphs {
494 $_[1] =~ s/(<p\b)/\cC$1/gi;
495}
496
497sub text {
498 my $self = shift (@_);
499 my ($doc_obj) = @_;
500 my $handle = $self->{'output_handle'};
501 my $indexed_doc = 1;
502
503 # only output this document if it is one to be indexed
504 return if ($doc_obj->get_doc_type() ne "indexed_doc");
505
506 # see if this document belongs to this subcollection
507 foreach $indexexp (@{$self->{'indexexparr'}}) {
508 $indexed_doc = 0;
509 my ($field, $exp, $options) = split /\//, $indexexp;
510 if (defined ($field) && defined ($exp)) {
511 my ($bool) = $field =~ /^(.)/;
512 $field =~ s/^.// if $bool eq '!';
513 if ($field eq "filename") {
514 $field = $doc_obj->get_source_filename();
515 } else {
516 $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
517 }
518 next unless defined $field;
519 if ($bool eq '!') {
520 if ($options =~ /^i$/i) {
521 if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
522 } else {
523 if ($field !~ /$exp/) {$indexed_doc = 1; last;}
524 }
525 } else {
526 if ($options =~ /^i$/i) {
527 if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
528 } else {
529 if ($field =~ /$exp/) {$indexed_doc = 1; last;}
530 }
531 }
532 }
533 }
534
535 # this is another document
536 $self->{'num_docs'} += 1;
537
538 # get the parameters for the output
539 my ($level, $fields) = split (/:/, $self->{'index'});
540 $fields =~ s/\ball\b/Title,Creator,text/;
541 $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
542
543 my $doc_section = 0; # just for this document
544 my $text = "";
545 my $text_extra = "";
546
547 # get the text for this document
548 my $section = $doc_obj->get_top_section();
549 while (defined $section) {
550 # update a few statistics
551 $doc_section++;
552 $self->{'num_sections'} += 1;
553
554 if ($indexed_doc) {
555 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
556 foreach $field (split (/,/, $fields)) {
557 # only deal with this field if it doesn't start with top or
558 # this is the first section
559 my $real_field = $field;
560 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
561 my $new_text = "";
562 if ($real_field eq "text") {
563 $new_text = $doc_obj->get_text ($section);
564 $new_text =~ s/[\cB\cC]//g;
565 $self->find_paragraphs($new_text);
566
567 } else {
568 $new_text = join ("\cC", @{$doc_obj->get_metadata ($section, $real_field)});
569 }
570
571 $text .= "$new_text\cC";
572 }
573 }
574 }
575
576 if ($level eq "document") { $text_extra .= "\cB"; }
577 else { $text .= "\cB"; }
578
579 $section = $doc_obj->get_next_section($section);
580 }
581
582 print $handle "$text$text_extra";
583}
584
585# converts leading number in classification back
586# to letter it represents
587# i.e 67.2.4 becomes C.2.4
588sub char_classification {
589 my $self = shift (@_);
590 my ($classification) = @_;
591
592 return $classification if $classification eq "";
593 my ($c) = $classification =~ /^\.?(\d+)/;
594 $c = chr($c);
595 $classification =~ s/^\d+/$c/;
596
597 return $classification;
598}
599
600# converts leading letter of a classification into its ascii equivalent
601# i.e C.2.4 becomes 67.2.4
602sub int_classification {
603 my $self = shift (@_);
604 my ($classification) = @_;
605 my $c = ord($classification);
606 $classification =~ s/^./$c/;
607
608 return $classification;
609}
6101;
611
Note: See TracBrowser for help on using the repository browser.