source: trunk/gsdl/perllib/mgbuildproc.pm@ 255

Last change on this file since 255 was 255, checked in by sjboddie, 25 years ago

top level of document now saves archivedir in gdbm

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 17.8 KB
Line 
1# This document processor outputs a document
2# for mg to process
3
4
5package mgbuildproc;
6
7use classify;
8use doc;
9use docproc;
10use util;
11
12
13BEGIN {
14 @ISA = ('docproc');
15}
16
17
18sub new {
19 my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;
20 my $self = new docproc ();
21
22 $self->{'collection'} = $collection;
23 $self->{'source_dir'} = $source_dir;
24 $self->{'build_dir'} = $build_dir;
25 $self->{'verbosity'} = $verbosity;
26 $self->{'classifiers'} = [];
27 $self->{'mode'} = "text";
28 $self->{'index'} = "section:text";
29 $self->{'indexexparr'} = [];
30 $self->{'output_handle'} = "STDOUT";
31 $self->{'num_docs'} = 0;
32 $self->{'num_sections'} = 0;
33 $self->{'num_bytes'} = 0;
34
35 return bless $self, $class;
36}
37
38sub reset {
39 my $self = shift (@_);
40
41 $self->{'num_docs'} = 0;
42 $self->{'num_sections'} = 0;
43 $self->{'num_bytes'} = 0;
44}
45
46sub get_num_docs {
47 my $self = shift (@_);
48
49 return $self->{'num_docs'};
50}
51
52sub get_num_sections {
53 my $self = shift (@_);
54
55 return $self->{'num_sections'};
56}
57
58sub get_num_bytes {
59 my $self = shift (@_);
60
61 return $self->{'num_bytes'};
62}
63
64sub set_output_handle {
65 my $self = shift (@_);
66 my ($handle) = @_;
67
68 $self->{'output_handle'} = $handle;
69}
70
71sub set_mode {
72 my $self = shift (@_);
73 my ($mode) = @_;
74
75 $self->{'mode'} = $mode;
76}
77
78sub set_index {
79 my $self = shift (@_);
80 my ($index, $indexexparr) = @_;
81
82 $self->{'index'} = $index;
83 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
84}
85
86sub set_classifiers {
87 my $self = shift (@_);
88 my ($classifiers) = @_;
89
90 $self->{'classifiers'} = $classifiers;
91}
92
93sub process {
94 my $self = shift (@_);
95 my $method = $self->{'mode'};
96
97 $self->$method(@_);
98}
99
100sub newinfodb {
101 my $self = shift (@_);
102 my ($doc_obj, $filename) = @_;
103 my $handle = $self->{'output_handle'};
104# $handle = "main::STDOUT";
105
106 # this was used in the old version
107 return if ($doc_obj eq 'classifications');
108
109 my $doctype = $doc_obj->get_doc_type();
110
111 # only output this document if it is one to be indexed
112 return if ($doctype ne "indexed_doc");
113
114 # this is another document
115 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
116
117 my $section = $doc_obj->get_top_section ();
118 my $doc_OID = $doc_obj->get_OID();
119 while (defined $section) {
120 # update a few statistics
121 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
122 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
123
124 # output the section name
125 if ($section eq "") { print $handle "[$doc_OID]\n"; }
126 else { print $handle "[$doc_OID.$section]\n"; }
127
128 # output the fact that this document is a document
129 print $handle "<doctype>doc\n";
130
131 # output whether this node contains text
132 if ($doc_obj->get_text_length($section) > 0) {
133 print $handle "<hastxt>1\n";
134 } else {
135 print $handle "<hastxt>0\n";
136 }
137
138 # output all the section metadata
139 my $metadata = $doc_obj->get_all_metadata ($section);
140 foreach $pair (@$metadata) {
141 my ($field, $value) = (@$pair);
142
143 if ($field ne "Identifier" && $field !~ /^gsdl/ &&
144 defined $value && $value ne "") {
145 # escape problematic stuff
146 $value =~ s/\\/\\\\/g;
147 $value =~ s/\n/\\n/g;
148 $value =~ s/\r/\\r/g;
149
150 print $handle "<$field>$value\n";
151 }
152 }
153
154 # output archivedir if at top level
155 if ($section eq $doc_obj->get_top_section()) {
156 my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
157 $archivedir = "" unless defined $archivedir;
158 $archivedir =~ s/^(\/|\\)*//;
159 $archivedir =~ s/(\/|\\)*$//;
160 print $handle "<archivedir>$archivedir\n";
161 }
162
163 # output a list of children
164 my $children = $doc_obj->get_children ($section);
165 if (scalar(@$children) > 0) {
166 print $handle "<contains>";
167 my $firstchild = 1;
168 foreach $child (@$children) {
169 print $handle ";" unless $firstchild;
170 $firstchild = 0;
171 if ($child =~ /^.*?\.(\d+)$/) {
172 print $handle "\".$1";
173 } else {
174 print $handle "\".$child";
175 }
176# if ($child eq "") { print $handle "$doc_OID"; }
177# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
178# else { print $handle "$doc_OID.$section.$child"; }
179 }
180 print $handle "\n";
181 }
182
183 # output the matching document number
184 print $handle "<docnum>$self->{'num_sections'}\n";
185
186 print $handle '-' x 70, "\n";
187
188
189 # output a database entry for the document number
190 print $handle "[$self->{'num_sections'}]\n";
191 if ($section eq "") { print $handle "<section>$doc_OID\n"; }
192 else { print $handle "<section>$doc_OID.$section\n"; }
193 print $handle '-' x 70, "\n";
194
195
196 $section = $doc_obj->get_next_section($section);
197 }
198
199 # classify this document
200 &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
201
202}
203
204
205
206sub infodb {
207 my $self = shift (@_);
208 my ($doc_obj, $filename) = @_;
209 my $handle = $self->{'output_handle'};
210
211 if ($doc_obj eq 'classifications') {
212 # output classifications if all books have been processed
213 foreach $key (keys %$saved_classifications) {
214 $saved_classifications->{$key}->{'contains'} = undef if
215 $saved_classifications->{$key}->{'contains'} eq "";
216 $saved_classifications->{$key}->{'parent'} = undef if
217 (!defined $saved_classifications->{$key}->{'parent'}) ||
218 ($saved_classifications->{$key}->{'parent'} eq "");
219 $self->write_to_gdbm ($handle, $key, $saved_classifications->{$key}->{'title'},
220 undef, undef, undef, undef,
221 $saved_classifications->{$key}->{'contains'}, undef,
222 $saved_classifications->{$key}->{'parent'}, undef, undef);
223 }
224 return;
225 }
226
227 my $doctype = $doc_obj->get_doc_type();
228
229 # only output this document if it is one to be indexed
230 return if ($doctype ne "indexed_doc" &&
231 $doctype ne "classification");
232
233 # found classification document
234 $saved_classifications = {} if ($doctype eq "classification");
235 $sectionmap = {};
236
237 # this is another document
238 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
239
240 my $section = $doc_obj->get_top_section();
241 while (defined $section) {
242 # update a few statistics
243 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
244 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
245
246 my $title = $doc_obj->get_metadata_element($section, "Title");
247 my $source = $doc_obj->get_metadata_element($section, "Source");
248 my $date = $doc_obj->get_metadata_element($section, "Date");
249 my $jobnumber = $doc_obj->get_source_filename();
250
251 my $mapped_section = $self->map_section($doctype, $section);
252 if ($doctype eq "classification") {
253 $mapped_section = $self->char_classification($mapped_section);
254 } else {
255 $mapped_section = "B.$self->{'num_docs'}.$mapped_section";
256 }
257 $mapped_section =~ s/\.+$//;
258
259 my ($parent, $classification, $creator);
260
261 $classification = $self->get_classifications($doc_obj, $section, $mapped_section)
262 unless $doctype eq "classification";
263
264 if ($section ne $doc_obj->get_top_section()) {
265 $parent = $self->map_section($doctype, $doc_obj->get_parent_section($section));
266 if ($doctype eq "classification") {
267 $parent = $self->char_classification($parent);
268 } else {
269 $parent = "B.$self->{'num_docs'}.$parent";
270 }
271 $parent =~ s/\.+$//;
272 } else {
273 $creator = $doc_obj->get_metadata_element($section, "Creator");
274
275 # need filename so we know what directory to look in for associated files
276 $filename =~ s/^\/?(.*?\.dir).*$/$1/ if (defined $filename);
277 }
278
279 if ($doc_obj->is_leaf_section($section)) {
280 if ($doctype eq "classification") {
281 if (defined $saved_classifications->{$mapped_section}) {
282 print STDERR "mgbuildproc:warning: classification $mapped_section " .
283 "declared multiple times\n";
284 } else {
285 $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => "",
286 'parent' => $parent};
287 }
288 } else {
289 $self->write_to_gdbm($handle, $mapped_section, $title, $creator, $source, $date, $jobnumber, undef,
290 $self->{'num_sections'}, $parent, $classification, $filename);
291 }
292 } else {
293
294 # add the introductory section if it exists
295 my $contains = "";
296 if ($doc_obj->get_text_length($section) > 0) {
297 $contains .= "B.$self->{'num_docs'}." . $self->map_section ($doctype, "$section.0");
298 }
299
300 # add the rest of the children
301 my @children = @{$doc_obj->get_children($section)};
302 if ($doctype eq "classification") {
303 map {$_ = $self->char_classification($_);} @children;
304 if (defined $saved_classifications->{$mapped_section}) {
305 print STDERR "mgbuildproc:warning: classification $mapped_section " .
306 "declared multiple times\n";
307 } else {
308 $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => ""};
309 $saved_classifications->{$mapped_section}->{'contains'} = join ":", @children;
310 $saved_classifications->{$mapped_section}->{'parent'} = $parent;
311 }
312 } else {
313 map {$_ = "B.$self->{'num_docs'}." . $self->map_section($doctype, $_);} @children;
314 $contains .= ":" if $contains ne "";
315 $contains .= join ":", @children;
316 $self->write_to_gdbm ($handle, $mapped_section, $title, $creator, $source,
317 $date, $jobnumber, $contains, $self->{'num_sections'},
318 $parent, $classification, $filename);
319
320 if ($doc_obj->get_text_length($section) > 0) {
321 my $intromapsection = "B.$self->{'num_docs'}." .
322 $self->map_section($doctype, "$section.0");
323 $self->write_to_gdbm ($handle, $intromapsection, "<i>(introductory text)</i>", $creator,
324 $source, $date, $jobnumber, undef, $self->{'num_sections'},
325 $mapped_section, $classification, $filename);
326 }
327 }
328 }
329 $section = $doc_obj->get_next_section($section);
330 }
331
332 # update classification list with those books that
333 # were processed before classification list
334 if ($doctype eq "classification" && defined $temp_classifications) {
335 foreach $key (keys(%$temp_classifications)) {
336 if (!defined $saved_classifications->{$key}) {
337 print STDERR "mgbuildproc:$temp_classifications->{$key} belong to " .
338 "undefined classification $key\n";
339 }
340 $saved_classifications->{$key}->{'contains'} = $temp_classifications->{$key}->{'contains'};
341 }
342 $temp_classifications = undef;
343 }
344}
345
346
347sub write_to_gdbm {
348 my $self = shift (@_);
349 my ($handle, $section, $title, $creator, $source, $date,
350 $jobnumber, $contains, $docnum, $parent, $classification, $OID) = @_;
351
352 print $handle "[$section]\n";
353 print $handle "<t>$title\n" if (defined $title && $title ne "");
354 print $handle "<a>$creator\n" if (defined $creator && $creator ne "");
355 print $handle "<s>$source\n" if (defined $source && $source ne "");
356 print $handle "<i>$date\n" if (defined $date && $date ne "");
357 print $handle "<j>$jobnumber\n" if (defined $jobnumber && $jobnumber ne "");
358 print $handle "<c>$contains\n" if (defined $contains && $contains ne "");
359 print $handle "<d>$docnum\n" if (defined $docnum && $docnum ne "");
360 print $handle "<p>$parent\n" if (defined $parent && $parent ne "");
361 print $handle "<x>$classification\n" if (defined $classification && $classification ne "");
362 print $handle "<o>$OID\n" if defined $OID;
363 print $handle '-' x 70, "\n";
364
365 if (defined $docnum) {
366 print $handle "[$docnum]\n";
367 print $handle "<x>$section\n";
368 print $handle '-' x 70, "\n";
369 }
370}
371
372sub get_classifications {
373 my $self = shift (@_);
374 my ($doc_obj, $section, $mapped_section) = @_;
375
376 my ($classificationsref);
377 if (defined $saved_classifications) {
378 # classification list has been processed
379 $classificationsref = $saved_classifications;
380 } else {
381 # classification list has yet to be processed, save
382 # books in temp_classifications until list is processed
383 # and they can be moved into saved_classifications
384 $temp_classifications = {} unless defined $temp_classifications;
385 $classificationsref = $temp_classifications;
386 }
387
388 my $classifications = $doc_obj->get_metadata($section, "Subject");
389
390 # need to save which books belong in each classification
391 # to output later
392 foreach $classification (@$classifications) {
393
394 if (!defined $classificationsref->{$classification}) {
395 $classificationsref->{$classification}->{'parent'} = $doc_obj->get_parent_section($section);
396 $classificationsref->{$classification}->{'contains'} = "";
397 }
398 $classificationsref->{$classification}->{'contains'} .= ":" unless
399 $classificationsref->{$classification}->{'contains'} eq "";
400 $classificationsref->{$classification}->{'contains'} .= $mapped_section;
401 }
402
403 return (join ":", @$classifications);
404}
405
406sub map_section {
407 my $self = shift (@_);
408 my ($doctype, $section) = @_;
409
410 # classifications should never need to be mapped
411 return $section if $doctype eq "classification";
412
413 return "" unless (defined $section) && ($section ne "");
414
415 $sectionmap = {} unless defined $sectionmap;
416
417
418
419 # get the section into a standard format
420 $section =~ s/^\.+|\.+$//g;
421
422 # return the mapped section if it has been seen before
423 if (defined $sectionmap->{$section}) {
424 return $sectionmap->{$section};
425 }
426
427 # find out the parent section
428 my ($parentsection, $num);
429 if ($section =~ /^(.+)\.(\d+)$/) {
430 $parentsection = $1;
431 $num = $2;
432 } elsif ($section =~ /^(\d+)$/) {
433 $parentsection = "";
434 $num = $1;
435 } else {
436 print STDERR "mgbuildproc:map_section - misformed section $section\n";
437 }
438 $parentsection = "" unless defined $parentsection;
439
440 if ($parentsection eq "") {
441 $num ++ if $num == 1 and defined $sectionmap->{'0'};
442 } else {
443 $num ++ if $num == 1 and defined $sectionmap->{"$parentsection.0"};
444 }
445
446 # find out the mapped parent section
447 my $mappedparentsection = $self->map_section ($doctype, $parentsection);
448
449 # find the next unused child section
450 my $previousnum = $num - 1;
451 my $previoussection = "";
452 if ($parentsection eq "") {
453 $previoussection = $previousnum;
454 $previoussection = 0 if ($section == 1) && (defined $sectionmap->{'0'});
455 } else{
456 $previoussection = "$parentsection.$previousnum";
457 $previoussection = "$parentsection.0" if ($section =~ /\.1$/) && (defined $sectionmap->{"$parentsection.0"});
458 }
459 while (($previousnum > 0) && (!defined $sectionmap->{$previoussection})) {
460 $previousnum--;
461 $previoussection = "$parentsection.$previousnum";
462 $previoussection = $previousnum if $parentsection eq "";
463 }
464
465 # there has been no children under this parent, this section will be number 1
466 if ($previousnum <= 0) {
467 if ($mappedparentsection eq "") {
468 $sectionmap->{$section} = "1";
469 } else {
470 $sectionmap->{$section} = "$mappedparentsection.1";
471 }
472
473 } else {
474 # get the previous mapped number
475 my $previousmapnum = 0;
476 if ($sectionmap->{$previoussection} =~ /(^|\.)?(\d+)$/) {
477 $previousmapnum = $2;
478 }
479
480 # increment it to get this mapped child
481 my $mappednum = $previousmapnum+1;
482
483 if ($mappedparentsection eq "") {
484 $sectionmap->{$section} = $mappednum;
485 } else {
486 $sectionmap->{$section} = "$mappedparentsection.$mappednum";
487 }
488 }
489
490 return $sectionmap->{$section};
491}
492
493sub text {
494 my $self = shift (@_);
495 my ($doc_obj) = @_;
496 my $handle = $self->{'output_handle'};
497 my $indexed_doc = 1;
498
499 # only output this document if it is one to be indexed
500 return if ($doc_obj->get_doc_type() ne "indexed_doc");
501
502 # see if this document belongs to this subcollection
503 foreach $indexexp (@{$self->{'indexexparr'}}) {
504 $indexed_doc = 0;
505 my ($field, $exp, $options) = split /\//, $indexexp;
506 if (defined ($field) && defined ($exp)) {
507 my ($bool) = $field =~ /^(.)/;
508 $field =~ s/^.// if $bool eq '!';
509 if ($field eq "filename") {
510 $field = $doc_obj->get_source_filename();
511 } else {
512 $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
513 }
514 next unless defined $field;
515 if ($bool eq '!') {
516 if ($options =~ /^i$/i) {
517 if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
518 } else {
519 if ($field !~ /$exp/) {$indexed_doc = 1; last;}
520 }
521 } else {
522 if ($options =~ /^i$/i) {
523 if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
524 } else {
525 if ($field =~ /$exp/) {$indexed_doc = 1; last;}
526 }
527 }
528 }
529 }
530
531 # this is another document
532 $self->{'num_docs'} += 1;
533
534 # get the parameters for the output
535 my ($level, $fields) = split (/:/, $self->{'index'});
536 $fields =~ s/\ball\b/Title,Creator,text/;
537 $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
538
539 my $doc_section = 0; # just for this document
540 my $text = "";
541 my $text_extra = "";
542
543 # get the text for this document
544 my $section = $doc_obj->get_top_section();
545 while (defined $section) {
546 # update a few statistics
547 $doc_section++;
548 $self->{'num_sections'} += 1;
549
550 if ($indexed_doc) {
551 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
552 foreach $field (split (/,/, $fields)) {
553 # only deal with this field if it doesn't start with top or
554 # this is the first section
555 my $real_field = $field;
556 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
557 my $new_text = "";
558 if ($real_field eq "text") {
559 $new_text = $doc_obj->get_text ($section);
560 $new_text =~ s/[\cB\cC]//g;
561 $new_text =~ s/(<p\b)/\cC$1/gi;
562
563 } else {
564 $new_text = join ("\cC", @{$doc_obj->get_metadata ($section, $real_field)});
565 }
566
567 $text .= "$new_text\cC";
568 }
569 }
570 }
571
572 if ($level eq "document") { $text_extra .= "\cB"; }
573 else { $text .= "\cB"; }
574
575 $section = $doc_obj->get_next_section($section);
576 }
577
578 print $handle "$text$text_extra";
579}
580
581# converts leading number in classification back
582# to letter it represents
583# i.e 67.2.4 becomes C.2.4
584sub char_classification {
585 my $self = shift (@_);
586 my ($classification) = @_;
587
588 return $classification if $classification eq "";
589 my ($c) = $classification =~ /^\.?(\d+)/;
590 $c = chr($c);
591 $classification =~ s/^\d+/$c/;
592
593 return $classification;
594}
595
596# converts leading letter of a classification into its ascii equivalent
597# i.e C.2.4 becomes 67.2.4
598sub int_classification {
599 my $self = shift (@_);
600 my ($classification) = @_;
601 my $c = ord($classification);
602 $classification =~ s/^./$c/;
603
604 return $classification;
605}
6061;
607
Note: See TracBrowser for help on using the repository browser.