source: trunk/gsdl/perllib/mgbuildproc.pm@ 82

Last change on this file since 82 was 82, checked in by sjboddie, 25 years ago

minor changes to allow individual sections of documents to belong to
classifications as well as the document itself

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.0 KB
Line 
1# This document processor outputs a document
2# for mg to process
3
4
5package mgbuildproc;
6
7use docproc;
8use util;
9
10
11BEGIN {
12 @ISA = ('docproc');
13}
14
15
16sub new {
17 my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;
18 my $self = new docproc ();
19
20 $self->{'collection'} = $collection;
21 $self->{'source_dir'} = $source_dir;
22 $self->{'build_dir'} = $build_dir;
23 $self->{'verbosity'} = $verbosity;
24 $self->{'mode'} = "text";
25 $self->{'index'} = "section:text";
26 $self->{'indexexparr'} = [];
27 $self->{'output_handle'} = "STDOUT";
28 $self->{'num_docs'} = 0;
29 $self->{'num_sections'} = 0;
30 $self->{'num_bytes'} = 0;
31
32 return bless $self, $class;
33}
34
35sub reset {
36 my $self = shift (@_);
37
38 $self->{'num_docs'} = 0;
39 $self->{'num_sections'} = 0;
40 $self->{'num_bytes'} = 0;
41}
42
43sub get_num_docs {
44 my $self = shift (@_);
45
46 return $self->{'num_docs'};
47}
48
49sub get_num_sections {
50 my $self = shift (@_);
51
52 return $self->{'num_sections'};
53}
54
55sub get_num_bytes {
56 my $self = shift (@_);
57
58 return $self->{'num_bytes'};
59}
60
61sub set_output_handle {
62 my $self = shift (@_);
63 my ($handle) = @_;
64
65 $self->{'output_handle'} = $handle;
66}
67
68sub set_mode {
69 my $self = shift (@_);
70 my ($mode) = @_;
71
72 $self->{'mode'} = $mode;
73}
74
75sub set_index {
76 my $self = shift (@_);
77 my ($index, $indexexparr) = @_;
78
79 $self->{'index'} = $index;
80 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
81}
82
83sub process {
84 my $self = shift (@_);
85 my ($doc_obj) = @_;
86 my $method = $self->{'mode'};
87
88 $self->$method($doc_obj);
89}
90
91sub infodb {
92 my $self = shift (@_);
93 my ($doc_obj) = @_;
94 my $handle = $self->{'output_handle'};
95
96 if ($doc_obj eq 'classifications') {
97 # output classifications if all books have been processed
98 foreach $key (keys %$saved_classifications) {
99 $saved_classifications->{$key}->{'contains'} = undef if
100 $saved_classifications->{$key}->{'contains'} eq "";
101 $saved_classifications->{$key}->{'parent'} = undef if
102 (!defined $saved_classifications->{$key}->{'parent'}) ||
103 ($saved_classifications->{$key}->{'parent'} eq "");
104 $self->write_to_gdbm ($handle, $key, $saved_classifications->{$key}->{'title'},
105 undef, undef, undef, undef,
106 $saved_classifications->{$key}->{'contains'}, undef,
107 $saved_classifications->{$key}->{'parent'}, undef, undef);
108 }
109 return;
110 }
111
112 my $doctype = $doc_obj->get_doc_type();
113
114 # only output this document if it is one to be indexed
115 return if ($doctype ne "indexed_doc" &&
116 $doctype ne "classification");
117
118 # found classification document
119 $saved_classifications = {} if ($doctype eq "classification");
120 $sectionmap = {};
121
122 # this is another document
123 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
124
125 my $section = $doc_obj->get_top_section();
126 while (defined $section) {
127 # update a few statistics
128 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
129 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
130
131 my $title = $doc_obj->get_metadata_element($section, "Title");
132 my $source = $doc_obj->get_metadata_element($section, "Source");
133 my $date = $doc_obj->get_metadata_element($section, "Date");
134 my $jobnumber = $doc_obj->get_source_filename();
135
136 my $mapped_section = $self->map_section($doctype, $section);
137 if ($doctype eq "classification") {
138 $mapped_section = $self->char_classification($mapped_section);
139 } else {
140 $mapped_section = "B.$self->{'num_docs'}.$mapped_section";
141 }
142 $mapped_section =~ s/\.+$//;
143
144 my ($parent, $classification, $creator, $OID);
145
146 $classification = $self->get_classifications($doc_obj, $section, $mapped_section)
147 unless $doctype eq "classification";
148
149 if ($section ne $doc_obj->get_top_section()) {
150 $parent = $self->map_section($doctype, $doc_obj->get_parent_section($section));
151 if ($doctype eq "classification") {
152 $parent = $self->char_classification($parent);
153 } else {
154 $parent = "B.$self->{'num_docs'}.$parent";
155 }
156 $parent =~ s/\.+$//;
157 } else {
158 $creator = $doc_obj->get_metadata_element($section, "Creator");
159 $OID = $doc_obj->get_OID();
160 # this is a hack at getting OID to look like the directory path - I'm sure there's
161 # a better way to do it but it's late... Stefan
162 my @OIDchars = split //, $OID;
163 $OID = "";
164 my $count = 0;
165 foreach $i (@OIDchars) {
166 if ($count == 7) {
167 $OID .= "$i/";
168 $count = 0;
169 } else {
170 $OID .= $i;
171 $count ++;
172 }
173 }
174 $OID =~ s/\/$//;
175 $OID .= ".dir";
176 }
177
178 if ($doc_obj->is_leaf_section($section)) {
179 if ($doctype eq "classification") {
180 if (defined $saved_classifications->{$mapped_section}) {
181 print STDERR "mgbuildproc:warning: classification $mapped_section " .
182 "declared multiple times\n";
183 } else {
184 $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => "",
185 'parent' => $parent};
186 }
187 } else {
188 $self->write_to_gdbm($handle, $mapped_section, $title, $creator, $source, $date, $jobnumber, undef,
189 $self->{'num_sections'}, $parent, $classification, $OID);
190 }
191 } else {
192
193 # add the introductory section if it exists
194 my $contains = "";
195 if ($doc_obj->get_text_length($section) > 0) {
196 $contains .= "B.$self->{'num_docs'}." . $self->map_section ($doctype, "$section.0");
197 }
198
199 # add the rest of the children
200 my @children = @{$doc_obj->get_children($section)};
201 if ($doctype eq "classification") {
202 map {$_ = $self->char_classification($_);} @children;
203 if (defined $saved_classifications->{$mapped_section}) {
204 print STDERR "mgbuildproc:warning: classification $mapped_section " .
205 "declared multiple times\n";
206 } else {
207 $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => ""};
208 $saved_classifications->{$mapped_section}->{'contains'} = join ":", @children;
209 $saved_classifications->{$mapped_section}->{'parent'} = $parent;
210 }
211 } else {
212 map {$_ = "B.$self->{'num_docs'}." . $self->map_section($doctype, $_);} @children;
213 $contains .= ":" if $contains ne "";
214 $contains .= join ":", @children;
215 $self->write_to_gdbm ($handle, $mapped_section, $title, $creator, $source,
216 $date, $jobnumber, $contains, $self->{'num_sections'},
217 $parent, $classification, $OID);
218
219 if ($doc_obj->get_text_length($section) > 0) {
220 my $intromapsection = "B.$self->{'num_docs'}." .
221 $self->map_section($doctype, "$section.0");
222 $self->write_to_gdbm ($handle, $intromapsection, "<i>(introductory text)</i>", $creator,
223 $source, $date, $jobnumber, undef, $self->{'num_sections'},
224 $mapped_section, $classification, $OID);
225 }
226 }
227 }
228 $section = $doc_obj->get_next_section($section);
229 }
230
231 # update classification list with those books that
232 # were processed before classification list
233 if ($doctype eq "classification" && defined $temp_classifications) {
234 foreach $key (keys(%$temp_classifications)) {
235 if (!defined $saved_classifications->{$key}) {
236 print STDERR "mgbuildproc:$temp_classifications->{$key} belong to " .
237 "undefined classification $key\n";
238 }
239 $saved_classifications->{$key}->{'contains'} = $temp_classifications->{$key}->{'contains'};
240 }
241 $temp_classifications = undef;
242 }
243}
244
245
246sub write_to_gdbm {
247 my $self = shift (@_);
248 my ($handle, $section, $title, $creator, $source, $date,
249 $jobnumber, $contains, $docnum, $parent, $classification, $OID) = @_;
250
251 print $handle "[$section]\n";
252 print $handle "<t>$title\n" if (defined $title && $title ne "");
253 print $handle "<a>$creator\n" if (defined $creator && $creator ne "");
254 print $handle "<s>$source\n" if (defined $source && $source ne "");
255 print $handle "<i>$date\n" if (defined $date && $date ne "");
256 print $handle "<j>$jobnumber\n" if (defined $jobnumber && $jobnumber ne "");
257 print $handle "<c>$contains\n" if (defined $contains && $contains ne "");
258 print $handle "<d>$docnum\n" if (defined $docnum && $docnum ne "");
259 print $handle "<p>$parent\n" if (defined $parent && $parent ne "");
260 print $handle "<x>$classification\n" if (defined $classification && $classification ne "");
261 print $handle "<o>$OID\n" if defined $OID;
262 print $handle '-' x 70, "\n";
263
264 if (defined $docnum) {
265 print $handle "[$docnum]\n";
266 print $handle "<x>$section\n";
267 print $handle '-' x 70, "\n";
268 }
269}
270
271sub get_classifications {
272 my $self = shift (@_);
273 my ($doc_obj, $section, $mapped_section) = @_;
274
275 my ($classificationsref);
276 if (defined $saved_classifications) {
277 # classification list has been processed
278 $classificationsref = $saved_classifications;
279 } else {
280 # classification list has yet to be processed, save
281 # books in temp_classifications until list is processed
282 # and they can be moved into saved_classifications
283 $temp_classifications = {} unless defined $temp_classifications;
284 $classificationsref = $temp_classifications;
285 }
286
287 my $classifications = $doc_obj->get_metadata($section, "Subject");
288
289 # need to save which books belong in each classification
290 # to output later
291 foreach $classification (@$classifications) {
292
293 if (!defined $classificationsref->{$classification}) {
294 $classificationsref->{$classification}->{'parent'} = $doc_obj->get_parent_section($section);
295 $classificationsref->{$classification}->{'contains'} = "";
296 }
297 $classificationsref->{$classification}->{'contains'} .= ":" unless
298 $classificationsref->{$classification}->{'contains'} eq "";
299 $classificationsref->{$classification}->{'contains'} .= $mapped_section;
300 }
301
302 return (join ":", @$classifications);
303}
304
305sub map_section {
306 my $self = shift (@_);
307 my ($doctype, $section) = @_;
308
309 # classifications should never need to be mapped
310 return $section if $doctype eq "classification";
311
312 return "" unless (defined $section) && ($section ne "");
313
314 $sectionmap = {} unless defined $sectionmap;
315
316
317
318 # get the section into a standard format
319 $section =~ s/^\.+|\.+$//g;
320
321 # return the mapped section if it has been seen before
322 if (defined $sectionmap->{$section}) {
323 return $sectionmap->{$section};
324 }
325
326 # find out the parent section
327 my ($parentsection, $num);
328 if ($section =~ /^(.+)\.(\d+)$/) {
329 $parentsection = $1;
330 $num = $2;
331 } elsif ($section =~ /^(\d+)$/) {
332 $parentsection = "";
333 $num = $1;
334 } else {
335 print STDERR "mgbuildproc:map_section - misformed section $section\n";
336 }
337 $parentsection = "" unless defined $parentsection;
338
339 if ($parentsection eq "") {
340 $num ++ if $num == 1 and defined $sectionmap->{'0'};
341 } else {
342 $num ++ if $num == 1 and defined $sectionmap->{"$parentsection.0"};
343 }
344
345 # find out the mapped parent section
346 my $mappedparentsection = $self->map_section ($doctype, $parentsection);
347
348 # find the next unused child section
349 my $previousnum = $num - 1;
350 my $previoussection = "";
351 if ($parentsection eq "") {
352 $previoussection = $previousnum;
353 $previoussection = 0 if ($section == 1) && (defined $sectionmap->{'0'});
354 } else{
355 $previoussection = "$parentsection.$previousnum";
356 $previoussection = "$parentsection.0" if ($section =~ /\.1$/) && (defined $sectionmap->{"$parentsection.0"});
357 }
358 while (($previousnum > 0) && (!defined $sectionmap->{$previoussection})) {
359 $previousnum--;
360 $previoussection = "$parentsection.$previousnum";
361 $previoussection = $previousnum if $parentsection eq "";
362 }
363
364 # there has been no children under this parent, this section will be number 1
365 if ($previousnum <= 0) {
366 if ($mappedparentsection eq "") {
367 $sectionmap->{$section} = "1";
368 } else {
369 $sectionmap->{$section} = "$mappedparentsection.1";
370 }
371
372 } else {
373 # get the previous mapped number
374 my $previousmapnum = 0;
375 if ($sectionmap->{$previoussection} =~ /(^|\.)?(\d+)$/) {
376 $previousmapnum = $2;
377 }
378
379 # increment it to get this mapped child
380 my $mappednum = $previousmapnum+1;
381
382 if ($mappedparentsection eq "") {
383 $sectionmap->{$section} = $mappednum;
384 } else {
385 $sectionmap->{$section} = "$mappedparentsection.$mappednum";
386 }
387 }
388
389 return $sectionmap->{$section};
390}
391
392sub text {
393 my $self = shift (@_);
394 my ($doc_obj) = @_;
395 my $handle = $self->{'output_handle'};
396 my $indexed_doc = 1;
397
398 # only output this document if it is one to be indexed
399 return if ($doc_obj->get_doc_type() ne "indexed_doc");
400
401 # see if this document belongs to this subcollection
402 foreach $indexexp (@{$self->{'indexexparr'}}) {
403 $indexed_doc = 0;
404 my ($field, $exp, $options) = split /\//, $indexexp;
405 if (defined ($field) && defined ($exp)) {
406 my ($bool) = $field =~ /^(.)/;
407 $field =~ s/^.// if $bool eq '!';
408 if ($field eq "filename") {
409 $field = $doc_obj->get_source_filename();
410 } else {
411 $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
412 }
413 next unless defined $field;
414 if ($bool eq '!') {
415 if ($options =~ /^i$/i) {
416 if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
417 } else {
418 if ($field !~ /$exp/) {$indexed_doc = 1; last;}
419 }
420 } else {
421 if ($options =~ /^i$/i) {
422 if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
423 } else {
424 if ($field =~ /$exp/) {$indexed_doc = 1; last;}
425 }
426 }
427 }
428 }
429
430 # this is another document
431 $self->{'num_docs'} += 1;
432
433 # get the parameters for the output
434 my ($level, $fields) = split (/:/, $self->{'index'});
435 $fields =~ s/\ball\b/Title,Creator,text/;
436 $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
437
438 my $doc_section = 0; # just for this document
439 my $text = "";
440 my $text_extra = "";
441
442 # get the text for this document
443 my $section = $doc_obj->get_top_section();
444 while (defined $section) {
445 # update a few statistics
446 $doc_section++;
447 $self->{'num_sections'} += 1;
448
449 if ($indexed_doc) {
450 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
451 foreach $field (split (/,/, $fields)) {
452 # only deal with this field if it doesn't start with top or
453 # this is the first section
454 my $real_field = $field;
455 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
456 my $new_text = "";
457 if ($real_field eq "text") {
458 $new_text = $doc_obj->get_text ($section);
459 $new_text =~ s/[\cB\cC]//g;
460 $new_text =~ s/(<p\b)/\cC$1/gi;
461
462 } else {
463 $new_text = join ("\cC", @{$doc_obj->get_metadata ($section, $real_field)});
464 }
465
466 $text .= "$new_text\cC";
467 }
468 }
469 }
470
471 if ($level eq "document") { $text_extra .= "\cB"; }
472 else { $text .= "\cB"; }
473
474 $section = $doc_obj->get_next_section($section);
475 }
476
477 print $handle "$text$text_extra";
478}
479
480# converts leading number in classification back
481# to letter it represents
482# i.e 67.2.4 becomes C.2.4
483sub char_classification {
484 my $self = shift (@_);
485 my ($classification) = @_;
486
487 return $classification if $classification eq "";
488 my ($c) = $classification =~ /^\.?(\d+)/;
489 $c = chr($c);
490 $classification =~ s/^\d+/$c/;
491
492 return $classification;
493}
494
495# converts leading letter of a classification into its ascii equivalent
496# i.e C.2.4 becomes 67.2.4
497sub int_classification {
498 my $self = shift (@_);
499 my ($classification) = @_;
500 my $c = ord($classification);
501 $classification =~ s/^./$c/;
502
503 return $classification;
504}
5051;
506
Note: See TracBrowser for help on using the repository browser.