source: trunk/gsdl/perllib/mgbuildproc.pm@ 139

Last change on this file since 139 was 139, checked in by sjboddie, 25 years ago

Got building stuff to handle subcollections and language subcollections

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 14.7 KB
Line 
1# This document processor outputs a document
2# for mg to process
3
4
5package mgbuildproc;
6
7use docproc;
8use util;
9
10
11BEGIN {
12 @ISA = ('docproc');
13}
14
15
16sub new {
17 my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;
18 my $self = new docproc ();
19
20 $self->{'collection'} = $collection;
21 $self->{'source_dir'} = $source_dir;
22 $self->{'build_dir'} = $build_dir;
23 $self->{'verbosity'} = $verbosity;
24 $self->{'mode'} = "text";
25 $self->{'index'} = "section:text";
26 $self->{'indexexparr'} = [];
27 $self->{'output_handle'} = "STDOUT";
28 $self->{'num_docs'} = 0;
29 $self->{'num_sections'} = 0;
30 $self->{'num_bytes'} = 0;
31
32 return bless $self, $class;
33}
34
35sub reset {
36 my $self = shift (@_);
37
38 $self->{'num_docs'} = 0;
39 $self->{'num_sections'} = 0;
40 $self->{'num_bytes'} = 0;
41}
42
43sub get_num_docs {
44 my $self = shift (@_);
45
46 return $self->{'num_docs'};
47}
48
49sub get_num_sections {
50 my $self = shift (@_);
51
52 return $self->{'num_sections'};
53}
54
55sub get_num_bytes {
56 my $self = shift (@_);
57
58 return $self->{'num_bytes'};
59}
60
61sub set_output_handle {
62 my $self = shift (@_);
63 my ($handle) = @_;
64
65 $self->{'output_handle'} = $handle;
66}
67
68sub set_mode {
69 my $self = shift (@_);
70 my ($mode) = @_;
71
72 $self->{'mode'} = $mode;
73}
74
75sub set_index {
76 my $self = shift (@_);
77 my ($index, $indexexparr) = @_;
78
79 $self->{'index'} = $index;
80 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
81}
82
83sub process {
84 my $self = shift (@_);
85 my $method = $self->{'mode'};
86
87 $self->$method(@_);
88}
89
90sub infodb {
91 my $self = shift (@_);
92 my ($doc_obj, $filename) = @_;
93 my $handle = $self->{'output_handle'};
94
95 if ($doc_obj eq 'classifications') {
96 # output classifications if all books have been processed
97 foreach $key (keys %$saved_classifications) {
98 $saved_classifications->{$key}->{'contains'} = undef if
99 $saved_classifications->{$key}->{'contains'} eq "";
100 $saved_classifications->{$key}->{'parent'} = undef if
101 (!defined $saved_classifications->{$key}->{'parent'}) ||
102 ($saved_classifications->{$key}->{'parent'} eq "");
103 $self->write_to_gdbm ($handle, $key, $saved_classifications->{$key}->{'title'},
104 undef, undef, undef, undef,
105 $saved_classifications->{$key}->{'contains'}, undef,
106 $saved_classifications->{$key}->{'parent'}, undef, undef);
107 }
108 return;
109 }
110
111 my $doctype = $doc_obj->get_doc_type();
112
113 # only output this document if it is one to be indexed
114 return if ($doctype ne "indexed_doc" &&
115 $doctype ne "classification");
116
117 # found classification document
118 $saved_classifications = {} if ($doctype eq "classification");
119 $sectionmap = {};
120
121 # this is another document
122 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
123
124 my $section = $doc_obj->get_top_section();
125 while (defined $section) {
126 # update a few statistics
127 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
128 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
129
130 my $title = $doc_obj->get_metadata_element($section, "Title");
131 my $source = $doc_obj->get_metadata_element($section, "Source");
132 my $date = $doc_obj->get_metadata_element($section, "Date");
133 my $jobnumber = $doc_obj->get_source_filename();
134
135 my $mapped_section = $self->map_section($doctype, $section);
136 if ($doctype eq "classification") {
137 $mapped_section = $self->char_classification($mapped_section);
138 } else {
139 $mapped_section = "B.$self->{'num_docs'}.$mapped_section";
140 }
141 $mapped_section =~ s/\.+$//;
142
143 my ($parent, $classification, $creator);
144
145 $classification = $self->get_classifications($doc_obj, $section, $mapped_section)
146 unless $doctype eq "classification";
147
148 if ($section ne $doc_obj->get_top_section()) {
149 $parent = $self->map_section($doctype, $doc_obj->get_parent_section($section));
150 if ($doctype eq "classification") {
151 $parent = $self->char_classification($parent);
152 } else {
153 $parent = "B.$self->{'num_docs'}.$parent";
154 }
155 $parent =~ s/\.+$//;
156 } else {
157 $creator = $doc_obj->get_metadata_element($section, "Creator");
158
159 # need filename so we know what directory to look in for associated files
160 $filename =~ s/^\/?(.*?\.dir).*$/$1/ if (defined $filename);
161 }
162
163 if ($doc_obj->is_leaf_section($section)) {
164 if ($doctype eq "classification") {
165 if (defined $saved_classifications->{$mapped_section}) {
166 print STDERR "mgbuildproc:warning: classification $mapped_section " .
167 "declared multiple times\n";
168 } else {
169 $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => "",
170 'parent' => $parent};
171 }
172 } else {
173 $self->write_to_gdbm($handle, $mapped_section, $title, $creator, $source, $date, $jobnumber, undef,
174 $self->{'num_sections'}, $parent, $classification, $filename);
175 }
176 } else {
177
178 # add the introductory section if it exists
179 my $contains = "";
180 if ($doc_obj->get_text_length($section) > 0) {
181 $contains .= "B.$self->{'num_docs'}." . $self->map_section ($doctype, "$section.0");
182 }
183
184 # add the rest of the children
185 my @children = @{$doc_obj->get_children($section)};
186 if ($doctype eq "classification") {
187 map {$_ = $self->char_classification($_);} @children;
188 if (defined $saved_classifications->{$mapped_section}) {
189 print STDERR "mgbuildproc:warning: classification $mapped_section " .
190 "declared multiple times\n";
191 } else {
192 $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => ""};
193 $saved_classifications->{$mapped_section}->{'contains'} = join ":", @children;
194 $saved_classifications->{$mapped_section}->{'parent'} = $parent;
195 }
196 } else {
197 map {$_ = "B.$self->{'num_docs'}." . $self->map_section($doctype, $_);} @children;
198 $contains .= ":" if $contains ne "";
199 $contains .= join ":", @children;
200 $self->write_to_gdbm ($handle, $mapped_section, $title, $creator, $source,
201 $date, $jobnumber, $contains, $self->{'num_sections'},
202 $parent, $classification, $filename);
203
204 if ($doc_obj->get_text_length($section) > 0) {
205 my $intromapsection = "B.$self->{'num_docs'}." .
206 $self->map_section($doctype, "$section.0");
207 $self->write_to_gdbm ($handle, $intromapsection, "<i>(introductory text)</i>", $creator,
208 $source, $date, $jobnumber, undef, $self->{'num_sections'},
209 $mapped_section, $classification, $filename);
210 }
211 }
212 }
213 $section = $doc_obj->get_next_section($section);
214 }
215
216 # update classification list with those books that
217 # were processed before classification list
218 if ($doctype eq "classification" && defined $temp_classifications) {
219 foreach $key (keys(%$temp_classifications)) {
220 if (!defined $saved_classifications->{$key}) {
221 print STDERR "mgbuildproc:$temp_classifications->{$key} belong to " .
222 "undefined classification $key\n";
223 }
224 $saved_classifications->{$key}->{'contains'} = $temp_classifications->{$key}->{'contains'};
225 }
226 $temp_classifications = undef;
227 }
228}
229
230
231sub write_to_gdbm {
232 my $self = shift (@_);
233 my ($handle, $section, $title, $creator, $source, $date,
234 $jobnumber, $contains, $docnum, $parent, $classification, $OID) = @_;
235
236 print $handle "[$section]\n";
237 print $handle "<t>$title\n" if (defined $title && $title ne "");
238 print $handle "<a>$creator\n" if (defined $creator && $creator ne "");
239 print $handle "<s>$source\n" if (defined $source && $source ne "");
240 print $handle "<i>$date\n" if (defined $date && $date ne "");
241 print $handle "<j>$jobnumber\n" if (defined $jobnumber && $jobnumber ne "");
242 print $handle "<c>$contains\n" if (defined $contains && $contains ne "");
243 print $handle "<d>$docnum\n" if (defined $docnum && $docnum ne "");
244 print $handle "<p>$parent\n" if (defined $parent && $parent ne "");
245 print $handle "<x>$classification\n" if (defined $classification && $classification ne "");
246 print $handle "<o>$OID\n" if defined $OID;
247 print $handle '-' x 70, "\n";
248
249 if (defined $docnum) {
250 print $handle "[$docnum]\n";
251 print $handle "<x>$section\n";
252 print $handle '-' x 70, "\n";
253 }
254}
255
256sub get_classifications {
257 my $self = shift (@_);
258 my ($doc_obj, $section, $mapped_section) = @_;
259
260 my ($classificationsref);
261 if (defined $saved_classifications) {
262 # classification list has been processed
263 $classificationsref = $saved_classifications;
264 } else {
265 # classification list has yet to be processed, save
266 # books in temp_classifications until list is processed
267 # and they can be moved into saved_classifications
268 $temp_classifications = {} unless defined $temp_classifications;
269 $classificationsref = $temp_classifications;
270 }
271
272 my $classifications = $doc_obj->get_metadata($section, "Subject");
273
274 # need to save which books belong in each classification
275 # to output later
276 foreach $classification (@$classifications) {
277
278 if (!defined $classificationsref->{$classification}) {
279 $classificationsref->{$classification}->{'parent'} = $doc_obj->get_parent_section($section);
280 $classificationsref->{$classification}->{'contains'} = "";
281 }
282 $classificationsref->{$classification}->{'contains'} .= ":" unless
283 $classificationsref->{$classification}->{'contains'} eq "";
284 $classificationsref->{$classification}->{'contains'} .= $mapped_section;
285 }
286
287 return (join ":", @$classifications);
288}
289
290sub map_section {
291 my $self = shift (@_);
292 my ($doctype, $section) = @_;
293
294 # classifications should never need to be mapped
295 return $section if $doctype eq "classification";
296
297 return "" unless (defined $section) && ($section ne "");
298
299 $sectionmap = {} unless defined $sectionmap;
300
301
302
303 # get the section into a standard format
304 $section =~ s/^\.+|\.+$//g;
305
306 # return the mapped section if it has been seen before
307 if (defined $sectionmap->{$section}) {
308 return $sectionmap->{$section};
309 }
310
311 # find out the parent section
312 my ($parentsection, $num);
313 if ($section =~ /^(.+)\.(\d+)$/) {
314 $parentsection = $1;
315 $num = $2;
316 } elsif ($section =~ /^(\d+)$/) {
317 $parentsection = "";
318 $num = $1;
319 } else {
320 print STDERR "mgbuildproc:map_section - misformed section $section\n";
321 }
322 $parentsection = "" unless defined $parentsection;
323
324 if ($parentsection eq "") {
325 $num ++ if $num == 1 and defined $sectionmap->{'0'};
326 } else {
327 $num ++ if $num == 1 and defined $sectionmap->{"$parentsection.0"};
328 }
329
330 # find out the mapped parent section
331 my $mappedparentsection = $self->map_section ($doctype, $parentsection);
332
333 # find the next unused child section
334 my $previousnum = $num - 1;
335 my $previoussection = "";
336 if ($parentsection eq "") {
337 $previoussection = $previousnum;
338 $previoussection = 0 if ($section == 1) && (defined $sectionmap->{'0'});
339 } else{
340 $previoussection = "$parentsection.$previousnum";
341 $previoussection = "$parentsection.0" if ($section =~ /\.1$/) && (defined $sectionmap->{"$parentsection.0"});
342 }
343 while (($previousnum > 0) && (!defined $sectionmap->{$previoussection})) {
344 $previousnum--;
345 $previoussection = "$parentsection.$previousnum";
346 $previoussection = $previousnum if $parentsection eq "";
347 }
348
349 # there has been no children under this parent, this section will be number 1
350 if ($previousnum <= 0) {
351 if ($mappedparentsection eq "") {
352 $sectionmap->{$section} = "1";
353 } else {
354 $sectionmap->{$section} = "$mappedparentsection.1";
355 }
356
357 } else {
358 # get the previous mapped number
359 my $previousmapnum = 0;
360 if ($sectionmap->{$previoussection} =~ /(^|\.)?(\d+)$/) {
361 $previousmapnum = $2;
362 }
363
364 # increment it to get this mapped child
365 my $mappednum = $previousmapnum+1;
366
367 if ($mappedparentsection eq "") {
368 $sectionmap->{$section} = $mappednum;
369 } else {
370 $sectionmap->{$section} = "$mappedparentsection.$mappednum";
371 }
372 }
373
374 return $sectionmap->{$section};
375}
376
377sub text {
378 my $self = shift (@_);
379 my ($doc_obj) = @_;
380 my $handle = $self->{'output_handle'};
381 my $indexed_doc = 1;
382
383 # only output this document if it is one to be indexed
384 return if ($doc_obj->get_doc_type() ne "indexed_doc");
385
386 # see if this document belongs to this subcollection
387 foreach $indexexp (@{$self->{'indexexparr'}}) {
388 $indexed_doc = 0;
389 my ($field, $exp, $options) = split /\//, $indexexp;
390 if (defined ($field) && defined ($exp)) {
391 my ($bool) = $field =~ /^(.)/;
392 $field =~ s/^.// if $bool eq '!';
393 if ($field eq "filename") {
394 $field = $doc_obj->get_source_filename();
395 } else {
396 $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
397 }
398 next unless defined $field;
399 if ($bool eq '!') {
400 if ($options =~ /^i$/i) {
401 if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
402 } else {
403 if ($field !~ /$exp/) {$indexed_doc = 1; last;}
404 }
405 } else {
406 if ($options =~ /^i$/i) {
407 if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
408 } else {
409 if ($field =~ /$exp/) {$indexed_doc = 1; last;}
410 }
411 }
412 }
413 }
414
415 # this is another document
416 $self->{'num_docs'} += 1;
417
418 # get the parameters for the output
419 my ($level, $fields) = split (/:/, $self->{'index'});
420 $fields =~ s/\ball\b/Title,Creator,text/;
421 $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
422
423 my $doc_section = 0; # just for this document
424 my $text = "";
425 my $text_extra = "";
426
427 # get the text for this document
428 my $section = $doc_obj->get_top_section();
429 while (defined $section) {
430 # update a few statistics
431 $doc_section++;
432 $self->{'num_sections'} += 1;
433
434 if ($indexed_doc) {
435 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
436 foreach $field (split (/,/, $fields)) {
437 # only deal with this field if it doesn't start with top or
438 # this is the first section
439 my $real_field = $field;
440 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
441 my $new_text = "";
442 if ($real_field eq "text") {
443 $new_text = $doc_obj->get_text ($section);
444 $new_text =~ s/[\cB\cC]//g;
445 $new_text =~ s/(<p\b)/\cC$1/gi;
446
447 } else {
448 $new_text = join ("\cC", @{$doc_obj->get_metadata ($section, $real_field)});
449 }
450
451 $text .= "$new_text\cC";
452 }
453 }
454 }
455
456 if ($level eq "document") { $text_extra .= "\cB"; }
457 else { $text .= "\cB"; }
458
459 $section = $doc_obj->get_next_section($section);
460 }
461
462 print $handle "$text$text_extra";
463}
464
465# converts leading number in classification back
466# to letter it represents
467# i.e 67.2.4 becomes C.2.4
468sub char_classification {
469 my $self = shift (@_);
470 my ($classification) = @_;
471
472 return $classification if $classification eq "";
473 my ($c) = $classification =~ /^\.?(\d+)/;
474 $c = chr($c);
475 $classification =~ s/^\d+/$c/;
476
477 return $classification;
478}
479
480# converts leading letter of a classification into its ascii equivalent
481# i.e C.2.4 becomes 67.2.4
482sub int_classification {
483 my $self = shift (@_);
484 my ($classification) = @_;
485 my $c = ord($classification);
486 $classification =~ s/^./$c/;
487
488 return $classification;
489}
4901;
491
Note: See TracBrowser for help on using the repository browser.