1 | # This document processor saves a document in the
|
---|
2 | # archives directory of a collection
|
---|
3 |
|
---|
4 |
|
---|
5 | package mgbuildproc;
|
---|
6 |
|
---|
7 | use docproc;
|
---|
8 | use util;
|
---|
9 |
|
---|
10 |
|
---|
11 | BEGIN {
|
---|
12 | @ISA = ('docproc');
|
---|
13 | }
|
---|
14 |
|
---|
15 |
|
---|
16 | sub new {
|
---|
17 | my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;
|
---|
18 | my $self = new docproc ();
|
---|
19 |
|
---|
20 | $self->{'collection'} = $collection;
|
---|
21 | $self->{'source_dir'} = $source_dir;
|
---|
22 | $self->{'build_dir'} = $build_dir;
|
---|
23 | $self->{'verbosity'} = $verbosity;
|
---|
24 | $self->{'mode'} = "text";
|
---|
25 | $self->{'index'} = "section:text";
|
---|
26 | $self->{'output_handle'} = "STDOUT";
|
---|
27 | $self->{'num_docs'} = 0;
|
---|
28 | $self->{'num_sections'} = 0;
|
---|
29 | $self->{'num_bytes'} = 0;
|
---|
30 |
|
---|
31 | return bless $self, $class;
|
---|
32 | }
|
---|
33 |
|
---|
34 | sub reset {
|
---|
35 | my $self = shift (@_);
|
---|
36 |
|
---|
37 | $self->{'num_docs'} = 0;
|
---|
38 | $self->{'num_sections'} = 0;
|
---|
39 | $self->{'num_bytes'} = 0;
|
---|
40 | }
|
---|
41 |
|
---|
42 | sub get_num_docs {
|
---|
43 | my $self = shift (@_);
|
---|
44 |
|
---|
45 | return $self->{'num_docs'};
|
---|
46 | }
|
---|
47 |
|
---|
48 | sub get_num_sections {
|
---|
49 | my $self = shift (@_);
|
---|
50 |
|
---|
51 | return $self->{'num_sections'};
|
---|
52 | }
|
---|
53 |
|
---|
54 | sub get_num_bytes {
|
---|
55 | my $self = shift (@_);
|
---|
56 |
|
---|
57 | return $self->{'num_bytes'};
|
---|
58 | }
|
---|
59 |
|
---|
60 | sub set_output_handle {
|
---|
61 | my $self = shift (@_);
|
---|
62 | my ($handle) = @_;
|
---|
63 |
|
---|
64 | $self->{'output_handle'} = $handle;
|
---|
65 | }
|
---|
66 |
|
---|
67 | sub set_mode {
|
---|
68 | my $self = shift (@_);
|
---|
69 | my ($mode) = @_;
|
---|
70 |
|
---|
71 | $self->{'mode'} = $mode;
|
---|
72 | }
|
---|
73 |
|
---|
74 | sub set_index {
|
---|
75 | my $self = shift (@_);
|
---|
76 | my ($index) = @_;
|
---|
77 |
|
---|
78 | $self->{'index'} = $index;
|
---|
79 | }
|
---|
80 |
|
---|
81 | sub process {
|
---|
82 | my $self = shift (@_);
|
---|
83 | my ($doc_obj) = @_;
|
---|
84 | my $method = $self->{'mode'};
|
---|
85 |
|
---|
86 | $self->$method($doc_obj);
|
---|
87 | }
|
---|
88 |
|
---|
89 | sub infodb {
|
---|
90 | my $self = shift (@_);
|
---|
91 | my ($doc_obj) = @_;
|
---|
92 | my $handle = $self->{'output_handle'};
|
---|
93 |
|
---|
94 | if ($doc_obj eq 'classifications') {
|
---|
95 | # output classifications if all books have been processed
|
---|
96 | foreach $key (keys %$saved_classifications) {
|
---|
97 | $saved_classifications->{$key}->{'contains'} = undef if
|
---|
98 | $saved_classifications->{$key}->{'contains'} eq "";
|
---|
99 | $saved_classifications->{$key}->{'parent'} = undef if
|
---|
100 | (!defined $saved_classifications->{$key}->{'parent'}) ||
|
---|
101 | ($saved_classifications->{$key}->{'parent'} eq "");
|
---|
102 | $self->write_to_gdbm ($handle, $key, $saved_classifications->{$key}->{'title'},
|
---|
103 | undef, undef,
|
---|
104 | $saved_classifications->{$key}->{'contains'}, undef,
|
---|
105 | $saved_classifications->{$key}->{'parent'}, undef);
|
---|
106 | }
|
---|
107 | return;
|
---|
108 | }
|
---|
109 |
|
---|
110 | my $doctype = $doc_obj->get_doc_type();
|
---|
111 |
|
---|
112 | # only output this document if it is one to be indexed
|
---|
113 | return if ($doctype ne "indexed_doc" &&
|
---|
114 | $doctype ne "classification");
|
---|
115 |
|
---|
116 | # found classification document
|
---|
117 | $saved_classifications = {} if ($doctype eq "classification");
|
---|
118 | $sectionmap = {};
|
---|
119 |
|
---|
120 | # this is another document
|
---|
121 | $self->{'num_docs'} += 1 unless ($doctype eq "classification");
|
---|
122 |
|
---|
123 | my $section = $doc_obj->get_top_section();
|
---|
124 | while (defined $section) {
|
---|
125 | # update a few statistics
|
---|
126 | $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
|
---|
127 | $self->{'num_sections'} += 1 unless ($doctype eq "classification");
|
---|
128 |
|
---|
129 | my $title = $doc_obj->get_metadata_element($section, "Title");
|
---|
130 | my $jobnumber = $doc_obj->get_source_filename();
|
---|
131 |
|
---|
132 | my $mapped_section = $self->map_section($doctype, $section);
|
---|
133 | if ($doctype eq "classification") {
|
---|
134 | $mapped_section = $self->char_classification($mapped_section);
|
---|
135 | } else {
|
---|
136 | $mapped_section = "B.$self->{'num_docs'}.$mapped_section";
|
---|
137 | }
|
---|
138 | $mapped_section =~ s/\.+$//;
|
---|
139 |
|
---|
140 | my ($parent, $classification, $creator, $OID);
|
---|
141 |
|
---|
142 | if ($section ne $doc_obj->get_top_section()) {
|
---|
143 | $parent = $self->map_section($doctype, $doc_obj->get_parent_section($section));
|
---|
144 | if ($doctype eq "classification") {
|
---|
145 | $parent = $self->char_classification($parent);
|
---|
146 | } else {
|
---|
147 | $parent = "B.$self->{'num_docs'}.$parent";
|
---|
148 | }
|
---|
149 | $parent =~ s/\.+$//;
|
---|
150 | } else {
|
---|
151 | $classification = $self->get_classifications($doc_obj, $section, $mapped_section)
|
---|
152 | unless $doctype eq "classification";
|
---|
153 | $creator = $doc_obj->get_metadata_element($section, "Creator");
|
---|
154 | $OID = $doc_obj->get_OID();
|
---|
155 | # this is a hack at getting OID to look like the directory path - I'm sure there's
|
---|
156 | # a better way to do it but it's late... Stefan
|
---|
157 | my @OIDchars = split //, $OID;
|
---|
158 | $OID = "";
|
---|
159 | my $count = 0;
|
---|
160 | foreach $i (@OIDchars) {
|
---|
161 | if ($count == 7) {
|
---|
162 | $OID .= "$i/";
|
---|
163 | $count = 0;
|
---|
164 | } else {
|
---|
165 | $OID .= $i;
|
---|
166 | $count ++;
|
---|
167 | }
|
---|
168 | }
|
---|
169 | $OID =~ s/\/$//;
|
---|
170 | $OID .= ".dir";
|
---|
171 | }
|
---|
172 |
|
---|
173 | if ($doc_obj->is_leaf_section($section)) {
|
---|
174 | if ($doctype eq "classification") {
|
---|
175 | if (defined $saved_classifications->{$mapped_section}) {
|
---|
176 | print STDERR "mgbuildproc:warning: classification $mapped_section " .
|
---|
177 | "declared multiple times\n";
|
---|
178 | } else {
|
---|
179 | $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => "",
|
---|
180 | 'parent' => $parent};
|
---|
181 | }
|
---|
182 | } else {
|
---|
183 | $self->write_to_gdbm($handle, $mapped_section, $title, $creator, $jobnumber, undef,
|
---|
184 | $self->{'num_sections'}, $parent, $classification, $OID);
|
---|
185 | }
|
---|
186 | } else {
|
---|
187 |
|
---|
188 | # add the introductory section if it exists
|
---|
189 | my $contains = "";
|
---|
190 | if ($doc_obj->get_text_length($section) > 0) {
|
---|
191 | $contains .= "B.$self->{'num_docs'}." . $self->map_section ($doctype, "$section.0");
|
---|
192 | }
|
---|
193 |
|
---|
194 | # add the rest of the children
|
---|
195 | my @children = @{$doc_obj->get_children($section)};
|
---|
196 | if ($doctype eq "classification") {
|
---|
197 | map {$_ = $self->char_classification($_);} @children;
|
---|
198 | if (defined $saved_classifications->{$mapped_section}) {
|
---|
199 | print STDERR "mgbuildproc:warning: classification $mapped_section " .
|
---|
200 | "declared multiple times\n";
|
---|
201 | } else {
|
---|
202 | $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => ""};
|
---|
203 | $saved_classifications->{$mapped_section}->{'contains'} = join ":", @children;
|
---|
204 | $saved_classifications->{$mapped_section}->{'parent'} = $parent;
|
---|
205 | }
|
---|
206 | } else {
|
---|
207 | map {$_ = "B.$self->{'num_docs'}." . $self->map_section($doctype, $_);} @children;
|
---|
208 | $contains .= ":" if $contains ne "";
|
---|
209 | $contains .= join ":", @children;
|
---|
210 | $self->write_to_gdbm ($handle, $mapped_section, $title, $creator,
|
---|
211 | $jobnumber, $contains, $self->{'num_sections'},
|
---|
212 | $parent, $classification, $OID);
|
---|
213 |
|
---|
214 | if ($doc_obj->get_text_length($section) > 0) {
|
---|
215 | my $intromapsection = "B.$self->{'num_docs'}." .
|
---|
216 | $self->map_section($doctype, "$section.0");
|
---|
217 | $self->write_to_gdbm ($handle, $intromapsection, "<i>(introductory text)</i>", $creator,
|
---|
218 | $jobnumber, undef, $self->{'num_sections'},
|
---|
219 | $mapped_section, undef, $OID);
|
---|
220 | }
|
---|
221 | }
|
---|
222 | }
|
---|
223 | $section = $doc_obj->get_next_section($section);
|
---|
224 | }
|
---|
225 |
|
---|
226 | # update classification list with those books that
|
---|
227 | # were processed before classification list
|
---|
228 | if ($doctype eq "classification" && defined $temp_classifications) {
|
---|
229 | foreach $key (keys(%$temp_classifications)) {
|
---|
230 | if (!defined $saved_classifications->{$key}) {
|
---|
231 | print STDERR "mgbuildproc:$temp_classifications->{$key} belong to " .
|
---|
232 | "undefined classification $key\n";
|
---|
233 | }
|
---|
234 | $saved_classifications->{$key}->{'contains'} = $temp_classifications->{$key}->{'contains'};
|
---|
235 | }
|
---|
236 | $temp_classifications = undef;
|
---|
237 | }
|
---|
238 | }
|
---|
239 |
|
---|
240 |
|
---|
241 | sub write_to_gdbm {
|
---|
242 | my $self = shift (@_);
|
---|
243 | my ($handle, $section, $title, $creator, $jobnumber, $contains,
|
---|
244 | $docnum, $parent, $classification, $OID) = @_;
|
---|
245 |
|
---|
246 | print $handle "[$section]\n";
|
---|
247 | print $handle "<t>$title\n" if defined $title;
|
---|
248 | print $handle "<a>$creator\n" if defined $creator;
|
---|
249 | print $handle "<j>$jobnumber\n" if defined $jobnumber;
|
---|
250 | print $handle "<c>$contains\n" if defined $contains;
|
---|
251 | print $handle "<d>$docnum\n" if defined $docnum;
|
---|
252 | print $handle "<p>$parent\n" if defined $parent;
|
---|
253 | print $handle "<x>$classification\n" if defined $classification;
|
---|
254 | print $handle "<o>$OID\n" if defined $OID;
|
---|
255 | print $handle '-' x 70, "\n";
|
---|
256 |
|
---|
257 | if (defined $docnum) {
|
---|
258 | print $handle "[$docnum]\n";
|
---|
259 | print $handle "<x>$section\n";
|
---|
260 | print $handle '-' x 70, "\n";
|
---|
261 | }
|
---|
262 | }
|
---|
263 |
|
---|
264 | sub get_classifications {
|
---|
265 | my $self = shift (@_);
|
---|
266 | my ($doc_obj, $section, $mapped_section) = @_;
|
---|
267 |
|
---|
268 | my ($classificationsref);
|
---|
269 | if (defined $saved_classifications) {
|
---|
270 | # classification list has been processed
|
---|
271 | $classificationsref = $saved_classifications;
|
---|
272 | } else {
|
---|
273 | # classification list has yet to be processed, save
|
---|
274 | # books in temp_classifications until list is processed
|
---|
275 | # and they can be moved into saved_classifications
|
---|
276 | $temp_classifications = {} unless defined $temp_classifications;
|
---|
277 | $classificationsref = $temp_classifications;
|
---|
278 | }
|
---|
279 |
|
---|
280 | my $classifications = $doc_obj->get_metadata($section, "Subject");
|
---|
281 |
|
---|
282 | # need to save which books belong in each classification
|
---|
283 | # to output later
|
---|
284 | foreach $classification (@$classifications) {
|
---|
285 |
|
---|
286 | if (!defined $classificationsref->{$classification}) {
|
---|
287 | $classificationsref->{$classification}->{'parent'} = $doc_obj->get_parent_section($section);
|
---|
288 | $classificationsref->{$classification}->{'contains'} = "";
|
---|
289 | }
|
---|
290 | $classificationsref->{$classification}->{'contains'} .= ":" unless
|
---|
291 | $classificationsref->{$classification}->{'contains'} eq "";
|
---|
292 | $classificationsref->{$classification}->{'contains'} .= $mapped_section;
|
---|
293 | }
|
---|
294 |
|
---|
295 | return (join ":", @$classifications);
|
---|
296 | }
|
---|
297 |
|
---|
298 | sub map_section {
|
---|
299 | my $self = shift (@_);
|
---|
300 | my ($doctype, $section) = @_;
|
---|
301 |
|
---|
302 | # classifications should never need to be mapped
|
---|
303 | return $section if $doctype eq "classification";
|
---|
304 |
|
---|
305 | return "" unless (defined $section) && ($section ne "");
|
---|
306 |
|
---|
307 | $sectionmap = {} unless defined $sectionmap;
|
---|
308 |
|
---|
309 |
|
---|
310 |
|
---|
311 | # get the section into a standard format
|
---|
312 | $section =~ s/^\.+|\.+$//g;
|
---|
313 |
|
---|
314 | # return the mapped section if it has been seen before
|
---|
315 | if (defined $sectionmap->{$section}) {
|
---|
316 | return $sectionmap->{$section};
|
---|
317 | }
|
---|
318 |
|
---|
319 | # find out the parent section
|
---|
320 | my ($parentsection, $num);
|
---|
321 | if ($section =~ /^(.+)\.(\d+)$/) {
|
---|
322 | $parentsection = $1;
|
---|
323 | $num = $2;
|
---|
324 | } elsif ($section =~ /^(\d+)$/) {
|
---|
325 | $parentsection = "";
|
---|
326 | $num = $1;
|
---|
327 | } else {
|
---|
328 | print STDERR "mgbuildproc:map_section - misformed section $section\n";
|
---|
329 | }
|
---|
330 | $parentsection = "" unless defined $parentsection;
|
---|
331 |
|
---|
332 | if ($parentsection eq "") {
|
---|
333 | $num ++ if $num == 1 and defined $sectionmap->{'0'};
|
---|
334 | } else {
|
---|
335 | $num ++ if $num == 1 and defined $sectionmap->{"$parentsection.0"};
|
---|
336 | }
|
---|
337 |
|
---|
338 | # find out the mapped parent section
|
---|
339 | my $mappedparentsection = $self->map_section ($doctype, $parentsection);
|
---|
340 |
|
---|
341 | # find the next unused child section
|
---|
342 | my $previousnum = $num - 1;
|
---|
343 | my $previoussection = "";
|
---|
344 | if ($parentsection eq "") {
|
---|
345 | $previoussection = $previousnum;
|
---|
346 | $previoussection = 0 if ($section == 1) && (defined $sectionmap->{'0'});
|
---|
347 | } else{
|
---|
348 | $previoussection = "$parentsection.$previousnum";
|
---|
349 | $previoussection = "$parentsection.0" if ($section =~ /\.1$/) && (defined $sectionmap->{"$parentsection.0"});
|
---|
350 | }
|
---|
351 | while (($previousnum > 0) && (!defined $sectionmap->{$previoussection})) {
|
---|
352 | $previousnum--;
|
---|
353 | $previoussection = "$parentsection.$previousnum";
|
---|
354 | $previoussection = $previousnum if $parentsection eq "";
|
---|
355 | }
|
---|
356 |
|
---|
357 | # there has been no children under this parent, this section will be number 1
|
---|
358 | if ($previousnum <= 0) {
|
---|
359 | if ($mappedparentsection eq "") {
|
---|
360 | $sectionmap->{$section} = "1";
|
---|
361 | } else {
|
---|
362 | $sectionmap->{$section} = "$mappedparentsection.1";
|
---|
363 | }
|
---|
364 |
|
---|
365 | } else {
|
---|
366 | # get the previous mapped number
|
---|
367 | my $previousmapnum = 0;
|
---|
368 | if ($sectionmap->{$previoussection} =~ /(^|\.)?(\d+)$/) {
|
---|
369 | $previousmapnum = $2;
|
---|
370 | }
|
---|
371 |
|
---|
372 | # increment it to get this mapped child
|
---|
373 | my $mappednum = $previousmapnum+1;
|
---|
374 |
|
---|
375 | if ($mappedparentsection eq "") {
|
---|
376 | $sectionmap->{$section} = $mappednum;
|
---|
377 | } else {
|
---|
378 | $sectionmap->{$section} = "$mappedparentsection.$mappednum";
|
---|
379 | }
|
---|
380 | }
|
---|
381 |
|
---|
382 | return $sectionmap->{$section};
|
---|
383 | }
|
---|
384 |
|
---|
385 | sub text {
|
---|
386 | my $self = shift (@_);
|
---|
387 | my ($doc_obj) = @_;
|
---|
388 | my $handle = $self->{'output_handle'};
|
---|
389 |
|
---|
390 | # only output this document if it is one to be indexed
|
---|
391 | return if ($doc_obj->get_doc_type() ne "indexed_doc");
|
---|
392 |
|
---|
393 | # this is another document
|
---|
394 | $self->{'num_docs'} += 1;
|
---|
395 |
|
---|
396 | # get the parameters for the output
|
---|
397 | my ($level, $fields) = split (/:/, $self->{'index'});
|
---|
398 | $fields =~ s/\ball\b/Title,Creator,text/;
|
---|
399 | $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
|
---|
400 |
|
---|
401 | my $doc_section = 0; # just for this document
|
---|
402 | my $text = "";
|
---|
403 | my $text_extra = "";
|
---|
404 |
|
---|
405 | # get the text for this document
|
---|
406 | my $section = $doc_obj->get_top_section();
|
---|
407 | while (defined $section) {
|
---|
408 | # update a few statistics
|
---|
409 | $doc_section++;
|
---|
410 | $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
|
---|
411 | $self->{'num_sections'} += 1;
|
---|
412 |
|
---|
413 | foreach $field (split (/,/, $fields)) {
|
---|
414 | # only deal with this field if it doesn't start with top or
|
---|
415 | # this is the first section
|
---|
416 | my $real_field = $field;
|
---|
417 | if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
|
---|
418 | my $new_text = "";
|
---|
419 | if ($real_field eq "text") {
|
---|
420 | $new_text = $doc_obj->get_text ($section);
|
---|
421 | $new_text =~ s/[\cB\cC]//g;
|
---|
422 | $new_text =~ s/(<p\b)/\cC$1/gi;
|
---|
423 |
|
---|
424 | } else {
|
---|
425 | $new_text = join ("\cC", @{$doc_obj->get_metadata ($section, $real_field)});
|
---|
426 | }
|
---|
427 |
|
---|
428 | $text .= "$new_text\cC";
|
---|
429 | }
|
---|
430 | }
|
---|
431 |
|
---|
432 | if ($level eq "document") { $text_extra .= "\cB"; }
|
---|
433 | else { $text .= "\cB"; }
|
---|
434 |
|
---|
435 | $section = $doc_obj->get_next_section($section);
|
---|
436 | }
|
---|
437 |
|
---|
438 | print $handle "$text$text_extra";
|
---|
439 | }
|
---|
440 |
|
---|
441 | # converts leading number in classification back
|
---|
442 | # to letter it represents
|
---|
443 | # i.e 67.2.4 becomes C.2.4
|
---|
444 | sub char_classification {
|
---|
445 | my $self = shift (@_);
|
---|
446 | my ($classification) = @_;
|
---|
447 |
|
---|
448 | return $classification if $classification eq "";
|
---|
449 | my ($c) = $classification =~ /^\.?(\d+)/;
|
---|
450 | $c = chr($c);
|
---|
451 | $classification =~ s/^\d+/$c/;
|
---|
452 |
|
---|
453 | return $classification;
|
---|
454 | }
|
---|
455 |
|
---|
456 | # converts leading letter of a classification into its ascii equivalent
|
---|
457 | # i.e C.2.4 becomes 67.2.4
|
---|
458 | sub int_classification {
|
---|
459 | my $self = shift (@_);
|
---|
460 | my ($classification) = @_;
|
---|
461 | my $c = ord($classification);
|
---|
462 | $classification =~ s/^./$c/;
|
---|
463 |
|
---|
464 | return $classification;
|
---|
465 | }
|
---|
466 | 1;
|
---|
467 |
|
---|