1 | # This document processor outputs a document
|
---|
2 | # for mg to process
|
---|
3 |
|
---|
4 |
|
---|
5 | package mgbuildproc;
|
---|
6 |
|
---|
7 | use docproc;
|
---|
8 | use util;
|
---|
9 |
|
---|
10 |
|
---|
11 | BEGIN {
|
---|
12 | @ISA = ('docproc');
|
---|
13 | }
|
---|
14 |
|
---|
15 |
|
---|
16 | sub new {
|
---|
17 | my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;
|
---|
18 | my $self = new docproc ();
|
---|
19 |
|
---|
20 | $self->{'collection'} = $collection;
|
---|
21 | $self->{'source_dir'} = $source_dir;
|
---|
22 | $self->{'build_dir'} = $build_dir;
|
---|
23 | $self->{'verbosity'} = $verbosity;
|
---|
24 | $self->{'mode'} = "text";
|
---|
25 | $self->{'index'} = "section:text";
|
---|
26 | $self->{'output_handle'} = "STDOUT";
|
---|
27 | $self->{'num_docs'} = 0;
|
---|
28 | $self->{'num_sections'} = 0;
|
---|
29 | $self->{'num_bytes'} = 0;
|
---|
30 |
|
---|
31 | return bless $self, $class;
|
---|
32 | }
|
---|
33 |
|
---|
34 | sub reset {
|
---|
35 | my $self = shift (@_);
|
---|
36 |
|
---|
37 | $self->{'num_docs'} = 0;
|
---|
38 | $self->{'num_sections'} = 0;
|
---|
39 | $self->{'num_bytes'} = 0;
|
---|
40 | }
|
---|
41 |
|
---|
42 | sub get_num_docs {
|
---|
43 | my $self = shift (@_);
|
---|
44 |
|
---|
45 | return $self->{'num_docs'};
|
---|
46 | }
|
---|
47 |
|
---|
48 | sub get_num_sections {
|
---|
49 | my $self = shift (@_);
|
---|
50 |
|
---|
51 | return $self->{'num_sections'};
|
---|
52 | }
|
---|
53 |
|
---|
54 | sub get_num_bytes {
|
---|
55 | my $self = shift (@_);
|
---|
56 |
|
---|
57 | return $self->{'num_bytes'};
|
---|
58 | }
|
---|
59 |
|
---|
60 | sub set_output_handle {
|
---|
61 | my $self = shift (@_);
|
---|
62 | my ($handle) = @_;
|
---|
63 |
|
---|
64 | $self->{'output_handle'} = $handle;
|
---|
65 | }
|
---|
66 |
|
---|
67 | sub set_mode {
|
---|
68 | my $self = shift (@_);
|
---|
69 | my ($mode) = @_;
|
---|
70 |
|
---|
71 | $self->{'mode'} = $mode;
|
---|
72 | }
|
---|
73 |
|
---|
74 | sub set_index {
|
---|
75 | my $self = shift (@_);
|
---|
76 | my ($index) = @_;
|
---|
77 |
|
---|
78 | $self->{'index'} = $index;
|
---|
79 | }
|
---|
80 |
|
---|
81 | sub process {
|
---|
82 | my $self = shift (@_);
|
---|
83 | my ($doc_obj) = @_;
|
---|
84 | my $method = $self->{'mode'};
|
---|
85 |
|
---|
86 | $self->$method($doc_obj);
|
---|
87 | }
|
---|
88 |
|
---|
89 | sub infodb {
|
---|
90 | my $self = shift (@_);
|
---|
91 | my ($doc_obj) = @_;
|
---|
92 | my $handle = $self->{'output_handle'};
|
---|
93 |
|
---|
94 | if ($doc_obj eq 'classifications') {
|
---|
95 | # output classifications if all books have been processed
|
---|
96 | foreach $key (keys %$saved_classifications) {
|
---|
97 | $saved_classifications->{$key}->{'contains'} = undef if
|
---|
98 | $saved_classifications->{$key}->{'contains'} eq "";
|
---|
99 | $saved_classifications->{$key}->{'parent'} = undef if
|
---|
100 | (!defined $saved_classifications->{$key}->{'parent'}) ||
|
---|
101 | ($saved_classifications->{$key}->{'parent'} eq "");
|
---|
102 | $self->write_to_gdbm ($handle, $key, $saved_classifications->{$key}->{'title'},
|
---|
103 | undef, undef, undef, undef,
|
---|
104 | $saved_classifications->{$key}->{'contains'}, undef,
|
---|
105 | $saved_classifications->{$key}->{'parent'}, undef, undef);
|
---|
106 | }
|
---|
107 | return;
|
---|
108 | }
|
---|
109 |
|
---|
110 | my $doctype = $doc_obj->get_doc_type();
|
---|
111 |
|
---|
112 | # only output this document if it is one to be indexed
|
---|
113 | return if ($doctype ne "indexed_doc" &&
|
---|
114 | $doctype ne "classification");
|
---|
115 |
|
---|
116 | # found classification document
|
---|
117 | $saved_classifications = {} if ($doctype eq "classification");
|
---|
118 | $sectionmap = {};
|
---|
119 |
|
---|
120 | # this is another document
|
---|
121 | $self->{'num_docs'} += 1 unless ($doctype eq "classification");
|
---|
122 |
|
---|
123 | my $section = $doc_obj->get_top_section();
|
---|
124 | while (defined $section) {
|
---|
125 | # update a few statistics
|
---|
126 | $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
|
---|
127 | $self->{'num_sections'} += 1 unless ($doctype eq "classification");
|
---|
128 |
|
---|
129 | my $title = $doc_obj->get_metadata_element($section, "Title");
|
---|
130 | my $source = $doc_obj->get_metadata_element($section, "Source");
|
---|
131 | my $date = $doc_obj->get_metadata_element($section, "Date");
|
---|
132 | my $jobnumber = $doc_obj->get_source_filename();
|
---|
133 |
|
---|
134 | my $mapped_section = $self->map_section($doctype, $section);
|
---|
135 | if ($doctype eq "classification") {
|
---|
136 | $mapped_section = $self->char_classification($mapped_section);
|
---|
137 | } else {
|
---|
138 | $mapped_section = "B.$self->{'num_docs'}.$mapped_section";
|
---|
139 | }
|
---|
140 | $mapped_section =~ s/\.+$//;
|
---|
141 |
|
---|
142 | my ($parent, $classification, $creator, $OID);
|
---|
143 |
|
---|
144 | if ($section ne $doc_obj->get_top_section()) {
|
---|
145 | $parent = $self->map_section($doctype, $doc_obj->get_parent_section($section));
|
---|
146 | if ($doctype eq "classification") {
|
---|
147 | $parent = $self->char_classification($parent);
|
---|
148 | } else {
|
---|
149 | $parent = "B.$self->{'num_docs'}.$parent";
|
---|
150 | }
|
---|
151 | $parent =~ s/\.+$//;
|
---|
152 | } else {
|
---|
153 | $classification = $self->get_classifications($doc_obj, $section, $mapped_section)
|
---|
154 | unless $doctype eq "classification";
|
---|
155 | $creator = $doc_obj->get_metadata_element($section, "Creator");
|
---|
156 | $OID = $doc_obj->get_OID();
|
---|
157 | # this is a hack at getting OID to look like the directory path - I'm sure there's
|
---|
158 | # a better way to do it but it's late... Stefan
|
---|
159 | my @OIDchars = split //, $OID;
|
---|
160 | $OID = "";
|
---|
161 | my $count = 0;
|
---|
162 | foreach $i (@OIDchars) {
|
---|
163 | if ($count == 7) {
|
---|
164 | $OID .= "$i/";
|
---|
165 | $count = 0;
|
---|
166 | } else {
|
---|
167 | $OID .= $i;
|
---|
168 | $count ++;
|
---|
169 | }
|
---|
170 | }
|
---|
171 | $OID =~ s/\/$//;
|
---|
172 | $OID .= ".dir";
|
---|
173 | }
|
---|
174 |
|
---|
175 | if ($doc_obj->is_leaf_section($section)) {
|
---|
176 | if ($doctype eq "classification") {
|
---|
177 | if (defined $saved_classifications->{$mapped_section}) {
|
---|
178 | print STDERR "mgbuildproc:warning: classification $mapped_section " .
|
---|
179 | "declared multiple times\n";
|
---|
180 | } else {
|
---|
181 | $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => "",
|
---|
182 | 'parent' => $parent};
|
---|
183 | }
|
---|
184 | } else {
|
---|
185 | $self->write_to_gdbm($handle, $mapped_section, $title, $creator, $source, $date, $jobnumber, undef,
|
---|
186 | $self->{'num_sections'}, $parent, $classification, $OID);
|
---|
187 | }
|
---|
188 | } else {
|
---|
189 |
|
---|
190 | # add the introductory section if it exists
|
---|
191 | my $contains = "";
|
---|
192 | if ($doc_obj->get_text_length($section) > 0) {
|
---|
193 | $contains .= "B.$self->{'num_docs'}." . $self->map_section ($doctype, "$section.0");
|
---|
194 | }
|
---|
195 |
|
---|
196 | # add the rest of the children
|
---|
197 | my @children = @{$doc_obj->get_children($section)};
|
---|
198 | if ($doctype eq "classification") {
|
---|
199 | map {$_ = $self->char_classification($_);} @children;
|
---|
200 | if (defined $saved_classifications->{$mapped_section}) {
|
---|
201 | print STDERR "mgbuildproc:warning: classification $mapped_section " .
|
---|
202 | "declared multiple times\n";
|
---|
203 | } else {
|
---|
204 | $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => ""};
|
---|
205 | $saved_classifications->{$mapped_section}->{'contains'} = join ":", @children;
|
---|
206 | $saved_classifications->{$mapped_section}->{'parent'} = $parent;
|
---|
207 | }
|
---|
208 | } else {
|
---|
209 | map {$_ = "B.$self->{'num_docs'}." . $self->map_section($doctype, $_);} @children;
|
---|
210 | $contains .= ":" if $contains ne "";
|
---|
211 | $contains .= join ":", @children;
|
---|
212 | $self->write_to_gdbm ($handle, $mapped_section, $title, $creator, $source,
|
---|
213 | $date, $jobnumber, $contains, $self->{'num_sections'},
|
---|
214 | $parent, $classification, $OID);
|
---|
215 |
|
---|
216 | if ($doc_obj->get_text_length($section) > 0) {
|
---|
217 | my $intromapsection = "B.$self->{'num_docs'}." .
|
---|
218 | $self->map_section($doctype, "$section.0");
|
---|
219 | $self->write_to_gdbm ($handle, $intromapsection, "<i>(introductory text)</i>", $creator,
|
---|
220 | $source, $date, $jobnumber, undef, $self->{'num_sections'},
|
---|
221 | $mapped_section, undef, $OID);
|
---|
222 | }
|
---|
223 | }
|
---|
224 | }
|
---|
225 | $section = $doc_obj->get_next_section($section);
|
---|
226 | }
|
---|
227 |
|
---|
228 | # update classification list with those books that
|
---|
229 | # were processed before classification list
|
---|
230 | if ($doctype eq "classification" && defined $temp_classifications) {
|
---|
231 | foreach $key (keys(%$temp_classifications)) {
|
---|
232 | if (!defined $saved_classifications->{$key}) {
|
---|
233 | print STDERR "mgbuildproc:$temp_classifications->{$key} belong to " .
|
---|
234 | "undefined classification $key\n";
|
---|
235 | }
|
---|
236 | $saved_classifications->{$key}->{'contains'} = $temp_classifications->{$key}->{'contains'};
|
---|
237 | }
|
---|
238 | $temp_classifications = undef;
|
---|
239 | }
|
---|
240 | }
|
---|
241 |
|
---|
242 |
|
---|
243 | sub write_to_gdbm {
|
---|
244 | my $self = shift (@_);
|
---|
245 | my ($handle, $section, $title, $creator, $source, $date,
|
---|
246 | $jobnumber, $contains, $docnum, $parent, $classification, $OID) = @_;
|
---|
247 |
|
---|
248 | print $handle "[$section]\n";
|
---|
249 | print $handle "<t>$title\n" if defined $title;
|
---|
250 | print $handle "<a>$creator\n" if defined $creator;
|
---|
251 | print $handle "<s>$source\n" if defined $source;
|
---|
252 | print $handle "<i>$date\n" if defined $date;
|
---|
253 | print $handle "<j>$jobnumber\n" if defined $jobnumber;
|
---|
254 | print $handle "<c>$contains\n" if defined $contains;
|
---|
255 | print $handle "<d>$docnum\n" if defined $docnum;
|
---|
256 | print $handle "<p>$parent\n" if defined $parent;
|
---|
257 | print $handle "<x>$classification\n" if defined $classification;
|
---|
258 | print $handle "<o>$OID\n" if defined $OID;
|
---|
259 | print $handle '-' x 70, "\n";
|
---|
260 |
|
---|
261 | if (defined $docnum) {
|
---|
262 | print $handle "[$docnum]\n";
|
---|
263 | print $handle "<x>$section\n";
|
---|
264 | print $handle '-' x 70, "\n";
|
---|
265 | }
|
---|
266 | }
|
---|
267 |
|
---|
268 | sub get_classifications {
|
---|
269 | my $self = shift (@_);
|
---|
270 | my ($doc_obj, $section, $mapped_section) = @_;
|
---|
271 |
|
---|
272 | my ($classificationsref);
|
---|
273 | if (defined $saved_classifications) {
|
---|
274 | # classification list has been processed
|
---|
275 | $classificationsref = $saved_classifications;
|
---|
276 | } else {
|
---|
277 | # classification list has yet to be processed, save
|
---|
278 | # books in temp_classifications until list is processed
|
---|
279 | # and they can be moved into saved_classifications
|
---|
280 | $temp_classifications = {} unless defined $temp_classifications;
|
---|
281 | $classificationsref = $temp_classifications;
|
---|
282 | }
|
---|
283 |
|
---|
284 | my $classifications = $doc_obj->get_metadata($section, "Subject");
|
---|
285 |
|
---|
286 | # need to save which books belong in each classification
|
---|
287 | # to output later
|
---|
288 | foreach $classification (@$classifications) {
|
---|
289 |
|
---|
290 | if (!defined $classificationsref->{$classification}) {
|
---|
291 | $classificationsref->{$classification}->{'parent'} = $doc_obj->get_parent_section($section);
|
---|
292 | $classificationsref->{$classification}->{'contains'} = "";
|
---|
293 | }
|
---|
294 | $classificationsref->{$classification}->{'contains'} .= ":" unless
|
---|
295 | $classificationsref->{$classification}->{'contains'} eq "";
|
---|
296 | $classificationsref->{$classification}->{'contains'} .= $mapped_section;
|
---|
297 | }
|
---|
298 |
|
---|
299 | return (join ":", @$classifications);
|
---|
300 | }
|
---|
301 |
|
---|
302 | sub map_section {
|
---|
303 | my $self = shift (@_);
|
---|
304 | my ($doctype, $section) = @_;
|
---|
305 |
|
---|
306 | # classifications should never need to be mapped
|
---|
307 | return $section if $doctype eq "classification";
|
---|
308 |
|
---|
309 | return "" unless (defined $section) && ($section ne "");
|
---|
310 |
|
---|
311 | $sectionmap = {} unless defined $sectionmap;
|
---|
312 |
|
---|
313 |
|
---|
314 |
|
---|
315 | # get the section into a standard format
|
---|
316 | $section =~ s/^\.+|\.+$//g;
|
---|
317 |
|
---|
318 | # return the mapped section if it has been seen before
|
---|
319 | if (defined $sectionmap->{$section}) {
|
---|
320 | return $sectionmap->{$section};
|
---|
321 | }
|
---|
322 |
|
---|
323 | # find out the parent section
|
---|
324 | my ($parentsection, $num);
|
---|
325 | if ($section =~ /^(.+)\.(\d+)$/) {
|
---|
326 | $parentsection = $1;
|
---|
327 | $num = $2;
|
---|
328 | } elsif ($section =~ /^(\d+)$/) {
|
---|
329 | $parentsection = "";
|
---|
330 | $num = $1;
|
---|
331 | } else {
|
---|
332 | print STDERR "mgbuildproc:map_section - misformed section $section\n";
|
---|
333 | }
|
---|
334 | $parentsection = "" unless defined $parentsection;
|
---|
335 |
|
---|
336 | if ($parentsection eq "") {
|
---|
337 | $num ++ if $num == 1 and defined $sectionmap->{'0'};
|
---|
338 | } else {
|
---|
339 | $num ++ if $num == 1 and defined $sectionmap->{"$parentsection.0"};
|
---|
340 | }
|
---|
341 |
|
---|
342 | # find out the mapped parent section
|
---|
343 | my $mappedparentsection = $self->map_section ($doctype, $parentsection);
|
---|
344 |
|
---|
345 | # find the next unused child section
|
---|
346 | my $previousnum = $num - 1;
|
---|
347 | my $previoussection = "";
|
---|
348 | if ($parentsection eq "") {
|
---|
349 | $previoussection = $previousnum;
|
---|
350 | $previoussection = 0 if ($section == 1) && (defined $sectionmap->{'0'});
|
---|
351 | } else{
|
---|
352 | $previoussection = "$parentsection.$previousnum";
|
---|
353 | $previoussection = "$parentsection.0" if ($section =~ /\.1$/) && (defined $sectionmap->{"$parentsection.0"});
|
---|
354 | }
|
---|
355 | while (($previousnum > 0) && (!defined $sectionmap->{$previoussection})) {
|
---|
356 | $previousnum--;
|
---|
357 | $previoussection = "$parentsection.$previousnum";
|
---|
358 | $previoussection = $previousnum if $parentsection eq "";
|
---|
359 | }
|
---|
360 |
|
---|
361 | # there has been no children under this parent, this section will be number 1
|
---|
362 | if ($previousnum <= 0) {
|
---|
363 | if ($mappedparentsection eq "") {
|
---|
364 | $sectionmap->{$section} = "1";
|
---|
365 | } else {
|
---|
366 | $sectionmap->{$section} = "$mappedparentsection.1";
|
---|
367 | }
|
---|
368 |
|
---|
369 | } else {
|
---|
370 | # get the previous mapped number
|
---|
371 | my $previousmapnum = 0;
|
---|
372 | if ($sectionmap->{$previoussection} =~ /(^|\.)?(\d+)$/) {
|
---|
373 | $previousmapnum = $2;
|
---|
374 | }
|
---|
375 |
|
---|
376 | # increment it to get this mapped child
|
---|
377 | my $mappednum = $previousmapnum+1;
|
---|
378 |
|
---|
379 | if ($mappedparentsection eq "") {
|
---|
380 | $sectionmap->{$section} = $mappednum;
|
---|
381 | } else {
|
---|
382 | $sectionmap->{$section} = "$mappedparentsection.$mappednum";
|
---|
383 | }
|
---|
384 | }
|
---|
385 |
|
---|
386 | return $sectionmap->{$section};
|
---|
387 | }
|
---|
388 |
|
---|
389 | sub text {
|
---|
390 | my $self = shift (@_);
|
---|
391 | my ($doc_obj) = @_;
|
---|
392 | my $handle = $self->{'output_handle'};
|
---|
393 |
|
---|
394 | # only output this document if it is one to be indexed
|
---|
395 | return if ($doc_obj->get_doc_type() ne "indexed_doc");
|
---|
396 |
|
---|
397 | # this is another document
|
---|
398 | $self->{'num_docs'} += 1;
|
---|
399 |
|
---|
400 | # get the parameters for the output
|
---|
401 | my ($level, $fields) = split (/:/, $self->{'index'});
|
---|
402 | $fields =~ s/\ball\b/Title,Creator,text/;
|
---|
403 | $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
|
---|
404 |
|
---|
405 | my $doc_section = 0; # just for this document
|
---|
406 | my $text = "";
|
---|
407 | my $text_extra = "";
|
---|
408 |
|
---|
409 | # get the text for this document
|
---|
410 | my $section = $doc_obj->get_top_section();
|
---|
411 | while (defined $section) {
|
---|
412 | # update a few statistics
|
---|
413 | $doc_section++;
|
---|
414 | $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
|
---|
415 | $self->{'num_sections'} += 1;
|
---|
416 |
|
---|
417 | foreach $field (split (/,/, $fields)) {
|
---|
418 | # only deal with this field if it doesn't start with top or
|
---|
419 | # this is the first section
|
---|
420 | my $real_field = $field;
|
---|
421 | if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
|
---|
422 | my $new_text = "";
|
---|
423 | if ($real_field eq "text") {
|
---|
424 | $new_text = $doc_obj->get_text ($section);
|
---|
425 | $new_text =~ s/[\cB\cC]//g;
|
---|
426 | $new_text =~ s/(<p\b)/\cC$1/gi;
|
---|
427 |
|
---|
428 | } else {
|
---|
429 | $new_text = join ("\cC", @{$doc_obj->get_metadata ($section, $real_field)});
|
---|
430 | }
|
---|
431 |
|
---|
432 | $text .= "$new_text\cC";
|
---|
433 | }
|
---|
434 | }
|
---|
435 |
|
---|
436 | if ($level eq "document") { $text_extra .= "\cB"; }
|
---|
437 | else { $text .= "\cB"; }
|
---|
438 |
|
---|
439 | $section = $doc_obj->get_next_section($section);
|
---|
440 | }
|
---|
441 |
|
---|
442 | print $handle "$text$text_extra";
|
---|
443 | }
|
---|
444 |
|
---|
445 | # converts leading number in classification back
|
---|
446 | # to letter it represents
|
---|
447 | # i.e 67.2.4 becomes C.2.4
|
---|
448 | sub char_classification {
|
---|
449 | my $self = shift (@_);
|
---|
450 | my ($classification) = @_;
|
---|
451 |
|
---|
452 | return $classification if $classification eq "";
|
---|
453 | my ($c) = $classification =~ /^\.?(\d+)/;
|
---|
454 | $c = chr($c);
|
---|
455 | $classification =~ s/^\d+/$c/;
|
---|
456 |
|
---|
457 | return $classification;
|
---|
458 | }
|
---|
459 |
|
---|
460 | # converts leading letter of a classification into its ascii equivalent
|
---|
461 | # i.e C.2.4 becomes 67.2.4
|
---|
462 | sub int_classification {
|
---|
463 | my $self = shift (@_);
|
---|
464 | my ($classification) = @_;
|
---|
465 | my $c = ord($classification);
|
---|
466 | $classification =~ s/^./$c/;
|
---|
467 |
|
---|
468 | return $classification;
|
---|
469 | }
|
---|
470 | 1;
|
---|
471 |
|
---|