source: trunk/gsdl/perllib/mgbuildproc.pm@ 35

Last change on this file since 35 was 35, checked in by sjboddie, 25 years ago

Allowed for adding the 'Source' attribute to the infodb

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 13.7 KB
Line 
1# This document processor outputs a document
2# for mg to process
3
4
5package mgbuildproc;
6
7use docproc;
8use util;
9
10
11BEGIN {
12 @ISA = ('docproc');
13}
14
15
16sub new {
17 my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;
18 my $self = new docproc ();
19
20 $self->{'collection'} = $collection;
21 $self->{'source_dir'} = $source_dir;
22 $self->{'build_dir'} = $build_dir;
23 $self->{'verbosity'} = $verbosity;
24 $self->{'mode'} = "text";
25 $self->{'index'} = "section:text";
26 $self->{'output_handle'} = "STDOUT";
27 $self->{'num_docs'} = 0;
28 $self->{'num_sections'} = 0;
29 $self->{'num_bytes'} = 0;
30
31 return bless $self, $class;
32}
33
34sub reset {
35 my $self = shift (@_);
36
37 $self->{'num_docs'} = 0;
38 $self->{'num_sections'} = 0;
39 $self->{'num_bytes'} = 0;
40}
41
42sub get_num_docs {
43 my $self = shift (@_);
44
45 return $self->{'num_docs'};
46}
47
48sub get_num_sections {
49 my $self = shift (@_);
50
51 return $self->{'num_sections'};
52}
53
54sub get_num_bytes {
55 my $self = shift (@_);
56
57 return $self->{'num_bytes'};
58}
59
60sub set_output_handle {
61 my $self = shift (@_);
62 my ($handle) = @_;
63
64 $self->{'output_handle'} = $handle;
65}
66
67sub set_mode {
68 my $self = shift (@_);
69 my ($mode) = @_;
70
71 $self->{'mode'} = $mode;
72}
73
74sub set_index {
75 my $self = shift (@_);
76 my ($index) = @_;
77
78 $self->{'index'} = $index;
79}
80
81sub process {
82 my $self = shift (@_);
83 my ($doc_obj) = @_;
84 my $method = $self->{'mode'};
85
86 $self->$method($doc_obj);
87}
88
89sub infodb {
90 my $self = shift (@_);
91 my ($doc_obj) = @_;
92 my $handle = $self->{'output_handle'};
93
94 if ($doc_obj eq 'classifications') {
95 # output classifications if all books have been processed
96 foreach $key (keys %$saved_classifications) {
97 $saved_classifications->{$key}->{'contains'} = undef if
98 $saved_classifications->{$key}->{'contains'} eq "";
99 $saved_classifications->{$key}->{'parent'} = undef if
100 (!defined $saved_classifications->{$key}->{'parent'}) ||
101 ($saved_classifications->{$key}->{'parent'} eq "");
102 $self->write_to_gdbm ($handle, $key, $saved_classifications->{$key}->{'title'},
103 undef, undef,
104 $saved_classifications->{$key}->{'contains'}, undef,
105 $saved_classifications->{$key}->{'parent'}, undef);
106 }
107 return;
108 }
109
110 my $doctype = $doc_obj->get_doc_type();
111
112 # only output this document if it is one to be indexed
113 return if ($doctype ne "indexed_doc" &&
114 $doctype ne "classification");
115
116 # found classification document
117 $saved_classifications = {} if ($doctype eq "classification");
118 $sectionmap = {};
119
120 # this is another document
121 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
122
123 my $section = $doc_obj->get_top_section();
124 while (defined $section) {
125 # update a few statistics
126 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
127 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
128
129 my $title = $doc_obj->get_metadata_element($section, "Title");
130 my $source = $doc_obj->get_metadata_element($section, "Source");
131 my $jobnumber = $doc_obj->get_source_filename();
132
133 my $mapped_section = $self->map_section($doctype, $section);
134 if ($doctype eq "classification") {
135 $mapped_section = $self->char_classification($mapped_section);
136 } else {
137 $mapped_section = "B.$self->{'num_docs'}.$mapped_section";
138 }
139 $mapped_section =~ s/\.+$//;
140
141 my ($parent, $classification, $creator, $OID);
142
143 if ($section ne $doc_obj->get_top_section()) {
144 $parent = $self->map_section($doctype, $doc_obj->get_parent_section($section));
145 if ($doctype eq "classification") {
146 $parent = $self->char_classification($parent);
147 } else {
148 $parent = "B.$self->{'num_docs'}.$parent";
149 }
150 $parent =~ s/\.+$//;
151 } else {
152 $classification = $self->get_classifications($doc_obj, $section, $mapped_section)
153 unless $doctype eq "classification";
154 $creator = $doc_obj->get_metadata_element($section, "Creator");
155 $OID = $doc_obj->get_OID();
156 # this is a hack at getting OID to look like the directory path - I'm sure there's
157 # a better way to do it but it's late... Stefan
158 my @OIDchars = split //, $OID;
159 $OID = "";
160 my $count = 0;
161 foreach $i (@OIDchars) {
162 if ($count == 7) {
163 $OID .= "$i/";
164 $count = 0;
165 } else {
166 $OID .= $i;
167 $count ++;
168 }
169 }
170 $OID =~ s/\/$//;
171 $OID .= ".dir";
172 }
173
174 if ($doc_obj->is_leaf_section($section)) {
175 if ($doctype eq "classification") {
176 if (defined $saved_classifications->{$mapped_section}) {
177 print STDERR "mgbuildproc:warning: classification $mapped_section " .
178 "declared multiple times\n";
179 } else {
180 $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => "",
181 'parent' => $parent};
182 }
183 } else {
184 $self->write_to_gdbm($handle, $mapped_section, $title, $creator, $jobnumber, undef,
185 $self->{'num_sections'}, $parent, $classification, $OID, $source);
186 }
187 } else {
188
189 # add the introductory section if it exists
190 my $contains = "";
191 if ($doc_obj->get_text_length($section) > 0) {
192 $contains .= "B.$self->{'num_docs'}." . $self->map_section ($doctype, "$section.0");
193 }
194
195 # add the rest of the children
196 my @children = @{$doc_obj->get_children($section)};
197 if ($doctype eq "classification") {
198 map {$_ = $self->char_classification($_);} @children;
199 if (defined $saved_classifications->{$mapped_section}) {
200 print STDERR "mgbuildproc:warning: classification $mapped_section " .
201 "declared multiple times\n";
202 } else {
203 $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => ""};
204 $saved_classifications->{$mapped_section}->{'contains'} = join ":", @children;
205 $saved_classifications->{$mapped_section}->{'parent'} = $parent;
206 }
207 } else {
208 map {$_ = "B.$self->{'num_docs'}." . $self->map_section($doctype, $_);} @children;
209 $contains .= ":" if $contains ne "";
210 $contains .= join ":", @children;
211 $self->write_to_gdbm ($handle, $mapped_section, $title, $creator,
212 $jobnumber, $contains, $self->{'num_sections'},
213 $parent, $classification, $OID, $source);
214
215 if ($doc_obj->get_text_length($section) > 0) {
216 my $intromapsection = "B.$self->{'num_docs'}." .
217 $self->map_section($doctype, "$section.0");
218 $self->write_to_gdbm ($handle, $intromapsection, "<i>(introductory text)</i>", $creator,
219 $jobnumber, undef, $self->{'num_sections'},
220 $mapped_section, undef, $OID, $source);
221 }
222 }
223 }
224 $section = $doc_obj->get_next_section($section);
225 }
226
227 # update classification list with those books that
228 # were processed before classification list
229 if ($doctype eq "classification" && defined $temp_classifications) {
230 foreach $key (keys(%$temp_classifications)) {
231 if (!defined $saved_classifications->{$key}) {
232 print STDERR "mgbuildproc:$temp_classifications->{$key} belong to " .
233 "undefined classification $key\n";
234 }
235 $saved_classifications->{$key}->{'contains'} = $temp_classifications->{$key}->{'contains'};
236 }
237 $temp_classifications = undef;
238 }
239}
240
241
242sub write_to_gdbm {
243 my $self = shift (@_);
244 my ($handle, $section, $title, $creator, $jobnumber, $contains,
245 $docnum, $parent, $classification, $OID) = @_;
246
247 print $handle "[$section]\n";
248 print $handle "<t>$title\n" if defined $title;
249 print $handle "<a>$creator\n" if defined $creator;
250 print $handle "<s>$source\n" if defined $source;
251 print $handle "<j>$jobnumber\n" if defined $jobnumber;
252 print $handle "<c>$contains\n" if defined $contains;
253 print $handle "<d>$docnum\n" if defined $docnum;
254 print $handle "<p>$parent\n" if defined $parent;
255 print $handle "<x>$classification\n" if defined $classification;
256 print $handle "<o>$OID\n" if defined $OID;
257 print $handle '-' x 70, "\n";
258
259 if (defined $docnum) {
260 print $handle "[$docnum]\n";
261 print $handle "<x>$section\n";
262 print $handle '-' x 70, "\n";
263 }
264}
265
266sub get_classifications {
267 my $self = shift (@_);
268 my ($doc_obj, $section, $mapped_section) = @_;
269
270 my ($classificationsref);
271 if (defined $saved_classifications) {
272 # classification list has been processed
273 $classificationsref = $saved_classifications;
274 } else {
275 # classification list has yet to be processed, save
276 # books in temp_classifications until list is processed
277 # and they can be moved into saved_classifications
278 $temp_classifications = {} unless defined $temp_classifications;
279 $classificationsref = $temp_classifications;
280 }
281
282 my $classifications = $doc_obj->get_metadata($section, "Subject");
283
284 # need to save which books belong in each classification
285 # to output later
286 foreach $classification (@$classifications) {
287
288 if (!defined $classificationsref->{$classification}) {
289 $classificationsref->{$classification}->{'parent'} = $doc_obj->get_parent_section($section);
290 $classificationsref->{$classification}->{'contains'} = "";
291 }
292 $classificationsref->{$classification}->{'contains'} .= ":" unless
293 $classificationsref->{$classification}->{'contains'} eq "";
294 $classificationsref->{$classification}->{'contains'} .= $mapped_section;
295 }
296
297 return (join ":", @$classifications);
298}
299
300sub map_section {
301 my $self = shift (@_);
302 my ($doctype, $section) = @_;
303
304 # classifications should never need to be mapped
305 return $section if $doctype eq "classification";
306
307 return "" unless (defined $section) && ($section ne "");
308
309 $sectionmap = {} unless defined $sectionmap;
310
311
312
313 # get the section into a standard format
314 $section =~ s/^\.+|\.+$//g;
315
316 # return the mapped section if it has been seen before
317 if (defined $sectionmap->{$section}) {
318 return $sectionmap->{$section};
319 }
320
321 # find out the parent section
322 my ($parentsection, $num);
323 if ($section =~ /^(.+)\.(\d+)$/) {
324 $parentsection = $1;
325 $num = $2;
326 } elsif ($section =~ /^(\d+)$/) {
327 $parentsection = "";
328 $num = $1;
329 } else {
330 print STDERR "mgbuildproc:map_section - misformed section $section\n";
331 }
332 $parentsection = "" unless defined $parentsection;
333
334 if ($parentsection eq "") {
335 $num ++ if $num == 1 and defined $sectionmap->{'0'};
336 } else {
337 $num ++ if $num == 1 and defined $sectionmap->{"$parentsection.0"};
338 }
339
340 # find out the mapped parent section
341 my $mappedparentsection = $self->map_section ($doctype, $parentsection);
342
343 # find the next unused child section
344 my $previousnum = $num - 1;
345 my $previoussection = "";
346 if ($parentsection eq "") {
347 $previoussection = $previousnum;
348 $previoussection = 0 if ($section == 1) && (defined $sectionmap->{'0'});
349 } else{
350 $previoussection = "$parentsection.$previousnum";
351 $previoussection = "$parentsection.0" if ($section =~ /\.1$/) && (defined $sectionmap->{"$parentsection.0"});
352 }
353 while (($previousnum > 0) && (!defined $sectionmap->{$previoussection})) {
354 $previousnum--;
355 $previoussection = "$parentsection.$previousnum";
356 $previoussection = $previousnum if $parentsection eq "";
357 }
358
359 # there has been no children under this parent, this section will be number 1
360 if ($previousnum <= 0) {
361 if ($mappedparentsection eq "") {
362 $sectionmap->{$section} = "1";
363 } else {
364 $sectionmap->{$section} = "$mappedparentsection.1";
365 }
366
367 } else {
368 # get the previous mapped number
369 my $previousmapnum = 0;
370 if ($sectionmap->{$previoussection} =~ /(^|\.)?(\d+)$/) {
371 $previousmapnum = $2;
372 }
373
374 # increment it to get this mapped child
375 my $mappednum = $previousmapnum+1;
376
377 if ($mappedparentsection eq "") {
378 $sectionmap->{$section} = $mappednum;
379 } else {
380 $sectionmap->{$section} = "$mappedparentsection.$mappednum";
381 }
382 }
383
384 return $sectionmap->{$section};
385}
386
387sub text {
388 my $self = shift (@_);
389 my ($doc_obj) = @_;
390 my $handle = $self->{'output_handle'};
391
392 # only output this document if it is one to be indexed
393 return if ($doc_obj->get_doc_type() ne "indexed_doc");
394
395 # this is another document
396 $self->{'num_docs'} += 1;
397
398 # get the parameters for the output
399 my ($level, $fields) = split (/:/, $self->{'index'});
400 $fields =~ s/\ball\b/Title,Creator,text/;
401 $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
402
403 my $doc_section = 0; # just for this document
404 my $text = "";
405 my $text_extra = "";
406
407 # get the text for this document
408 my $section = $doc_obj->get_top_section();
409 while (defined $section) {
410 # update a few statistics
411 $doc_section++;
412 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
413 $self->{'num_sections'} += 1;
414
415 foreach $field (split (/,/, $fields)) {
416 # only deal with this field if it doesn't start with top or
417 # this is the first section
418 my $real_field = $field;
419 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
420 my $new_text = "";
421 if ($real_field eq "text") {
422 $new_text = $doc_obj->get_text ($section);
423 $new_text =~ s/[\cB\cC]//g;
424 $new_text =~ s/(<p\b)/\cC$1/gi;
425
426 } else {
427 $new_text = join ("\cC", @{$doc_obj->get_metadata ($section, $real_field)});
428 }
429
430 $text .= "$new_text\cC";
431 }
432 }
433
434 if ($level eq "document") { $text_extra .= "\cB"; }
435 else { $text .= "\cB"; }
436
437 $section = $doc_obj->get_next_section($section);
438 }
439
440 print $handle "$text$text_extra";
441}
442
443# converts leading number in classification back
444# to letter it represents
445# i.e 67.2.4 becomes C.2.4
446sub char_classification {
447 my $self = shift (@_);
448 my ($classification) = @_;
449
450 return $classification if $classification eq "";
451 my ($c) = $classification =~ /^\.?(\d+)/;
452 $c = chr($c);
453 $classification =~ s/^\d+/$c/;
454
455 return $classification;
456}
457
458# converts leading letter of a classification into its ascii equivalent
459# i.e C.2.4 becomes 67.2.4
460sub int_classification {
461 my $self = shift (@_);
462 my ($classification) = @_;
463 my $c = ord($classification);
464 $classification =~ s/^./$c/;
465
466 return $classification;
467}
4681;
469
Note: See TracBrowser for help on using the repository browser.