source: trunk/gsdl/perllib/mgbuildproc.pm@ 38

Last change on this file since 38 was 38, checked in by sjboddie, 25 years ago

Allowed for adding date to infodb

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 13.8 KB
Line 
1# This document processor outputs a document
2# for mg to process
3
4
5package mgbuildproc;
6
7use docproc;
8use util;
9
10
11BEGIN {
12 @ISA = ('docproc');
13}
14
15
16sub new {
17 my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;
18 my $self = new docproc ();
19
20 $self->{'collection'} = $collection;
21 $self->{'source_dir'} = $source_dir;
22 $self->{'build_dir'} = $build_dir;
23 $self->{'verbosity'} = $verbosity;
24 $self->{'mode'} = "text";
25 $self->{'index'} = "section:text";
26 $self->{'output_handle'} = "STDOUT";
27 $self->{'num_docs'} = 0;
28 $self->{'num_sections'} = 0;
29 $self->{'num_bytes'} = 0;
30
31 return bless $self, $class;
32}
33
34sub reset {
35 my $self = shift (@_);
36
37 $self->{'num_docs'} = 0;
38 $self->{'num_sections'} = 0;
39 $self->{'num_bytes'} = 0;
40}
41
42sub get_num_docs {
43 my $self = shift (@_);
44
45 return $self->{'num_docs'};
46}
47
48sub get_num_sections {
49 my $self = shift (@_);
50
51 return $self->{'num_sections'};
52}
53
54sub get_num_bytes {
55 my $self = shift (@_);
56
57 return $self->{'num_bytes'};
58}
59
60sub set_output_handle {
61 my $self = shift (@_);
62 my ($handle) = @_;
63
64 $self->{'output_handle'} = $handle;
65}
66
67sub set_mode {
68 my $self = shift (@_);
69 my ($mode) = @_;
70
71 $self->{'mode'} = $mode;
72}
73
74sub set_index {
75 my $self = shift (@_);
76 my ($index) = @_;
77
78 $self->{'index'} = $index;
79}
80
81sub process {
82 my $self = shift (@_);
83 my ($doc_obj) = @_;
84 my $method = $self->{'mode'};
85
86 $self->$method($doc_obj);
87}
88
89sub infodb {
90 my $self = shift (@_);
91 my ($doc_obj) = @_;
92 my $handle = $self->{'output_handle'};
93
94 if ($doc_obj eq 'classifications') {
95 # output classifications if all books have been processed
96 foreach $key (keys %$saved_classifications) {
97 $saved_classifications->{$key}->{'contains'} = undef if
98 $saved_classifications->{$key}->{'contains'} eq "";
99 $saved_classifications->{$key}->{'parent'} = undef if
100 (!defined $saved_classifications->{$key}->{'parent'}) ||
101 ($saved_classifications->{$key}->{'parent'} eq "");
102 $self->write_to_gdbm ($handle, $key, $saved_classifications->{$key}->{'title'},
103 undef, undef, undef, undef,
104 $saved_classifications->{$key}->{'contains'}, undef,
105 $saved_classifications->{$key}->{'parent'}, undef, undef);
106 }
107 return;
108 }
109
110 my $doctype = $doc_obj->get_doc_type();
111
112 # only output this document if it is one to be indexed
113 return if ($doctype ne "indexed_doc" &&
114 $doctype ne "classification");
115
116 # found classification document
117 $saved_classifications = {} if ($doctype eq "classification");
118 $sectionmap = {};
119
120 # this is another document
121 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
122
123 my $section = $doc_obj->get_top_section();
124 while (defined $section) {
125 # update a few statistics
126 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
127 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
128
129 my $title = $doc_obj->get_metadata_element($section, "Title");
130 my $source = $doc_obj->get_metadata_element($section, "Source");
131 my $date = $doc_obj->get_metadata_element($section, "Date");
132 my $jobnumber = $doc_obj->get_source_filename();
133
134 my $mapped_section = $self->map_section($doctype, $section);
135 if ($doctype eq "classification") {
136 $mapped_section = $self->char_classification($mapped_section);
137 } else {
138 $mapped_section = "B.$self->{'num_docs'}.$mapped_section";
139 }
140 $mapped_section =~ s/\.+$//;
141
142 my ($parent, $classification, $creator, $OID);
143
144 if ($section ne $doc_obj->get_top_section()) {
145 $parent = $self->map_section($doctype, $doc_obj->get_parent_section($section));
146 if ($doctype eq "classification") {
147 $parent = $self->char_classification($parent);
148 } else {
149 $parent = "B.$self->{'num_docs'}.$parent";
150 }
151 $parent =~ s/\.+$//;
152 } else {
153 $classification = $self->get_classifications($doc_obj, $section, $mapped_section)
154 unless $doctype eq "classification";
155 $creator = $doc_obj->get_metadata_element($section, "Creator");
156 $OID = $doc_obj->get_OID();
157 # this is a hack at getting OID to look like the directory path - I'm sure there's
158 # a better way to do it but it's late... Stefan
159 my @OIDchars = split //, $OID;
160 $OID = "";
161 my $count = 0;
162 foreach $i (@OIDchars) {
163 if ($count == 7) {
164 $OID .= "$i/";
165 $count = 0;
166 } else {
167 $OID .= $i;
168 $count ++;
169 }
170 }
171 $OID =~ s/\/$//;
172 $OID .= ".dir";
173 }
174
175 if ($doc_obj->is_leaf_section($section)) {
176 if ($doctype eq "classification") {
177 if (defined $saved_classifications->{$mapped_section}) {
178 print STDERR "mgbuildproc:warning: classification $mapped_section " .
179 "declared multiple times\n";
180 } else {
181 $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => "",
182 'parent' => $parent};
183 }
184 } else {
185 $self->write_to_gdbm($handle, $mapped_section, $title, $creator, $source, $date, $jobnumber, undef,
186 $self->{'num_sections'}, $parent, $classification, $OID);
187 }
188 } else {
189
190 # add the introductory section if it exists
191 my $contains = "";
192 if ($doc_obj->get_text_length($section) > 0) {
193 $contains .= "B.$self->{'num_docs'}." . $self->map_section ($doctype, "$section.0");
194 }
195
196 # add the rest of the children
197 my @children = @{$doc_obj->get_children($section)};
198 if ($doctype eq "classification") {
199 map {$_ = $self->char_classification($_);} @children;
200 if (defined $saved_classifications->{$mapped_section}) {
201 print STDERR "mgbuildproc:warning: classification $mapped_section " .
202 "declared multiple times\n";
203 } else {
204 $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => ""};
205 $saved_classifications->{$mapped_section}->{'contains'} = join ":", @children;
206 $saved_classifications->{$mapped_section}->{'parent'} = $parent;
207 }
208 } else {
209 map {$_ = "B.$self->{'num_docs'}." . $self->map_section($doctype, $_);} @children;
210 $contains .= ":" if $contains ne "";
211 $contains .= join ":", @children;
212 $self->write_to_gdbm ($handle, $mapped_section, $title, $creator, $source,
213 $date, $jobnumber, $contains, $self->{'num_sections'},
214 $parent, $classification, $OID);
215
216 if ($doc_obj->get_text_length($section) > 0) {
217 my $intromapsection = "B.$self->{'num_docs'}." .
218 $self->map_section($doctype, "$section.0");
219 $self->write_to_gdbm ($handle, $intromapsection, "<i>(introductory text)</i>", $creator,
220 $source, $date, $jobnumber, undef, $self->{'num_sections'},
221 $mapped_section, undef, $OID);
222 }
223 }
224 }
225 $section = $doc_obj->get_next_section($section);
226 }
227
228 # update classification list with those books that
229 # were processed before classification list
230 if ($doctype eq "classification" && defined $temp_classifications) {
231 foreach $key (keys(%$temp_classifications)) {
232 if (!defined $saved_classifications->{$key}) {
233 print STDERR "mgbuildproc:$temp_classifications->{$key} belong to " .
234 "undefined classification $key\n";
235 }
236 $saved_classifications->{$key}->{'contains'} = $temp_classifications->{$key}->{'contains'};
237 }
238 $temp_classifications = undef;
239 }
240}
241
242
243sub write_to_gdbm {
244 my $self = shift (@_);
245 my ($handle, $section, $title, $creator, $source, $date,
246 $jobnumber, $contains, $docnum, $parent, $classification, $OID) = @_;
247
248 print $handle "[$section]\n";
249 print $handle "<t>$title\n" if defined $title;
250 print $handle "<a>$creator\n" if defined $creator;
251 print $handle "<s>$source\n" if defined $source;
252 print $handle "<i>$date\n" if defined $date;
253 print $handle "<j>$jobnumber\n" if defined $jobnumber;
254 print $handle "<c>$contains\n" if defined $contains;
255 print $handle "<d>$docnum\n" if defined $docnum;
256 print $handle "<p>$parent\n" if defined $parent;
257 print $handle "<x>$classification\n" if defined $classification;
258 print $handle "<o>$OID\n" if defined $OID;
259 print $handle '-' x 70, "\n";
260
261 if (defined $docnum) {
262 print $handle "[$docnum]\n";
263 print $handle "<x>$section\n";
264 print $handle '-' x 70, "\n";
265 }
266}
267
268sub get_classifications {
269 my $self = shift (@_);
270 my ($doc_obj, $section, $mapped_section) = @_;
271
272 my ($classificationsref);
273 if (defined $saved_classifications) {
274 # classification list has been processed
275 $classificationsref = $saved_classifications;
276 } else {
277 # classification list has yet to be processed, save
278 # books in temp_classifications until list is processed
279 # and they can be moved into saved_classifications
280 $temp_classifications = {} unless defined $temp_classifications;
281 $classificationsref = $temp_classifications;
282 }
283
284 my $classifications = $doc_obj->get_metadata($section, "Subject");
285
286 # need to save which books belong in each classification
287 # to output later
288 foreach $classification (@$classifications) {
289
290 if (!defined $classificationsref->{$classification}) {
291 $classificationsref->{$classification}->{'parent'} = $doc_obj->get_parent_section($section);
292 $classificationsref->{$classification}->{'contains'} = "";
293 }
294 $classificationsref->{$classification}->{'contains'} .= ":" unless
295 $classificationsref->{$classification}->{'contains'} eq "";
296 $classificationsref->{$classification}->{'contains'} .= $mapped_section;
297 }
298
299 return (join ":", @$classifications);
300}
301
302sub map_section {
303 my $self = shift (@_);
304 my ($doctype, $section) = @_;
305
306 # classifications should never need to be mapped
307 return $section if $doctype eq "classification";
308
309 return "" unless (defined $section) && ($section ne "");
310
311 $sectionmap = {} unless defined $sectionmap;
312
313
314
315 # get the section into a standard format
316 $section =~ s/^\.+|\.+$//g;
317
318 # return the mapped section if it has been seen before
319 if (defined $sectionmap->{$section}) {
320 return $sectionmap->{$section};
321 }
322
323 # find out the parent section
324 my ($parentsection, $num);
325 if ($section =~ /^(.+)\.(\d+)$/) {
326 $parentsection = $1;
327 $num = $2;
328 } elsif ($section =~ /^(\d+)$/) {
329 $parentsection = "";
330 $num = $1;
331 } else {
332 print STDERR "mgbuildproc:map_section - misformed section $section\n";
333 }
334 $parentsection = "" unless defined $parentsection;
335
336 if ($parentsection eq "") {
337 $num ++ if $num == 1 and defined $sectionmap->{'0'};
338 } else {
339 $num ++ if $num == 1 and defined $sectionmap->{"$parentsection.0"};
340 }
341
342 # find out the mapped parent section
343 my $mappedparentsection = $self->map_section ($doctype, $parentsection);
344
345 # find the next unused child section
346 my $previousnum = $num - 1;
347 my $previoussection = "";
348 if ($parentsection eq "") {
349 $previoussection = $previousnum;
350 $previoussection = 0 if ($section == 1) && (defined $sectionmap->{'0'});
351 } else{
352 $previoussection = "$parentsection.$previousnum";
353 $previoussection = "$parentsection.0" if ($section =~ /\.1$/) && (defined $sectionmap->{"$parentsection.0"});
354 }
355 while (($previousnum > 0) && (!defined $sectionmap->{$previoussection})) {
356 $previousnum--;
357 $previoussection = "$parentsection.$previousnum";
358 $previoussection = $previousnum if $parentsection eq "";
359 }
360
361 # there has been no children under this parent, this section will be number 1
362 if ($previousnum <= 0) {
363 if ($mappedparentsection eq "") {
364 $sectionmap->{$section} = "1";
365 } else {
366 $sectionmap->{$section} = "$mappedparentsection.1";
367 }
368
369 } else {
370 # get the previous mapped number
371 my $previousmapnum = 0;
372 if ($sectionmap->{$previoussection} =~ /(^|\.)?(\d+)$/) {
373 $previousmapnum = $2;
374 }
375
376 # increment it to get this mapped child
377 my $mappednum = $previousmapnum+1;
378
379 if ($mappedparentsection eq "") {
380 $sectionmap->{$section} = $mappednum;
381 } else {
382 $sectionmap->{$section} = "$mappedparentsection.$mappednum";
383 }
384 }
385
386 return $sectionmap->{$section};
387}
388
389sub text {
390 my $self = shift (@_);
391 my ($doc_obj) = @_;
392 my $handle = $self->{'output_handle'};
393
394 # only output this document if it is one to be indexed
395 return if ($doc_obj->get_doc_type() ne "indexed_doc");
396
397 # this is another document
398 $self->{'num_docs'} += 1;
399
400 # get the parameters for the output
401 my ($level, $fields) = split (/:/, $self->{'index'});
402 $fields =~ s/\ball\b/Title,Creator,text/;
403 $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
404
405 my $doc_section = 0; # just for this document
406 my $text = "";
407 my $text_extra = "";
408
409 # get the text for this document
410 my $section = $doc_obj->get_top_section();
411 while (defined $section) {
412 # update a few statistics
413 $doc_section++;
414 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
415 $self->{'num_sections'} += 1;
416
417 foreach $field (split (/,/, $fields)) {
418 # only deal with this field if it doesn't start with top or
419 # this is the first section
420 my $real_field = $field;
421 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
422 my $new_text = "";
423 if ($real_field eq "text") {
424 $new_text = $doc_obj->get_text ($section);
425 $new_text =~ s/[\cB\cC]//g;
426 $new_text =~ s/(<p\b)/\cC$1/gi;
427
428 } else {
429 $new_text = join ("\cC", @{$doc_obj->get_metadata ($section, $real_field)});
430 }
431
432 $text .= "$new_text\cC";
433 }
434 }
435
436 if ($level eq "document") { $text_extra .= "\cB"; }
437 else { $text .= "\cB"; }
438
439 $section = $doc_obj->get_next_section($section);
440 }
441
442 print $handle "$text$text_extra";
443}
444
445# converts leading number in classification back
446# to letter it represents
447# i.e 67.2.4 becomes C.2.4
448sub char_classification {
449 my $self = shift (@_);
450 my ($classification) = @_;
451
452 return $classification if $classification eq "";
453 my ($c) = $classification =~ /^\.?(\d+)/;
454 $c = chr($c);
455 $classification =~ s/^\d+/$c/;
456
457 return $classification;
458}
459
460# converts leading letter of a classification into its ascii equivalent
461# i.e C.2.4 becomes 67.2.4
462sub int_classification {
463 my $self = shift (@_);
464 my ($classification) = @_;
465 my $c = ord($classification);
466 $classification =~ s/^./$c/;
467
468 return $classification;
469}
4701;
471
Note: See TracBrowser for help on using the repository browser.