source: trunk/gsdl/perllib/mgbuildproc.pm@ 69

Last change on this file since 69 was 69, checked in by sjboddie, 25 years ago

Sub-collection indexes may now be defined within the collect.cfg file as
subcollection blah1 Title/blah/i
subcollection blah2 !Title/blah/i
indexsubcollections blah1 blah2 blah1,blah2
indexes section:text document:text
This example would create section:text and document:text indexes for:

  1. the blah1 subcollection (i.e those documents whose Title field contains 'blah')
  2. the blah2 subcollection (i.e. those documents whose Title field doesn't contain 'blah')
  3. both subcollections (i.e. all documents)

The field to match the regular expression against (Title in this example) may be
any valid metadata tag or 'filename'.
The regular expression (blah in this example) may be any valid perl regular expression.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 15.0 KB
Line 
1# This document processor outputs a document
2# for mg to process
3
4
5package mgbuildproc;
6
7use docproc;
8use util;
9
10
11BEGIN {
12 @ISA = ('docproc');
13}
14
15
16sub new {
17 my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;
18 my $self = new docproc ();
19
20 $self->{'collection'} = $collection;
21 $self->{'source_dir'} = $source_dir;
22 $self->{'build_dir'} = $build_dir;
23 $self->{'verbosity'} = $verbosity;
24 $self->{'mode'} = "text";
25 $self->{'index'} = "section:text";
26 $self->{'indexexparr'} = [];
27 $self->{'output_handle'} = "STDOUT";
28 $self->{'num_docs'} = 0;
29 $self->{'num_sections'} = 0;
30 $self->{'num_bytes'} = 0;
31
32 return bless $self, $class;
33}
34
35sub reset {
36 my $self = shift (@_);
37
38 $self->{'num_docs'} = 0;
39 $self->{'num_sections'} = 0;
40 $self->{'num_bytes'} = 0;
41}
42
43sub get_num_docs {
44 my $self = shift (@_);
45
46 return $self->{'num_docs'};
47}
48
49sub get_num_sections {
50 my $self = shift (@_);
51
52 return $self->{'num_sections'};
53}
54
55sub get_num_bytes {
56 my $self = shift (@_);
57
58 return $self->{'num_bytes'};
59}
60
61sub set_output_handle {
62 my $self = shift (@_);
63 my ($handle) = @_;
64
65 $self->{'output_handle'} = $handle;
66}
67
68sub set_mode {
69 my $self = shift (@_);
70 my ($mode) = @_;
71
72 $self->{'mode'} = $mode;
73}
74
75sub set_index {
76 my $self = shift (@_);
77 my ($index, $indexexparr) = @_;
78
79 $self->{'index'} = $index;
80 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
81}
82
83sub process {
84 my $self = shift (@_);
85 my ($doc_obj) = @_;
86 my $method = $self->{'mode'};
87
88 $self->$method($doc_obj);
89}
90
91sub infodb {
92 my $self = shift (@_);
93 my ($doc_obj) = @_;
94 my $handle = $self->{'output_handle'};
95
96 if ($doc_obj eq 'classifications') {
97 # output classifications if all books have been processed
98 foreach $key (keys %$saved_classifications) {
99 $saved_classifications->{$key}->{'contains'} = undef if
100 $saved_classifications->{$key}->{'contains'} eq "";
101 $saved_classifications->{$key}->{'parent'} = undef if
102 (!defined $saved_classifications->{$key}->{'parent'}) ||
103 ($saved_classifications->{$key}->{'parent'} eq "");
104 $self->write_to_gdbm ($handle, $key, $saved_classifications->{$key}->{'title'},
105 undef, undef, undef, undef,
106 $saved_classifications->{$key}->{'contains'}, undef,
107 $saved_classifications->{$key}->{'parent'}, undef, undef);
108 }
109 return;
110 }
111
112 my $doctype = $doc_obj->get_doc_type();
113
114 # only output this document if it is one to be indexed
115 return if ($doctype ne "indexed_doc" &&
116 $doctype ne "classification");
117
118 # found classification document
119 $saved_classifications = {} if ($doctype eq "classification");
120 $sectionmap = {};
121
122 # this is another document
123 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
124
125 my $section = $doc_obj->get_top_section();
126 while (defined $section) {
127 # update a few statistics
128 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
129 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
130
131 my $title = $doc_obj->get_metadata_element($section, "Title");
132 my $source = $doc_obj->get_metadata_element($section, "Source");
133 my $date = $doc_obj->get_metadata_element($section, "Date");
134 my $jobnumber = $doc_obj->get_source_filename();
135
136 my $mapped_section = $self->map_section($doctype, $section);
137 if ($doctype eq "classification") {
138 $mapped_section = $self->char_classification($mapped_section);
139 } else {
140 $mapped_section = "B.$self->{'num_docs'}.$mapped_section";
141 }
142 $mapped_section =~ s/\.+$//;
143
144 my ($parent, $classification, $creator, $OID);
145
146 if ($section ne $doc_obj->get_top_section()) {
147 $parent = $self->map_section($doctype, $doc_obj->get_parent_section($section));
148 if ($doctype eq "classification") {
149 $parent = $self->char_classification($parent);
150 } else {
151 $parent = "B.$self->{'num_docs'}.$parent";
152 }
153 $parent =~ s/\.+$//;
154 } else {
155 $classification = $self->get_classifications($doc_obj, $section, $mapped_section)
156 unless $doctype eq "classification";
157 $creator = $doc_obj->get_metadata_element($section, "Creator");
158 $OID = $doc_obj->get_OID();
159 # this is a hack at getting OID to look like the directory path - I'm sure there's
160 # a better way to do it but it's late... Stefan
161 my @OIDchars = split //, $OID;
162 $OID = "";
163 my $count = 0;
164 foreach $i (@OIDchars) {
165 if ($count == 7) {
166 $OID .= "$i/";
167 $count = 0;
168 } else {
169 $OID .= $i;
170 $count ++;
171 }
172 }
173 $OID =~ s/\/$//;
174 $OID .= ".dir";
175 }
176
177 if ($doc_obj->is_leaf_section($section)) {
178 if ($doctype eq "classification") {
179 if (defined $saved_classifications->{$mapped_section}) {
180 print STDERR "mgbuildproc:warning: classification $mapped_section " .
181 "declared multiple times\n";
182 } else {
183 $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => "",
184 'parent' => $parent};
185 }
186 } else {
187 $self->write_to_gdbm($handle, $mapped_section, $title, $creator, $source, $date, $jobnumber, undef,
188 $self->{'num_sections'}, $parent, $classification, $OID);
189 }
190 } else {
191
192 # add the introductory section if it exists
193 my $contains = "";
194 if ($doc_obj->get_text_length($section) > 0) {
195 $contains .= "B.$self->{'num_docs'}." . $self->map_section ($doctype, "$section.0");
196 }
197
198 # add the rest of the children
199 my @children = @{$doc_obj->get_children($section)};
200 if ($doctype eq "classification") {
201 map {$_ = $self->char_classification($_);} @children;
202 if (defined $saved_classifications->{$mapped_section}) {
203 print STDERR "mgbuildproc:warning: classification $mapped_section " .
204 "declared multiple times\n";
205 } else {
206 $saved_classifications->{$mapped_section} = {'title' => $title, 'contains' => ""};
207 $saved_classifications->{$mapped_section}->{'contains'} = join ":", @children;
208 $saved_classifications->{$mapped_section}->{'parent'} = $parent;
209 }
210 } else {
211 map {$_ = "B.$self->{'num_docs'}." . $self->map_section($doctype, $_);} @children;
212 $contains .= ":" if $contains ne "";
213 $contains .= join ":", @children;
214 $self->write_to_gdbm ($handle, $mapped_section, $title, $creator, $source,
215 $date, $jobnumber, $contains, $self->{'num_sections'},
216 $parent, $classification, $OID);
217
218 if ($doc_obj->get_text_length($section) > 0) {
219 my $intromapsection = "B.$self->{'num_docs'}." .
220 $self->map_section($doctype, "$section.0");
221 $self->write_to_gdbm ($handle, $intromapsection, "<i>(introductory text)</i>", $creator,
222 $source, $date, $jobnumber, undef, $self->{'num_sections'},
223 $mapped_section, undef, $OID);
224 }
225 }
226 }
227 $section = $doc_obj->get_next_section($section);
228 }
229
230 # update classification list with those books that
231 # were processed before classification list
232 if ($doctype eq "classification" && defined $temp_classifications) {
233 foreach $key (keys(%$temp_classifications)) {
234 if (!defined $saved_classifications->{$key}) {
235 print STDERR "mgbuildproc:$temp_classifications->{$key} belong to " .
236 "undefined classification $key\n";
237 }
238 $saved_classifications->{$key}->{'contains'} = $temp_classifications->{$key}->{'contains'};
239 }
240 $temp_classifications = undef;
241 }
242}
243
244
245sub write_to_gdbm {
246 my $self = shift (@_);
247 my ($handle, $section, $title, $creator, $source, $date,
248 $jobnumber, $contains, $docnum, $parent, $classification, $OID) = @_;
249
250 print $handle "[$section]\n";
251 print $handle "<t>$title\n" if (defined $title && $title ne "");
252 print $handle "<a>$creator\n" if (defined $creator && $creator ne "");
253 print $handle "<s>$source\n" if (defined $source && $source ne "");
254 print $handle "<i>$date\n" if (defined $date && $date ne "");
255 print $handle "<j>$jobnumber\n" if (defined $jobnumber && $jobnumber ne "");
256 print $handle "<c>$contains\n" if (defined $contains && $contains ne "");
257 print $handle "<d>$docnum\n" if (defined $docnum && $docnum ne "");
258 print $handle "<p>$parent\n" if (defined $parent && $parent ne "");
259 print $handle "<x>$classification\n" if (defined $classification && $classification ne "");
260 print $handle "<o>$OID\n" if defined $OID;
261 print $handle '-' x 70, "\n";
262
263 if (defined $docnum) {
264 print $handle "[$docnum]\n";
265 print $handle "<x>$section\n";
266 print $handle '-' x 70, "\n";
267 }
268}
269
270sub get_classifications {
271 my $self = shift (@_);
272 my ($doc_obj, $section, $mapped_section) = @_;
273
274 my ($classificationsref);
275 if (defined $saved_classifications) {
276 # classification list has been processed
277 $classificationsref = $saved_classifications;
278 } else {
279 # classification list has yet to be processed, save
280 # books in temp_classifications until list is processed
281 # and they can be moved into saved_classifications
282 $temp_classifications = {} unless defined $temp_classifications;
283 $classificationsref = $temp_classifications;
284 }
285
286 my $classifications = $doc_obj->get_metadata($section, "Subject");
287
288 # need to save which books belong in each classification
289 # to output later
290 foreach $classification (@$classifications) {
291
292 if (!defined $classificationsref->{$classification}) {
293 $classificationsref->{$classification}->{'parent'} = $doc_obj->get_parent_section($section);
294 $classificationsref->{$classification}->{'contains'} = "";
295 }
296 $classificationsref->{$classification}->{'contains'} .= ":" unless
297 $classificationsref->{$classification}->{'contains'} eq "";
298 $classificationsref->{$classification}->{'contains'} .= $mapped_section;
299 }
300
301 return (join ":", @$classifications);
302}
303
304sub map_section {
305 my $self = shift (@_);
306 my ($doctype, $section) = @_;
307
308 # classifications should never need to be mapped
309 return $section if $doctype eq "classification";
310
311 return "" unless (defined $section) && ($section ne "");
312
313 $sectionmap = {} unless defined $sectionmap;
314
315
316
317 # get the section into a standard format
318 $section =~ s/^\.+|\.+$//g;
319
320 # return the mapped section if it has been seen before
321 if (defined $sectionmap->{$section}) {
322 return $sectionmap->{$section};
323 }
324
325 # find out the parent section
326 my ($parentsection, $num);
327 if ($section =~ /^(.+)\.(\d+)$/) {
328 $parentsection = $1;
329 $num = $2;
330 } elsif ($section =~ /^(\d+)$/) {
331 $parentsection = "";
332 $num = $1;
333 } else {
334 print STDERR "mgbuildproc:map_section - misformed section $section\n";
335 }
336 $parentsection = "" unless defined $parentsection;
337
338 if ($parentsection eq "") {
339 $num ++ if $num == 1 and defined $sectionmap->{'0'};
340 } else {
341 $num ++ if $num == 1 and defined $sectionmap->{"$parentsection.0"};
342 }
343
344 # find out the mapped parent section
345 my $mappedparentsection = $self->map_section ($doctype, $parentsection);
346
347 # find the next unused child section
348 my $previousnum = $num - 1;
349 my $previoussection = "";
350 if ($parentsection eq "") {
351 $previoussection = $previousnum;
352 $previoussection = 0 if ($section == 1) && (defined $sectionmap->{'0'});
353 } else{
354 $previoussection = "$parentsection.$previousnum";
355 $previoussection = "$parentsection.0" if ($section =~ /\.1$/) && (defined $sectionmap->{"$parentsection.0"});
356 }
357 while (($previousnum > 0) && (!defined $sectionmap->{$previoussection})) {
358 $previousnum--;
359 $previoussection = "$parentsection.$previousnum";
360 $previoussection = $previousnum if $parentsection eq "";
361 }
362
363 # there has been no children under this parent, this section will be number 1
364 if ($previousnum <= 0) {
365 if ($mappedparentsection eq "") {
366 $sectionmap->{$section} = "1";
367 } else {
368 $sectionmap->{$section} = "$mappedparentsection.1";
369 }
370
371 } else {
372 # get the previous mapped number
373 my $previousmapnum = 0;
374 if ($sectionmap->{$previoussection} =~ /(^|\.)?(\d+)$/) {
375 $previousmapnum = $2;
376 }
377
378 # increment it to get this mapped child
379 my $mappednum = $previousmapnum+1;
380
381 if ($mappedparentsection eq "") {
382 $sectionmap->{$section} = $mappednum;
383 } else {
384 $sectionmap->{$section} = "$mappedparentsection.$mappednum";
385 }
386 }
387
388 return $sectionmap->{$section};
389}
390
391sub text {
392 my $self = shift (@_);
393 my ($doc_obj) = @_;
394 my $handle = $self->{'output_handle'};
395 my $indexed_doc = 1;
396
397 # only output this document if it is one to be indexed
398 return if ($doc_obj->get_doc_type() ne "indexed_doc");
399
400 # see if this document belongs to this subcollection
401 foreach $indexexp (@{$self->{'indexexparr'}}) {
402 $indexed_doc = 0;
403 my ($field, $exp, $options) = split /\//, $indexexp;
404 if (defined ($field) && defined ($exp)) {
405 my ($bool) = $field =~ /^(.)/;
406 $field =~ s/^.// if $bool eq '!';
407 if ($field eq "filename") {
408 $field = $doc_obj->get_source_filename();
409 } else {
410 $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
411 }
412 next unless defined $field;
413 if ($bool eq '!') {
414 if ($options =~ /^i$/i) {
415 if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
416 } else {
417 if ($field !~ /$exp/) {$indexed_doc = 1; last;}
418 }
419 } else {
420 if ($options =~ /^i$/i) {
421 if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
422 } else {
423 if ($field =~ /$exp/) {$indexed_doc = 1; last;}
424 }
425 }
426 }
427 }
428
429 # this is another document
430 $self->{'num_docs'} += 1;
431
432 # get the parameters for the output
433 my ($level, $fields) = split (/:/, $self->{'index'});
434 $fields =~ s/\ball\b/Title,Creator,text/;
435 $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
436
437 my $doc_section = 0; # just for this document
438 my $text = "";
439 my $text_extra = "";
440
441 # get the text for this document
442 my $section = $doc_obj->get_top_section();
443 while (defined $section) {
444 # update a few statistics
445 $doc_section++;
446 $self->{'num_sections'} += 1;
447
448 if ($indexed_doc) {
449 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
450 foreach $field (split (/,/, $fields)) {
451 # only deal with this field if it doesn't start with top or
452 # this is the first section
453 my $real_field = $field;
454 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
455 my $new_text = "";
456 if ($real_field eq "text") {
457 $new_text = $doc_obj->get_text ($section);
458 $new_text =~ s/[\cB\cC]//g;
459 $new_text =~ s/(<p\b)/\cC$1/gi;
460
461 } else {
462 $new_text = join ("\cC", @{$doc_obj->get_metadata ($section, $real_field)});
463 }
464
465 $text .= "$new_text\cC";
466 }
467 }
468 }
469
470 if ($level eq "document") { $text_extra .= "\cB"; }
471 else { $text .= "\cB"; }
472
473 $section = $doc_obj->get_next_section($section);
474 }
475
476 print $handle "$text$text_extra";
477}
478
479# converts leading number in classification back
480# to letter it represents
481# i.e 67.2.4 becomes C.2.4
482sub char_classification {
483 my $self = shift (@_);
484 my ($classification) = @_;
485
486 return $classification if $classification eq "";
487 my ($c) = $classification =~ /^\.?(\d+)/;
488 $c = chr($c);
489 $classification =~ s/^\d+/$c/;
490
491 return $classification;
492}
493
494# converts leading letter of a classification into its ascii equivalent
495# i.e C.2.4 becomes 67.2.4
496sub int_classification {
497 my $self = shift (@_);
498 my ($classification) = @_;
499 my $c = ord($classification);
500 $classification =~ s/^./$c/;
501
502 return $classification;
503}
5041;
505
Note: See TracBrowser for help on using the repository browser.