source: trunk/gsdl/perllib/mgbuildproc.pm@ 810

Last change on this file since 810 was 810, checked in by sjboddie, 24 years ago

plugins now take options, files are associated at build time as
well as import time

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.0 KB
Line 
1###########################################################################
2#
3# mgbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mg to process
28
29
30package mgbuildproc;
31
32use classify;
33use doc;
34use docproc;
35use util;
36
37
38BEGIN {
39 @ISA = ('docproc');
40}
41
42
43sub new {
44 my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;
45 my $self = new docproc ();
46
47 $self->{'collection'} = $collection;
48 $self->{'source_dir'} = $source_dir;
49 $self->{'build_dir'} = $build_dir;
50 $self->{'verbosity'} = $verbosity;
51 $self->{'classifiers'} = [];
52 $self->{'mode'} = "text";
53 $self->{'assocdir'} = $build_dir;
54 $self->{'dontgdbm'} = {};
55 $self->{'index'} = "section:text";
56 $self->{'indexexparr'} = [];
57 $self->{'output_handle'} = "STDOUT";
58 $self->{'num_docs'} = 0;
59 $self->{'num_sections'} = 0;
60 $self->{'num_bytes'} = 0;
61
62 $self->{'indexing_text'} = 0;
63
64 return bless $self, $class;
65}
66
67sub reset {
68 my $self = shift (@_);
69
70 $self->{'num_docs'} = 0;
71 $self->{'num_sections'} = 0;
72 $self->{'num_bytes'} = 0;
73}
74
75sub get_num_docs {
76 my $self = shift (@_);
77
78 return $self->{'num_docs'};
79}
80
81sub get_num_sections {
82 my $self = shift (@_);
83
84 return $self->{'num_sections'};
85}
86
87sub get_num_bytes {
88 my $self = shift (@_);
89
90 return $self->{'num_bytes'};
91}
92
93sub set_output_handle {
94 my $self = shift (@_);
95 my ($handle) = @_;
96
97 $self->{'output_handle'} = $handle;
98}
99
100sub set_mode {
101 my $self = shift (@_);
102 my ($mode) = @_;
103
104 $self->{'mode'} = $mode;
105}
106
107sub set_assocdir {
108 my $self = shift (@_);
109 my ($assocdir) = @_;
110
111 $self->{'assocdir'} = $assocdir;
112}
113
114sub set_dontgdbm {
115 my $self = shift (@_);
116 my ($dontgdbm) = @_;
117
118 $self->{'dontgdbm'} = $dontgdbm;
119}
120
121sub set_index {
122 my $self = shift (@_);
123 my ($index, $indexexparr) = @_;
124
125 $self->{'index'} = $index;
126 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
127}
128
129sub set_classifiers {
130 my $self = shift (@_);
131 my ($classifiers) = @_;
132
133 $self->{'classifiers'} = $classifiers;
134}
135
136sub set_indexing_text {
137 my $self = shift (@_);
138 my ($indexing_text) = @_;
139
140 $self->{'indexing_text'} = $indexing_text;
141}
142
143sub process {
144 my $self = shift (@_);
145 my $method = $self->{'mode'};
146
147 $self->$method(@_);
148}
149
150# use 'Paged' if document has no more than 2 levels
151# and each section at second level has a number for
152# Title metadata
153sub get_document_type {
154 my $self = shift (@_);
155 my ($doc_obj) = @_;
156
157 my $thistype = "VList";
158 my $childtype = "VList";
159 my $title;
160 my @tmp = ();
161
162 my $section = $doc_obj->get_top_section ();
163 my $first = 1;
164 while (defined $section) {
165 @tmp = split /\./, $section;
166 if (scalar(@tmp) > 1) {
167 return ($thistype, $childtype);
168 }
169 if (!$first) {
170 $title = $doc_obj->get_metadata_element ($section, "Title");
171 if (!defined $title || $title !~ /^\d+$/) {
172 return ($thistype, $childtype);
173 }
174 }
175 $first = 0;
176 $section = $doc_obj->get_next_section($section);
177 }
178 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
179 $thistype = "Paged";
180 } else {
181 $thistype = "Invisible";
182 }
183 $childtype = "Paged";
184 return ($thistype, $childtype);
185}
186
187sub assoc_files {
188 my $self = shift (@_);
189 my ($doc_obj) = @_;
190
191 foreach $assoc_file (@{$doc_obj->get_assoc_files()}) {
192 $afile = &util::filename_cat($self->{'assocdir'}, $assoc_file->[1]);
193 &util::hard_link ($assoc_file->[0], $afile);
194 }
195}
196
197sub infodb {
198 my $self = shift (@_);
199 my ($doc_obj, $filename) = @_;
200 my $handle = $self->{'output_handle'};
201# $handle = "main::STDOUT";
202
203 $self->assoc_files ($doc_obj);
204
205 my $doctype = $doc_obj->get_doc_type();
206
207 # only output this document if it is one to be indexed
208 return if ($doctype ne "indexed_doc");
209
210 # this is another document
211 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
212
213 # is this a paged or a hierarchical document
214 my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
215
216 my $section = $doc_obj->get_top_section ();
217 my $doc_OID = $doc_obj->get_OID();
218 my $first = 1;
219 my $url = "";
220 while (defined $section) {
221 # update a few statistics
222 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
223 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
224
225 # output the section name
226 if ($section eq "") { print $handle "[$doc_OID]\n"; }
227 else { print $handle "[$doc_OID.$section]\n"; }
228
229 # output the fact that this document is a document
230 print $handle "<doctype>doc\n";
231
232 # output whether this node contains text
233 if ($doc_obj->get_text_length($section) > 0) {
234 print $handle "<hastxt>1\n";
235 } else {
236 print $handle "<hastxt>0\n";
237 }
238
239 # output all the section metadata
240 my $metadata = $doc_obj->get_all_metadata ($section);
241 foreach $pair (@$metadata) {
242 my ($field, $value) = (@$pair);
243
244 if ($field ne "Identifier" && $field !~ /^gsdl/ &&
245 defined $value && $value ne "") {
246
247 # escape problematic stuff
248 $value =~ s/\\/\\\\/g;
249 $value =~ s/\n/\\n/g;
250 $value =~ s/\r/\\r/g;
251
252 # special case for URL metadata
253 if ($field =~ /^URL$/i) {
254 $url .= "[$value]\n";
255 if ($section eq "") {$url .= "<section>$doc_OID\n";}
256 else {$url .= "<section>$doc_OID.$section\n";}
257 $url .= '-' x 70 . "\n";
258 }
259
260 if (!defined $self->{'dontgdbm'}->{$field}) {
261 print $handle "<$field>$value\n";
262 }
263 }
264 }
265
266 # output archivedir if at top level
267 if ($section eq $doc_obj->get_top_section()) {
268 my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
269 $archivedir = "" unless defined $archivedir;
270 $archivedir =~ s/\\/\//g;
271 $archivedir =~ s/^\/+//;
272 $archivedir =~ s/\/+$//;
273 print $handle "<archivedir>$archivedir\n";
274 }
275
276 # output document display type
277 if ($first) {
278 print $handle "<thistype>$thistype\n";
279 }
280
281 # output a list of children
282 my $children = $doc_obj->get_children ($section);
283 if (scalar(@$children) > 0) {
284 print $handle "<childtype>$childtype\n";
285 print $handle "<contains>";
286 my $firstchild = 1;
287 foreach $child (@$children) {
288 print $handle ";" unless $firstchild;
289 $firstchild = 0;
290 if ($child =~ /^.*?\.(\d+)$/) {
291 print $handle "\".$1";
292 } else {
293 print $handle "\".$child";
294 }
295# if ($child eq "") { print $handle "$doc_OID"; }
296# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
297# else { print $handle "$doc_OID.$section.$child"; }
298 }
299 print $handle "\n";
300 }
301
302 # output the matching document number
303 print $handle "<docnum>$self->{'num_sections'}\n";
304
305 print $handle '-' x 70, "\n";
306
307
308 # output a database entry for the document number
309 print $handle "[$self->{'num_sections'}]\n";
310 if ($section eq "") { print $handle "<section>$doc_OID\n"; }
311 else { print $handle "<section>$doc_OID.$section\n"; }
312 print $handle '-' x 70, "\n";
313
314 # output entry for url
315 if ($url ne "") {
316 print $handle $url;
317 }
318
319 $first = 0;
320 $section = $doc_obj->get_next_section($section);
321 }
322
323 # classify this document
324 &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
325
326}
327
328sub find_paragraphs {
329 $_[1] =~ s/(<p\b)/\cC$1/gi;
330}
331
332sub filter_text {
333 # $self->filter_text ($field, $new_text);
334 # don't want to do anything for this version, however,
335 # in a particular collection you might want to override
336 # this method to post-process certain fields depending on
337 # the field, or whether we are outputting it for indexing
338}
339
340sub text {
341 my $self = shift (@_);
342 my ($doc_obj) = @_;
343 my $handle = $self->{'output_handle'};
344 my $indexed_doc = 1;
345
346 # only output this document if it is one to be indexed
347 return if ($doc_obj->get_doc_type() ne "indexed_doc");
348
349 # see if this document belongs to this subcollection
350 foreach $indexexp (@{$self->{'indexexparr'}}) {
351 $indexed_doc = 0;
352 my ($field, $exp, $options) = split /\//, $indexexp;
353 if (defined ($field) && defined ($exp)) {
354 my ($bool) = $field =~ /^(.)/;
355 $field =~ s/^.// if $bool eq '!';
356 if ($field =~ /^filename$/i) {
357 $field = $doc_obj->get_source_filename();
358 } else {
359 $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
360 }
361 next unless defined $field;
362 if ($bool eq '!') {
363 if ($options =~ /^i$/i) {
364 if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
365 } else {
366 if ($field !~ /$exp/) {$indexed_doc = 1; last;}
367 }
368 } else {
369 if ($options =~ /^i$/i) {
370 if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
371 } else {
372 if ($field =~ /$exp/) {$indexed_doc = 1; last;}
373 }
374 }
375 }
376 }
377
378 # this is another document
379 $self->{'num_docs'} += 1;
380
381 # get the parameters for the output
382 my ($level, $fields) = split (/:/, $self->{'index'});
383 $fields =~ s/\ball\b/Title,Creator,text/;
384 $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
385
386 my $doc_section = 0; # just for this document
387 my $text = "";
388 my $text_extra = "";
389
390 # get the text for this document
391 my $section = $doc_obj->get_top_section();
392 while (defined $section) {
393 # update a few statistics
394 $doc_section++;
395 $self->{'num_sections'} += 1;
396
397 if ($indexed_doc) {
398 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
399 foreach $field (split (/,/, $fields)) {
400 # only deal with this field if it doesn't start with top or
401 # this is the first section
402 my $real_field = $field;
403 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
404 my $new_text = "";
405 if ($real_field eq "text") {
406 $new_text = $doc_obj->get_text ($section);
407 $new_text =~ s/[\cB\cC]//g;
408 $self->find_paragraphs($new_text);
409
410 } else {
411 $new_text = join ("\cC", @{$doc_obj->get_metadata ($section, $real_field)});
412 }
413
414 # filter the text
415 $self->filter_text ($field, $new_text);
416
417 if ($self->{'indexing_text'} &&
418 $new_text =~ /[\(\)\{\}]/) {
419 }
420
421 $text .= "$new_text\cC";
422 }
423 }
424 }
425
426 if ($level eq "document") { $text_extra .= "\cB"; }
427 else { $text .= "\cB"; }
428
429 $section = $doc_obj->get_next_section($section);
430 }
431
432 print $handle "$text$text_extra";
433}
434
4351;
436
Note: See TracBrowser for help on using the repository browser.