source: trunk/gsdl/perllib/mgbuildproc.pm@ 2450

Last change on this file since 2450 was 2336, checked in by sjboddie, 23 years ago

added a -no_text option to buildcol.pl to allow collections to be built
without storing compressed text (intended for use in collections where
original documents (PDFs or Word docs maybe) are returned instead of the
compressed text)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 12.6 KB
Line 
1###########################################################################
2#
3# mgbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mg to process
28
29
30package mgbuildproc;
31
32use classify;
33use doc;
34use docproc;
35use util;
36
37BEGIN {
38 @ISA = ('docproc');
39}
40
41
42sub new {
43 my ($class, $collection, $source_dir, $build_dir,
44 $verbosity, $outhandle) = @_;
45 my $self = new docproc ();
46
47 # outhandle is where all the debugging info goes
48 # output_handle is where the output of the plugins is piped
49 # to (i.e. mg, gdbm etc.)
50 $outhandle = STDERR unless defined $outhandle;
51
52 $self->{'collection'} = $collection;
53 $self->{'source_dir'} = $source_dir;
54 $self->{'build_dir'} = $build_dir;
55 $self->{'verbosity'} = $verbosity;
56 $self->{'classifiers'} = [];
57 $self->{'mode'} = "text";
58 $self->{'assocdir'} = $build_dir;
59 $self->{'dontgdbm'} = {};
60 $self->{'index'} = "section:text";
61 $self->{'indexexparr'} = [];
62 $self->{'output_handle'} = "STDOUT";
63 $self->{'num_docs'} = 0;
64 $self->{'num_sections'} = 0;
65 $self->{'num_bytes'} = 0;
66 $self->{'num_processed_bytes'} = 0;
67 $self->{'store_text'} = 1;
68 $self->{'outhandle'} = $outhandle;
69
70 $self->{'indexing_text'} = 0;
71
72 return bless $self, $class;
73}
74
75sub reset {
76 my $self = shift (@_);
77
78 $self->{'num_docs'} = 0;
79 $self->{'num_sections'} = 0;
80 $self->{'num_processed_bytes'} = 0;
81 $self->{'num_bytes'} = 0;
82}
83
84sub get_num_docs {
85 my $self = shift (@_);
86
87 return $self->{'num_docs'};
88}
89
90sub get_num_sections {
91 my $self = shift (@_);
92
93 return $self->{'num_sections'};
94}
95
96# num_bytes is the actual number of bytes in the collection
97# this is normally the same as what's processed during text compression
98sub get_num_bytes {
99 my $self = shift (@_);
100
101 return $self->{'num_bytes'};
102}
103
104# num_processed_bytes is the number of bytes actually passed
105# to mg for the current index
106sub get_num_processed_bytes {
107 my $self = shift (@_);
108
109 return $self->{'num_processed_bytes'};
110}
111
112sub set_output_handle {
113 my $self = shift (@_);
114 my ($handle) = @_;
115
116 $self->{'output_handle'} = $handle;
117}
118
119sub set_mode {
120 my $self = shift (@_);
121 my ($mode) = @_;
122
123 $self->{'mode'} = $mode;
124}
125
126sub set_assocdir {
127 my $self = shift (@_);
128 my ($assocdir) = @_;
129
130 $self->{'assocdir'} = $assocdir;
131}
132
133sub set_dontgdbm {
134 my $self = shift (@_);
135 my ($dontgdbm) = @_;
136
137 $self->{'dontgdbm'} = $dontgdbm;
138}
139
140sub set_index {
141 my $self = shift (@_);
142 my ($index, $indexexparr) = @_;
143
144 $self->{'index'} = $index;
145 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
146}
147
148sub get_index {
149 my $self = shift (@_);
150
151 return $self->{'index'};
152}
153
154sub set_classifiers {
155 my $self = shift (@_);
156 my ($classifiers) = @_;
157
158 $self->{'classifiers'} = $classifiers;
159}
160
161sub set_indexing_text {
162 my $self = shift (@_);
163 my ($indexing_text) = @_;
164
165 $self->{'indexing_text'} = $indexing_text;
166}
167
168sub get_indexing_text {
169 my $self = shift (@_);
170
171 return $self->{'indexing_text'};
172}
173
174sub set_store_text {
175 my $self = shift (@_);
176 my ($store_text) = @_;
177
178 $self->{'store_text'} = $store_text;
179}
180
181
182sub process {
183 my $self = shift (@_);
184 my $method = $self->{'mode'};
185
186 $self->$method(@_);
187}
188
189# use 'Paged' if document has no more than 2 levels
190# and each section at second level has a number for
191# Title metadata
192sub get_document_type {
193 my $self = shift (@_);
194 my ($doc_obj) = @_;
195
196 my $thistype = "VList";
197 my $childtype = "VList";
198 my $title;
199 my @tmp = ();
200
201 my $section = $doc_obj->get_top_section ();
202 my $first = 1;
203 while (defined $section) {
204 @tmp = split /\./, $section;
205 if (scalar(@tmp) > 1) {
206 return ($thistype, $childtype);
207 }
208 if (!$first) {
209 $title = $doc_obj->get_metadata_element ($section, "Title");
210 if (!defined $title || $title !~ /^\d+$/) {
211 return ($thistype, $childtype);
212 }
213 }
214 $first = 0;
215 $section = $doc_obj->get_next_section($section);
216 }
217 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
218 $thistype = "Paged";
219 } else {
220 $thistype = "Invisible";
221 }
222 $childtype = "Paged";
223 return ($thistype, $childtype);
224}
225
226sub assoc_files {
227 my $self = shift (@_);
228 my ($doc_obj, $archivedir) = @_;
229 my ($afile);
230
231 foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
232 # if assoc file contains directory structure of
233 # its own use it, otherwise use HASH... directory
234 if ($assoc_file->[1] =~ /[\/\\]/) {
235 $afile = &util::filename_cat($self->{'assocdir'}, $assoc_file->[1]);
236 } else {
237 $afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
238 }
239 &util::hard_link ($assoc_file->[0], $afile);
240 }
241}
242
243sub infodb {
244 my $self = shift (@_);
245 my ($doc_obj, $filename) = @_;
246 my $handle = $self->{'output_handle'};
247# $handle = "main::STDOUT";
248
249 my $doctype = $doc_obj->get_doc_type();
250
251 # only output this document if it is one to be indexed
252 return if ($doctype ne "indexed_doc");
253
254 my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
255 $archivedir = "" unless defined $archivedir;
256 $archivedir =~ s/\\/\//g;
257 $archivedir =~ s/^\/+//;
258 $archivedir =~ s/\/+$//;
259
260 $self->assoc_files ($doc_obj, $archivedir);
261
262 # this is another document
263 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
264
265 # is this a paged or a hierarchical document
266 my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
267
268 my $section = $doc_obj->get_top_section ();
269 my $doc_OID = $doc_obj->get_OID();
270 my $first = 1;
271 my $url = "";
272 while (defined $section) {
273 # update a few statistics
274 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
275 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
276
277 # output the section name
278 if ($section eq "") { print $handle "[$doc_OID]\n"; }
279 else { print $handle "[$doc_OID.$section]\n"; }
280
281 # output the fact that this document is a document (unless doctype
282 # has been set to something else from within a plugin
283 my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
284 if (!defined $dtype || $dtype !~ /\w/) {
285 print $handle "<doctype>doc\n";
286 }
287
288 # output whether this node contains text
289 if ($doc_obj->get_text_length($section) > 0) {
290 print $handle "<hastxt>1\n";
291 } else {
292 print $handle "<hastxt>0\n";
293 }
294
295 # output all the section metadata
296 my $metadata = $doc_obj->get_all_metadata ($section);
297 foreach my $pair (@$metadata) {
298 my ($field, $value) = (@$pair);
299
300 if ($field ne "Identifier" && $field !~ /^gsdl/ &&
301 defined $value && $value ne "") {
302
303 # escape problematic stuff
304 $value =~ s/\\/\\\\/g;
305 $value =~ s/\n/\\n/g;
306 $value =~ s/\r/\\r/g;
307
308 # special case for URL metadata
309 if ($field =~ /^URL$/i) {
310 $url .= "[$value]\n";
311 if ($section eq "") {$url .= "<section>$doc_OID\n";}
312 else {$url .= "<section>$doc_OID.$section\n";}
313 $url .= '-' x 70 . "\n";
314 }
315
316 if (!defined $self->{'dontgdbm'}->{$field}) {
317 print $handle "<$field>$value\n";
318 }
319 }
320 }
321
322 # output archivedir if at top level
323 if ($section eq $doc_obj->get_top_section()) {
324 print $handle "<archivedir>$archivedir\n";
325 }
326
327 # output document display type
328 if ($first) {
329 print $handle "<thistype>$thistype\n";
330 }
331
332 # output a list of children
333 my $children = $doc_obj->get_children ($section);
334 if (scalar(@$children) > 0) {
335 print $handle "<childtype>$childtype\n";
336 print $handle "<contains>";
337 my $firstchild = 1;
338 foreach my $child (@$children) {
339 print $handle ";" unless $firstchild;
340 $firstchild = 0;
341 if ($child =~ /^.*?\.(\d+)$/) {
342 print $handle "\".$1";
343 } else {
344 print $handle "\".$child";
345 }
346# if ($child eq "") { print $handle "$doc_OID"; }
347# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
348# else { print $handle "$doc_OID.$section.$child"; }
349 }
350 print $handle "\n";
351 }
352
353 # output the matching document number
354 print $handle "<docnum>$self->{'num_sections'}\n";
355
356 print $handle '-' x 70, "\n";
357
358
359 # output a database entry for the document number
360 print $handle "[$self->{'num_sections'}]\n";
361 if ($section eq "") { print $handle "<section>$doc_OID\n"; }
362 else { print $handle "<section>$doc_OID.$section\n"; }
363 print $handle '-' x 70, "\n";
364
365 # output entry for url
366 if ($url ne "") {
367 print $handle $url;
368 }
369
370 $first = 0;
371 $section = $doc_obj->get_next_section($section);
372 }
373
374 # classify this document
375 &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
376
377}
378
379sub find_paragraphs {
380 $_[1] =~ s/(<p\b)/\cC$1/gi;
381}
382
383sub filter_text {
384 # $self->filter_text ($field, $new_text);
385 # don't want to do anything for this version, however,
386 # in a particular collection you might want to override
387 # this method to post-process certain fields depending on
388 # the field, or whether we are outputting it for indexing
389}
390
391sub text {
392 my $self = shift (@_);
393 my ($doc_obj) = @_;
394 my $handle = $self->{'output_handle'};
395 my $indexed_doc = 1;
396
397 # only output this document if it is one to be indexed
398 return if ($doc_obj->get_doc_type() ne "indexed_doc");
399
400 # see if this document belongs to this subcollection
401 foreach my $indexexp (@{$self->{'indexexparr'}}) {
402 $indexed_doc = 0;
403 my ($field, $exp, $options) = split /\//, $indexexp;
404 if (defined ($field) && defined ($exp)) {
405 my ($bool) = $field =~ /^(.)/;
406 $field =~ s/^.// if $bool eq '!';
407 if ($field =~ /^filename$/i) {
408 $field = $doc_obj->get_source_filename();
409 } else {
410 $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
411 }
412 next unless defined $field;
413 if ($bool eq '!') {
414 if ($options =~ /^i$/i) {
415 if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
416 } else {
417 if ($field !~ /$exp/) {$indexed_doc = 1; last;}
418 }
419 } else {
420 if ($options =~ /^i$/i) {
421 if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
422 } else {
423 if ($field =~ /$exp/) {$indexed_doc = 1; last;}
424 }
425 }
426 }
427 }
428
429 # this is another document
430 $self->{'num_docs'} += 1;
431
432 # get the parameters for the output
433 my ($level, $fields) = split (/:/, $self->{'index'});
434 $fields =~ s/\ball\b/Title,Creator,text/;
435 $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
436
437 my $doc_section = 0; # just for this document
438 my $text = "";
439 my $text_extra = "";
440
441 # get the text for this document
442 my $section = $doc_obj->get_top_section();
443 while (defined $section) {
444 # update a few statistics
445 $doc_section++;
446 $self->{'num_sections'} += 1;
447
448 if ($indexed_doc) {
449 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
450 foreach my $field (split (/,/, $fields)) {
451 # only deal with this field if it doesn't start with top or
452 # this is the first section
453 my $real_field = $field;
454 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
455 my $new_text = "";
456 if ($real_field eq "text") {
457 $new_text = $doc_obj->get_text ($section) if $self->{'store_text'};
458 $self->{'num_processed_bytes'} += length ($new_text);
459 $new_text =~ s/[\cB\cC]//g;
460 $self->find_paragraphs($new_text);
461
462 } else {
463 my $first = 1;
464 foreach $meta (@{$doc_obj->get_metadata ($section, $real_field)}) {
465 $meta =~ s/[\cB\cC]//g;
466 $self->{'num_processed_bytes'} += length ($meta);
467 $new_text .= "\cC" unless $first;
468 $new_text .= $meta if $self->{'store_text'};
469 $first = 0;
470 }
471 }
472
473 # filter the text
474 $self->filter_text ($field, $new_text);
475
476 $text .= "$new_text\cC";
477 }
478 }
479 }
480
481 if ($level eq "document") { $text_extra .= "\cB"; }
482 else { $text .= "\cB"; }
483
484 $section = $doc_obj->get_next_section($section);
485 }
486
487 print $handle "$text$text_extra";
488}
489
4901;
491
Note: See TracBrowser for help on using the repository browser.