source: trunk/gsdl/perllib/mgbuildproc.pm@ 4743

Last change on this file since 4743 was 4743, checked in by sjboddie, 21 years ago

Build code changes allowing mg collections containing no indexes to
be built (it in fact builds a small "dummy:text" index if none are
specified since we need an index for the runtime code to be able to
retrieve the compressed text).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 13.6 KB
Line 
1###########################################################################
2#
3# mgbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mg to process
28
29
30package mgbuildproc;
31
32eval {require bytes};
33
34use classify;
35use doc;
36use docproc;
37use util;
38
39BEGIN {
40 @ISA = ('docproc');
41}
42
43
44sub new {
45 my ($class, $collection, $source_dir, $build_dir,
46 $verbosity, $outhandle) = @_;
47 my $self = new docproc ();
48
49 # outhandle is where all the debugging info goes
50 # output_handle is where the output of the plugins is piped
51 # to (i.e. mg, gdbm etc.)
52 $outhandle = STDERR unless defined $outhandle;
53
54 $self->{'collection'} = $collection;
55 $self->{'source_dir'} = $source_dir;
56 $self->{'build_dir'} = $build_dir;
57 $self->{'verbosity'} = $verbosity;
58 $self->{'classifiers'} = [];
59 $self->{'mode'} = "text";
60 $self->{'assocdir'} = $build_dir;
61 $self->{'dontgdbm'} = {};
62 $self->{'index'} = "section:text";
63 $self->{'indexexparr'} = [];
64 $self->{'output_handle'} = "STDOUT";
65 $self->{'num_docs'} = 0;
66 $self->{'num_sections'} = 0;
67 $self->{'num_bytes'} = 0;
68 $self->{'num_processed_bytes'} = 0;
69 $self->{'store_text'} = 1;
70 $self->{'outhandle'} = $outhandle;
71
72 #used by browse interface
73 $self->{'doclist'} = [];
74
75 $self->{'indexing_text'} = 0;
76
77 return bless $self, $class;
78}
79
80sub reset {
81 my $self = shift (@_);
82
83 $self->{'num_docs'} = 0;
84 $self->{'num_sections'} = 0;
85 $self->{'num_processed_bytes'} = 0;
86 $self->{'num_bytes'} = 0;
87}
88
89sub get_num_docs {
90 my $self = shift (@_);
91
92 return $self->{'num_docs'};
93}
94
95sub get_num_sections {
96 my $self = shift (@_);
97
98 return $self->{'num_sections'};
99}
100
101# num_bytes is the actual number of bytes in the collection
102# this is normally the same as what's processed during text compression
103sub get_num_bytes {
104 my $self = shift (@_);
105
106 return $self->{'num_bytes'};
107}
108
109# num_processed_bytes is the number of bytes actually passed
110# to mg for the current index
111sub get_num_processed_bytes {
112 my $self = shift (@_);
113
114 return $self->{'num_processed_bytes'};
115}
116
117sub set_output_handle {
118 my $self = shift (@_);
119 my ($handle) = @_;
120
121 $self->{'output_handle'} = $handle;
122}
123
124sub set_mode {
125 my $self = shift (@_);
126 my ($mode) = @_;
127
128 $self->{'mode'} = $mode;
129}
130
131sub set_assocdir {
132 my $self = shift (@_);
133 my ($assocdir) = @_;
134
135 $self->{'assocdir'} = $assocdir;
136}
137
138sub set_dontgdbm {
139 my $self = shift (@_);
140 my ($dontgdbm) = @_;
141
142 $self->{'dontgdbm'} = $dontgdbm;
143}
144
145sub set_index {
146 my $self = shift (@_);
147 my ($index, $indexexparr) = @_;
148
149 $self->{'index'} = $index;
150 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
151}
152
153sub get_index {
154 my $self = shift (@_);
155
156 return $self->{'index'};
157}
158
159sub set_classifiers {
160 my $self = shift (@_);
161 my ($classifiers) = @_;
162
163 $self->{'classifiers'} = $classifiers;
164}
165
166sub set_indexing_text {
167 my $self = shift (@_);
168 my ($indexing_text) = @_;
169
170 $self->{'indexing_text'} = $indexing_text;
171}
172
173sub get_indexing_text {
174 my $self = shift (@_);
175
176 return $self->{'indexing_text'};
177}
178
179sub set_store_text {
180 my $self = shift (@_);
181 my ($store_text) = @_;
182
183 $self->{'store_text'} = $store_text;
184}
185
186sub get_doc_list {
187 my $self = shift(@_);
188
189 return @{$self->{'doclist'}};
190}
191
192
193sub process {
194 my $self = shift (@_);
195 my $method = $self->{'mode'};
196
197 $self->$method(@_);
198}
199
200# use 'Paged' if document has no more than 2 levels
201# and each section at second level has a number for
202# Title metadata
203sub get_document_type {
204 my $self = shift (@_);
205 my ($doc_obj) = @_;
206
207 my $thistype = "VList";
208 my $childtype = "VList";
209 my $title;
210 my @tmp = ();
211
212 my $section = $doc_obj->get_top_section ();
213 my $first = 1;
214 while (defined $section) {
215 @tmp = split /\./, $section;
216 if (scalar(@tmp) > 1) {
217 return ($thistype, $childtype);
218 }
219 if (!$first) {
220 $title = $doc_obj->get_metadata_element ($section, "Title");
221 if (!defined $title || $title !~ /^\d+$/) {
222 return ($thistype, $childtype);
223 }
224 }
225 $first = 0;
226 $section = $doc_obj->get_next_section($section);
227 }
228 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
229 $thistype = "Paged";
230 } else {
231 $thistype = "Invisible";
232 }
233 $childtype = "Paged";
234 return ($thistype, $childtype);
235}
236
237sub assoc_files {
238 my $self = shift (@_);
239 my ($doc_obj, $archivedir) = @_;
240 my ($afile);
241
242 foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
243 # if assoc file starts with a slash, we put it relative to the assoc
244 # dir, otherwise it is relative to the HASH... directory
245 if ($assoc_file->[1] =~ m@^[/\\]@) {
246 $afile = &util::filename_cat($self->{'assocdir'},$assoc_file->[1]);
247 } else {
248 $afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
249 }
250 &util::hard_link ($assoc_file->[0], $afile);
251 }
252}
253
254sub infodb {
255 my $self = shift (@_);
256 my ($doc_obj, $filename) = @_;
257 my $handle = $self->{'output_handle'};
258# $handle = "main::STDOUT";
259
260 my $doctype = $doc_obj->get_doc_type();
261
262 # only output this document if it is one to be indexed
263 return if ($doctype ne "indexed_doc");
264
265 my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
266 $archivedir = "" unless defined $archivedir;
267 $archivedir =~ s/\\/\//g;
268 $archivedir =~ s/^\/+//;
269 $archivedir =~ s/\/+$//;
270
271 $self->assoc_files ($doc_obj, $archivedir);
272
273 # this is another document
274 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
275
276 # is this a paged or a hierarchical document
277 my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
278
279 my $section = $doc_obj->get_top_section ();
280 my $doc_OID = $doc_obj->get_OID();
281 my $first = 1;
282 my $url = "";
283 while (defined $section) {
284 # update a few statistics
285 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
286 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
287
288 # output the section name
289 if ($section eq "") { print $handle "[$doc_OID]\n"; }
290 else { print $handle "[$doc_OID.$section]\n"; }
291
292 # output the fact that this document is a document (unless doctype
293 # has been set to something else from within a plugin
294 my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
295 if (!defined $dtype || $dtype !~ /\w/) {
296 print $handle "<doctype>doc\n";
297 }
298
299 # output whether this node contains text
300 if ($doc_obj->get_text_length($section) > 0) {
301 print $handle "<hastxt>1\n";
302 } else {
303 print $handle "<hastxt>0\n";
304 }
305
306 # output all the section metadata
307 my $metadata = $doc_obj->get_all_metadata ($section);
308 foreach my $pair (@$metadata) {
309 my ($field, $value) = (@$pair);
310
311 if ($field ne "Identifier" && $field !~ /^gsdl/ &&
312 defined $value && $value ne "") {
313
314 # escape problematic stuff
315 $value =~ s/\\/\\\\/g;
316 $value =~ s/\n/\\n/g;
317 $value =~ s/\r/\\r/g;
318 if ($value =~ /-{70,}/) {
319 # if value contains 70 or more hyphens in a row we need
320 # to escape them to prevent txt2db from treating them
321 # as a separator
322 $value =~ s/-/&\#045;/gi;
323 }
324
325 # special case for URL metadata
326 if ($field =~ /^URL$/i) {
327 $url .= "[$value]\n";
328 if ($section eq "") {$url .= "<section>$doc_OID\n";}
329 else {$url .= "<section>$doc_OID.$section\n";}
330 $url .= '-' x 70 . "\n";
331 }
332
333 if (!defined $self->{'dontgdbm'}->{$field}) {
334 print $handle "<$field>$value\n";
335 }
336 }
337 }
338
339 # output archivedir if at top level
340 if ($section eq $doc_obj->get_top_section()) {
341 print $handle "<archivedir>$archivedir\n";
342 }
343
344 # output document display type
345 if ($first) {
346 print $handle "<thistype>$thistype\n";
347 }
348
349 # output a list of children
350 my $children = $doc_obj->get_children ($section);
351 if (scalar(@$children) > 0) {
352 print $handle "<childtype>$childtype\n";
353 print $handle "<contains>";
354 my $firstchild = 1;
355 foreach my $child (@$children) {
356 print $handle ";" unless $firstchild;
357 $firstchild = 0;
358 if ($child =~ /^.*?\.(\d+)$/) {
359 print $handle "\".$1";
360 } else {
361 print $handle "\".$child";
362 }
363# if ($child eq "") { print $handle "$doc_OID"; }
364# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
365# else { print $handle "$doc_OID.$section.$child"; }
366 }
367 print $handle "\n";
368 }
369
370 # output the matching document number
371 print $handle "<docnum>$self->{'num_sections'}\n";
372
373 print $handle '-' x 70, "\n";
374
375
376 # output a database entry for the document number
377 print $handle "[$self->{'num_sections'}]\n";
378 if ($section eq "") { print $handle "<section>$doc_OID\n"; }
379 else { print $handle "<section>$doc_OID.$section\n"; }
380 print $handle '-' x 70, "\n";
381
382 # output entry for url
383 if ($url ne "") {
384 print $handle $url;
385 }
386
387 $first = 0;
388 $section = $doc_obj->get_next_section($section);
389 }
390
391 #add this document to the browse structure
392 push(@{$self->{'doclist'}},$doc_obj->get_OID())
393 unless ($doctype eq "classification");
394
395 # classify this document
396 &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
397
398}
399
400sub find_paragraphs {
401 $_[1] =~ s/(<p\b)/\cC$1/gi;
402}
403
404sub filter_text {
405 # $self->filter_text ($field, $new_text);
406 # don't want to do anything for this version, however,
407 # in a particular collection you might want to override
408 # this method to post-process certain fields depending on
409 # the field, or whether we are outputting it for indexing
410}
411
412sub text {
413 my $self = shift (@_);
414 my ($doc_obj) = @_;
415 my $handle = $self->{'output_handle'};
416 my $indexed_doc = 1;
417
418 # only output this document if it is one to be indexed
419 return if ($doc_obj->get_doc_type() ne "indexed_doc");
420
421 # see if this document belongs to this subcollection
422 foreach my $indexexp (@{$self->{'indexexparr'}}) {
423 $indexed_doc = 0;
424 my ($field, $exp, $options) = split /\//, $indexexp;
425 if (defined ($field) && defined ($exp)) {
426 my ($bool) = $field =~ /^(.)/;
427 $field =~ s/^.// if $bool eq '!';
428 if ($field =~ /^filename$/i) {
429 $field = $doc_obj->get_source_filename();
430 } else {
431 $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
432 }
433 next unless defined $field;
434 if ($bool eq '!') {
435 if ($options =~ /^i$/i) {
436 if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
437 } else {
438 if ($field !~ /$exp/) {$indexed_doc = 1; last;}
439 }
440 } else {
441 if ($options =~ /^i$/i) {
442 if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
443 } else {
444 if ($field =~ /$exp/) {$indexed_doc = 1; last;}
445 }
446 }
447 }
448 }
449
450 # this is another document
451 $self->{'num_docs'} += 1;
452
453 # get the parameters for the output
454 my ($level, $fields) = split (/:/, $self->{'index'});
455 $fields =~ s/\ball\b/Title,Creator,text/;
456 $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
457
458 my $doc_section = 0; # just for this document
459 my $text = "";
460 my $text_extra = "";
461
462 # get the text for this document
463 my $section = $doc_obj->get_top_section();
464 while (defined $section) {
465 # update a few statistics
466 $doc_section++;
467 $self->{'num_sections'} += 1;
468
469 if ($indexed_doc) {
470 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
471 foreach my $field (split (/,/, $fields)) {
472 # only deal with this field if it doesn't start with top or
473 # this is the first section
474 my $real_field = $field;
475 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
476 my $new_text = "";
477 if ($level eq "dummy") {
478 # a dummy index is a special case used when no
479 # indexes are specified (since there must always be
480 # at least one index or we can't retrieve the
481 # compressed text) - we add a small amount of text
482 # to these dummy indexes which will never be seen
483 # but will overcome mg's problems with building
484 # empty indexes
485 $new_text = "this is dummy text to stop mg barfing";
486 $self->{'num_processed_bytes'} += length ($new_text);
487
488 } elsif ($real_field eq "text") {
489 $new_text = $doc_obj->get_text ($section) if $self->{'store_text'};
490 $self->{'num_processed_bytes'} += length ($new_text);
491 $new_text =~ s/[\cB\cC]//g;
492 $self->find_paragraphs($new_text);
493
494 } else {
495 my $first = 1;
496 foreach $meta (@{$doc_obj->get_metadata ($section, $real_field)}) {
497 $meta =~ s/[\cB\cC]//g;
498 $self->{'num_processed_bytes'} += length ($meta);
499 $new_text .= "\cC" unless $first;
500 $new_text .= $meta if $self->{'store_text'};
501 $first = 0;
502 }
503 }
504
505 # filter the text
506 $self->filter_text ($field, $new_text);
507
508 $text .= "$new_text\cC";
509 }
510 }
511 }
512
513 if ($level eq "document") { $text_extra .= "\cB"; }
514 else { $text .= "\cB"; }
515
516 $section = $doc_obj->get_next_section($section);
517 }
518
519 print $handle "$text$text_extra";
520}
521
5221;
523
Note: See TracBrowser for help on using the repository browser.