source: trunk/gsdl/perllib/mgbuildproc.pm@ 7909

Last change on this file since 7909 was 6767, checked in by kjdon, 20 years ago

small change in the use of gsdlthistype to determine document type (paged vs Vlist)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 14.0 KB
Line 
1###########################################################################
2#
3# mgbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mg to process
28
29
30package mgbuildproc;
31
32eval {require bytes};
33
34use classify;
35use doc;
36use docproc;
37use util;
38
39BEGIN {
40 @ISA = ('docproc');
41}
42
43
44sub new {
45 my ($class, $collection, $source_dir, $build_dir,
46 $verbosity, $outhandle) = @_;
47 my $self = new docproc ();
48
49 # outhandle is where all the debugging info goes
50 # output_handle is where the output of the plugins is piped
51 # to (i.e. mg, gdbm etc.)
52 $outhandle = STDERR unless defined $outhandle;
53
54 $self->{'collection'} = $collection;
55 $self->{'source_dir'} = $source_dir;
56 $self->{'build_dir'} = $build_dir;
57 $self->{'verbosity'} = $verbosity;
58 $self->{'classifiers'} = [];
59 $self->{'mode'} = "text";
60 $self->{'assocdir'} = $build_dir;
61 $self->{'dontgdbm'} = {};
62 $self->{'index'} = "section:text";
63 $self->{'indexexparr'} = [];
64 $self->{'output_handle'} = "STDOUT";
65 $self->{'num_docs'} = 0;
66 $self->{'num_sections'} = 0;
67 $self->{'num_bytes'} = 0;
68 $self->{'num_processed_bytes'} = 0;
69 $self->{'store_text'} = 1;
70 $self->{'outhandle'} = $outhandle;
71
72 #used by browse interface
73 $self->{'doclist'} = [];
74
75 $self->{'indexing_text'} = 0;
76
77 return bless $self, $class;
78}
79
80sub reset {
81 my $self = shift (@_);
82
83 $self->{'num_docs'} = 0;
84 $self->{'num_sections'} = 0;
85 $self->{'num_processed_bytes'} = 0;
86 $self->{'num_bytes'} = 0;
87}
88
89sub get_num_docs {
90 my $self = shift (@_);
91
92 return $self->{'num_docs'};
93}
94
95sub get_num_sections {
96 my $self = shift (@_);
97
98 return $self->{'num_sections'};
99}
100
101# num_bytes is the actual number of bytes in the collection
102# this is normally the same as what's processed during text compression
103sub get_num_bytes {
104 my $self = shift (@_);
105
106 return $self->{'num_bytes'};
107}
108
109# num_processed_bytes is the number of bytes actually passed
110# to mg for the current index
111sub get_num_processed_bytes {
112 my $self = shift (@_);
113
114 return $self->{'num_processed_bytes'};
115}
116
117sub set_output_handle {
118 my $self = shift (@_);
119 my ($handle) = @_;
120
121 $self->{'output_handle'} = $handle;
122}
123
124sub set_mode {
125 my $self = shift (@_);
126 my ($mode) = @_;
127
128 $self->{'mode'} = $mode;
129}
130
131sub set_assocdir {
132 my $self = shift (@_);
133 my ($assocdir) = @_;
134
135 $self->{'assocdir'} = $assocdir;
136}
137
138sub set_dontgdbm {
139 my $self = shift (@_);
140 my ($dontgdbm) = @_;
141
142 $self->{'dontgdbm'} = $dontgdbm;
143}
144
145sub set_index {
146 my $self = shift (@_);
147 my ($index, $indexexparr) = @_;
148
149 $self->{'index'} = $index;
150 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
151}
152
153sub get_index {
154 my $self = shift (@_);
155
156 return $self->{'index'};
157}
158
159sub set_classifiers {
160 my $self = shift (@_);
161 my ($classifiers) = @_;
162
163 $self->{'classifiers'} = $classifiers;
164}
165
166sub set_indexing_text {
167 my $self = shift (@_);
168 my ($indexing_text) = @_;
169
170 $self->{'indexing_text'} = $indexing_text;
171}
172
173sub get_indexing_text {
174 my $self = shift (@_);
175
176 return $self->{'indexing_text'};
177}
178
179sub set_store_text {
180 my $self = shift (@_);
181 my ($store_text) = @_;
182
183 $self->{'store_text'} = $store_text;
184}
185
186sub get_doc_list {
187 my $self = shift(@_);
188
189 return @{$self->{'doclist'}};
190}
191
192
193sub process {
194 my $self = shift (@_);
195 my $method = $self->{'mode'};
196
197 $self->$method(@_);
198}
199
200# use 'Paged' if document has no more than 2 levels
201# and each section at second level has a number for
202# Title metadata
203#also use Paged if gsdlthistype metadata is set to Paged
204sub get_document_type {
205 my $self = shift (@_);
206 my ($doc_obj) = @_;
207
208 my $thistype = "VList";
209 my $childtype = "VList";
210 my $title;
211 my @tmp = ();
212
213 my $section = $doc_obj->get_top_section ();
214
215 my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
216 if (defined $gsdlthistype) {
217 if ($gsdlthistype eq "Paged") {
218 $thistype = "Paged";
219 $childtype = "Paged";
220 return ($thistype, $childtype);
221 } elsif ($gsdlthistype eq "Hierarchy") {
222 return ($thistype, $childtype); # use VList, VList
223 }
224 }
225 my $first = 1;
226 while (defined $section) {
227 @tmp = split /\./, $section;
228 if (scalar(@tmp) > 1) {
229 return ($thistype, $childtype);
230 }
231 if (!$first) {
232 $title = $doc_obj->get_metadata_element ($section, "Title");
233 if (!defined $title || $title !~ /^\d+$/) {
234 return ($thistype, $childtype);
235 }
236 }
237 $first = 0;
238 $section = $doc_obj->get_next_section($section);
239 }
240 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
241 $thistype = "Paged";
242 } else {
243 $thistype = "Invisible";
244 }
245 $childtype = "Paged";
246 return ($thistype, $childtype);
247}
248
249sub assoc_files {
250 my $self = shift (@_);
251 my ($doc_obj, $archivedir) = @_;
252 my ($afile);
253
254 foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
255 # if assoc file starts with a slash, we put it relative to the assoc
256 # dir, otherwise it is relative to the HASH... directory
257 if ($assoc_file->[1] =~ m@^[/\\]@) {
258 $afile = &util::filename_cat($self->{'assocdir'},$assoc_file->[1]);
259 } else {
260 $afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
261 }
262 &util::hard_link ($assoc_file->[0], $afile);
263 }
264}
265
266sub infodb {
267 my $self = shift (@_);
268 my ($doc_obj, $filename) = @_;
269 my $handle = $self->{'output_handle'};
270# $handle = "main::STDOUT";
271
272 my $doctype = $doc_obj->get_doc_type();
273
274 # only output this document if it is one to be indexed
275 return if ($doctype ne "indexed_doc");
276
277 my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
278 $archivedir = "" unless defined $archivedir;
279 $archivedir =~ s/\\/\//g;
280 $archivedir =~ s/^\/+//;
281 $archivedir =~ s/\/+$//;
282
283 $self->assoc_files ($doc_obj, $archivedir);
284
285 # this is another document
286 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
287
288 # is this a paged or a hierarchical document
289 my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
290
291 my $section = $doc_obj->get_top_section ();
292 my $doc_OID = $doc_obj->get_OID();
293 my $first = 1;
294 my $url = "";
295 while (defined $section) {
296 # update a few statistics
297 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
298 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
299
300 # output the section name
301 if ($section eq "") { print $handle "[$doc_OID]\n"; }
302 else { print $handle "[$doc_OID.$section]\n"; }
303
304 # output the fact that this document is a document (unless doctype
305 # has been set to something else from within a plugin
306 my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
307 if (!defined $dtype || $dtype !~ /\w/) {
308 print $handle "<doctype>doc\n";
309 }
310
311 # output whether this node contains text
312 if ($doc_obj->get_text_length($section) > 0) {
313 print $handle "<hastxt>1\n";
314 } else {
315 print $handle "<hastxt>0\n";
316 }
317
318 # output all the section metadata
319 my $metadata = $doc_obj->get_all_metadata ($section);
320 foreach my $pair (@$metadata) {
321 my ($field, $value) = (@$pair);
322
323 if ($field ne "Identifier" && $field !~ /^gsdl/ &&
324 defined $value && $value ne "") {
325
326 # escape problematic stuff
327 $value =~ s/\\/\\\\/g;
328 $value =~ s/\n/\\n/g;
329 $value =~ s/\r/\\r/g;
330 if ($value =~ /-{70,}/) {
331 # if value contains 70 or more hyphens in a row we need
332 # to escape them to prevent txt2db from treating them
333 # as a separator
334 $value =~ s/-/&\#045;/gi;
335 }
336
337 # special case for URL metadata
338 if ($field =~ /^URL$/i) {
339 $url .= "[$value]\n";
340 if ($section eq "") {$url .= "<section>$doc_OID\n";}
341 else {$url .= "<section>$doc_OID.$section\n";}
342 $url .= '-' x 70 . "\n";
343 }
344
345 if (!defined $self->{'dontgdbm'}->{$field}) {
346 print $handle "<$field>$value\n";
347 }
348 }
349 }
350
351 # output archivedir if at top level
352 if ($section eq $doc_obj->get_top_section()) {
353 print $handle "<archivedir>$archivedir\n";
354 }
355
356 # output document display type
357 if ($first) {
358 print $handle "<thistype>$thistype\n";
359 }
360
361 # output a list of children
362 my $children = $doc_obj->get_children ($section);
363 if (scalar(@$children) > 0) {
364 print $handle "<childtype>$childtype\n";
365 print $handle "<contains>";
366 my $firstchild = 1;
367 foreach my $child (@$children) {
368 print $handle ";" unless $firstchild;
369 $firstchild = 0;
370 if ($child =~ /^.*?\.(\d+)$/) {
371 print $handle "\".$1";
372 } else {
373 print $handle "\".$child";
374 }
375# if ($child eq "") { print $handle "$doc_OID"; }
376# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
377# else { print $handle "$doc_OID.$section.$child"; }
378 }
379 print $handle "\n";
380 }
381
382 # output the matching document number
383 print $handle "<docnum>$self->{'num_sections'}\n";
384
385 print $handle '-' x 70, "\n";
386
387
388 # output a database entry for the document number
389 print $handle "[$self->{'num_sections'}]\n";
390 if ($section eq "") { print $handle "<section>$doc_OID\n"; }
391 else { print $handle "<section>$doc_OID.$section\n"; }
392 print $handle '-' x 70, "\n";
393
394 # output entry for url
395 if ($url ne "") {
396 print $handle $url;
397 }
398
399 $first = 0;
400 $section = $doc_obj->get_next_section($section);
401 }
402
403 #add this document to the browse structure
404 push(@{$self->{'doclist'}},$doc_obj->get_OID())
405 unless ($doctype eq "classification");
406
407 # classify this document
408 &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
409
410}
411
412sub find_paragraphs {
413 $_[1] =~ s/(<p\b)/\cC$1/gi;
414}
415
416sub filter_text {
417 # $self->filter_text ($field, $new_text);
418 # don't want to do anything for this version, however,
419 # in a particular collection you might want to override
420 # this method to post-process certain fields depending on
421 # the field, or whether we are outputting it for indexing
422}
423
424sub text {
425 my $self = shift (@_);
426 my ($doc_obj) = @_;
427 my $handle = $self->{'output_handle'};
428 my $indexed_doc = 1;
429
430 # only output this document if it is one to be indexed
431 return if ($doc_obj->get_doc_type() ne "indexed_doc");
432
433 # see if this document belongs to this subcollection
434 foreach my $indexexp (@{$self->{'indexexparr'}}) {
435 $indexed_doc = 0;
436 my ($field, $exp, $options) = split /\//, $indexexp;
437 if (defined ($field) && defined ($exp)) {
438 my ($bool) = $field =~ /^(.)/;
439 $field =~ s/^.// if $bool eq '!';
440 if ($field =~ /^filename$/i) {
441 $field = $doc_obj->get_source_filename();
442 } else {
443 $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
444 }
445 next unless defined $field;
446 if ($bool eq '!') {
447 if ($options =~ /^i$/i) {
448 if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
449 } else {
450 if ($field !~ /$exp/) {$indexed_doc = 1; last;}
451 }
452 } else {
453 if ($options =~ /^i$/i) {
454 if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
455 } else {
456 if ($field =~ /$exp/) {$indexed_doc = 1; last;}
457 }
458 }
459 }
460 }
461
462 # this is another document
463 $self->{'num_docs'} += 1;
464
465 # get the parameters for the output
466 my ($level, $fields) = split (/:/, $self->{'index'});
467 $fields =~ s/\ball\b/Title,Creator,text/;
468 $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
469
470 my $doc_section = 0; # just for this document
471 my $text = "";
472 my $text_extra = "";
473
474 # get the text for this document
475 my $section = $doc_obj->get_top_section();
476 while (defined $section) {
477 # update a few statistics
478 $doc_section++;
479 $self->{'num_sections'} += 1;
480
481 if ($indexed_doc) {
482 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
483 foreach my $field (split (/,/, $fields)) {
484 # only deal with this field if it doesn't start with top or
485 # this is the first section
486 my $real_field = $field;
487 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
488 my $new_text = "";
489 if ($level eq "dummy") {
490 # a dummy index is a special case used when no
491 # indexes are specified (since there must always be
492 # at least one index or we can't retrieve the
493 # compressed text) - we add a small amount of text
494 # to these dummy indexes which will never be seen
495 # but will overcome mg's problems with building
496 # empty indexes
497 $new_text = "this is dummy text to stop mg barfing";
498 $self->{'num_processed_bytes'} += length ($new_text);
499
500 } elsif ($real_field eq "text") {
501 $new_text = $doc_obj->get_text ($section) if $self->{'store_text'};
502 $self->{'num_processed_bytes'} += length ($new_text);
503 $new_text =~ s/[\cB\cC]//g;
504 $self->find_paragraphs($new_text);
505
506 } else {
507 my $first = 1;
508 foreach $meta (@{$doc_obj->get_metadata ($section, $real_field)}) {
509 $meta =~ s/[\cB\cC]//g;
510 $self->{'num_processed_bytes'} += length ($meta);
511 $new_text .= "\cC" unless $first;
512 $new_text .= $meta if $self->{'store_text'};
513 $first = 0;
514 }
515 }
516
517 # filter the text
518 $self->filter_text ($field, $new_text);
519
520 $text .= "$new_text\cC";
521 }
522 }
523 }
524
525 if ($level eq "document") { $text_extra .= "\cB"; }
526 else { $text .= "\cB"; }
527
528 $section = $doc_obj->get_next_section($section);
529 }
530
531 print $handle "$text$text_extra";
532}
533
5341;
535
Note: See TracBrowser for help on using the repository browser.