source: trunk/gsdl/perllib/mgbuildproc.pm@ 3665

Last change on this file since 3665 was 3665, checked in by sjboddie, 21 years ago

Prevent occurances of 70 or more hyphens in metadata values from causing txt2db to
wrongly treat them as a separator.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 13.1 KB
Line 
1###########################################################################
2#
3# mgbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mg to process
28
29
30package mgbuildproc;
31
32use classify;
33use doc;
34use docproc;
35use util;
36
37BEGIN {
38 @ISA = ('docproc');
39}
40
41
42sub new {
43 my ($class, $collection, $source_dir, $build_dir,
44 $verbosity, $outhandle) = @_;
45 my $self = new docproc ();
46
47 # outhandle is where all the debugging info goes
48 # output_handle is where the output of the plugins is piped
49 # to (i.e. mg, gdbm etc.)
50 $outhandle = STDERR unless defined $outhandle;
51
52 $self->{'collection'} = $collection;
53 $self->{'source_dir'} = $source_dir;
54 $self->{'build_dir'} = $build_dir;
55 $self->{'verbosity'} = $verbosity;
56 $self->{'classifiers'} = [];
57 $self->{'mode'} = "text";
58 $self->{'assocdir'} = $build_dir;
59 $self->{'dontgdbm'} = {};
60 $self->{'index'} = "section:text";
61 $self->{'indexexparr'} = [];
62 $self->{'output_handle'} = "STDOUT";
63 $self->{'num_docs'} = 0;
64 $self->{'num_sections'} = 0;
65 $self->{'num_bytes'} = 0;
66 $self->{'num_processed_bytes'} = 0;
67 $self->{'store_text'} = 1;
68 $self->{'outhandle'} = $outhandle;
69
70 #used by browse interface
71 $self->{'doclist'} = [];
72
73 $self->{'indexing_text'} = 0;
74
75 return bless $self, $class;
76}
77
78sub reset {
79 my $self = shift (@_);
80
81 $self->{'num_docs'} = 0;
82 $self->{'num_sections'} = 0;
83 $self->{'num_processed_bytes'} = 0;
84 $self->{'num_bytes'} = 0;
85}
86
87sub get_num_docs {
88 my $self = shift (@_);
89
90 return $self->{'num_docs'};
91}
92
93sub get_num_sections {
94 my $self = shift (@_);
95
96 return $self->{'num_sections'};
97}
98
99# num_bytes is the actual number of bytes in the collection
100# this is normally the same as what's processed during text compression
101sub get_num_bytes {
102 my $self = shift (@_);
103
104 return $self->{'num_bytes'};
105}
106
107# num_processed_bytes is the number of bytes actually passed
108# to mg for the current index
109sub get_num_processed_bytes {
110 my $self = shift (@_);
111
112 return $self->{'num_processed_bytes'};
113}
114
115sub set_output_handle {
116 my $self = shift (@_);
117 my ($handle) = @_;
118
119 $self->{'output_handle'} = $handle;
120}
121
122sub set_mode {
123 my $self = shift (@_);
124 my ($mode) = @_;
125
126 $self->{'mode'} = $mode;
127}
128
129sub set_assocdir {
130 my $self = shift (@_);
131 my ($assocdir) = @_;
132
133 $self->{'assocdir'} = $assocdir;
134}
135
136sub set_dontgdbm {
137 my $self = shift (@_);
138 my ($dontgdbm) = @_;
139
140 $self->{'dontgdbm'} = $dontgdbm;
141}
142
143sub set_index {
144 my $self = shift (@_);
145 my ($index, $indexexparr) = @_;
146
147 $self->{'index'} = $index;
148 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
149}
150
151sub get_index {
152 my $self = shift (@_);
153
154 return $self->{'index'};
155}
156
157sub set_classifiers {
158 my $self = shift (@_);
159 my ($classifiers) = @_;
160
161 $self->{'classifiers'} = $classifiers;
162}
163
164sub set_indexing_text {
165 my $self = shift (@_);
166 my ($indexing_text) = @_;
167
168 $self->{'indexing_text'} = $indexing_text;
169}
170
171sub get_indexing_text {
172 my $self = shift (@_);
173
174 return $self->{'indexing_text'};
175}
176
177sub set_store_text {
178 my $self = shift (@_);
179 my ($store_text) = @_;
180
181 $self->{'store_text'} = $store_text;
182}
183
184sub get_doc_list {
185 my $self = shift(@_);
186
187 return @{$self->{'doclist'}};
188}
189
190
191sub process {
192 my $self = shift (@_);
193 my $method = $self->{'mode'};
194
195 $self->$method(@_);
196}
197
198# use 'Paged' if document has no more than 2 levels
199# and each section at second level has a number for
200# Title metadata
201sub get_document_type {
202 my $self = shift (@_);
203 my ($doc_obj) = @_;
204
205 my $thistype = "VList";
206 my $childtype = "VList";
207 my $title;
208 my @tmp = ();
209
210 my $section = $doc_obj->get_top_section ();
211 my $first = 1;
212 while (defined $section) {
213 @tmp = split /\./, $section;
214 if (scalar(@tmp) > 1) {
215 return ($thistype, $childtype);
216 }
217 if (!$first) {
218 $title = $doc_obj->get_metadata_element ($section, "Title");
219 if (!defined $title || $title !~ /^\d+$/) {
220 return ($thistype, $childtype);
221 }
222 }
223 $first = 0;
224 $section = $doc_obj->get_next_section($section);
225 }
226 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
227 $thistype = "Paged";
228 } else {
229 $thistype = "Invisible";
230 }
231 $childtype = "Paged";
232 return ($thistype, $childtype);
233}
234
235sub assoc_files {
236 my $self = shift (@_);
237 my ($doc_obj, $archivedir) = @_;
238 my ($afile);
239
240 foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
241 # if assoc file starts with a slash, we put it relative to the assoc
242 # dir, otherwise it is relative to the HASH... directory
243 if ($assoc_file->[1] =~ m@^[/\\]@) {
244 $afile = &util::filename_cat($self->{'assocdir'},$assoc_file->[1]);
245 } else {
246 $afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
247 }
248 &util::hard_link ($assoc_file->[0], $afile);
249 }
250}
251
252sub infodb {
253 my $self = shift (@_);
254 my ($doc_obj, $filename) = @_;
255 my $handle = $self->{'output_handle'};
256# $handle = "main::STDOUT";
257
258 my $doctype = $doc_obj->get_doc_type();
259
260 # only output this document if it is one to be indexed
261 return if ($doctype ne "indexed_doc");
262
263 my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
264 $archivedir = "" unless defined $archivedir;
265 $archivedir =~ s/\\/\//g;
266 $archivedir =~ s/^\/+//;
267 $archivedir =~ s/\/+$//;
268
269 $self->assoc_files ($doc_obj, $archivedir);
270
271 # this is another document
272 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
273
274 # is this a paged or a hierarchical document
275 my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
276
277 my $section = $doc_obj->get_top_section ();
278 my $doc_OID = $doc_obj->get_OID();
279 my $first = 1;
280 my $url = "";
281 while (defined $section) {
282 # update a few statistics
283 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
284 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
285
286 # output the section name
287 if ($section eq "") { print $handle "[$doc_OID]\n"; }
288 else { print $handle "[$doc_OID.$section]\n"; }
289
290 # output the fact that this document is a document (unless doctype
291 # has been set to something else from within a plugin
292 my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
293 if (!defined $dtype || $dtype !~ /\w/) {
294 print $handle "<doctype>doc\n";
295 }
296
297 # output whether this node contains text
298 if ($doc_obj->get_text_length($section) > 0) {
299 print $handle "<hastxt>1\n";
300 } else {
301 print $handle "<hastxt>0\n";
302 }
303
304 # output all the section metadata
305 my $metadata = $doc_obj->get_all_metadata ($section);
306 foreach my $pair (@$metadata) {
307 my ($field, $value) = (@$pair);
308
309 if ($field ne "Identifier" && $field !~ /^gsdl/ &&
310 defined $value && $value ne "") {
311
312 # escape problematic stuff
313 $value =~ s/\\/\\\\/g;
314 $value =~ s/\n/\\n/g;
315 $value =~ s/\r/\\r/g;
316 if ($value =~ /-{70,}/) {
317 # if value contains 70 or more hyphens in a row we need
318 # to escape them to prevent txt2db from treating them
319 # as a separator
320 $value =~ s/-/&\#045;/gi;
321 }
322
323 # special case for URL metadata
324 if ($field =~ /^URL$/i) {
325 $url .= "[$value]\n";
326 if ($section eq "") {$url .= "<section>$doc_OID\n";}
327 else {$url .= "<section>$doc_OID.$section\n";}
328 $url .= '-' x 70 . "\n";
329 }
330
331 if (!defined $self->{'dontgdbm'}->{$field}) {
332 print $handle "<$field>$value\n";
333 }
334 }
335 }
336
337 # output archivedir if at top level
338 if ($section eq $doc_obj->get_top_section()) {
339 print $handle "<archivedir>$archivedir\n";
340 }
341
342 # output document display type
343 if ($first) {
344 print $handle "<thistype>$thistype\n";
345 }
346
347 # output a list of children
348 my $children = $doc_obj->get_children ($section);
349 if (scalar(@$children) > 0) {
350 print $handle "<childtype>$childtype\n";
351 print $handle "<contains>";
352 my $firstchild = 1;
353 foreach my $child (@$children) {
354 print $handle ";" unless $firstchild;
355 $firstchild = 0;
356 if ($child =~ /^.*?\.(\d+)$/) {
357 print $handle "\".$1";
358 } else {
359 print $handle "\".$child";
360 }
361# if ($child eq "") { print $handle "$doc_OID"; }
362# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
363# else { print $handle "$doc_OID.$section.$child"; }
364 }
365 print $handle "\n";
366 }
367
368 # output the matching document number
369 print $handle "<docnum>$self->{'num_sections'}\n";
370
371 print $handle '-' x 70, "\n";
372
373
374 # output a database entry for the document number
375 print $handle "[$self->{'num_sections'}]\n";
376 if ($section eq "") { print $handle "<section>$doc_OID\n"; }
377 else { print $handle "<section>$doc_OID.$section\n"; }
378 print $handle '-' x 70, "\n";
379
380 # output entry for url
381 if ($url ne "") {
382 print $handle $url;
383 }
384
385 $first = 0;
386 $section = $doc_obj->get_next_section($section);
387 }
388
389 #add this document to the browse structure
390 push(@{$self->{'doclist'}},$doc_obj->get_OID())
391 unless ($doctype eq "classification");
392
393 # classify this document
394 &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
395
396}
397
398sub find_paragraphs {
399 $_[1] =~ s/(<p\b)/\cC$1/gi;
400}
401
402sub filter_text {
403 # $self->filter_text ($field, $new_text);
404 # don't want to do anything for this version, however,
405 # in a particular collection you might want to override
406 # this method to post-process certain fields depending on
407 # the field, or whether we are outputting it for indexing
408}
409
410sub text {
411 my $self = shift (@_);
412 my ($doc_obj) = @_;
413 my $handle = $self->{'output_handle'};
414 my $indexed_doc = 1;
415
416 # only output this document if it is one to be indexed
417 return if ($doc_obj->get_doc_type() ne "indexed_doc");
418
419 # see if this document belongs to this subcollection
420 foreach my $indexexp (@{$self->{'indexexparr'}}) {
421 $indexed_doc = 0;
422 my ($field, $exp, $options) = split /\//, $indexexp;
423 if (defined ($field) && defined ($exp)) {
424 my ($bool) = $field =~ /^(.)/;
425 $field =~ s/^.// if $bool eq '!';
426 if ($field =~ /^filename$/i) {
427 $field = $doc_obj->get_source_filename();
428 } else {
429 $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
430 }
431 next unless defined $field;
432 if ($bool eq '!') {
433 if ($options =~ /^i$/i) {
434 if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
435 } else {
436 if ($field !~ /$exp/) {$indexed_doc = 1; last;}
437 }
438 } else {
439 if ($options =~ /^i$/i) {
440 if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
441 } else {
442 if ($field =~ /$exp/) {$indexed_doc = 1; last;}
443 }
444 }
445 }
446 }
447
448 # this is another document
449 $self->{'num_docs'} += 1;
450
451 # get the parameters for the output
452 my ($level, $fields) = split (/:/, $self->{'index'});
453 $fields =~ s/\ball\b/Title,Creator,text/;
454 $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
455
456 my $doc_section = 0; # just for this document
457 my $text = "";
458 my $text_extra = "";
459
460 # get the text for this document
461 my $section = $doc_obj->get_top_section();
462 while (defined $section) {
463 # update a few statistics
464 $doc_section++;
465 $self->{'num_sections'} += 1;
466
467 if ($indexed_doc) {
468 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
469 foreach my $field (split (/,/, $fields)) {
470 # only deal with this field if it doesn't start with top or
471 # this is the first section
472 my $real_field = $field;
473 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
474 my $new_text = "";
475 if ($real_field eq "text") {
476 $new_text = $doc_obj->get_text ($section) if $self->{'store_text'};
477 $self->{'num_processed_bytes'} += length ($new_text);
478 $new_text =~ s/[\cB\cC]//g;
479 $self->find_paragraphs($new_text);
480
481 } else {
482 my $first = 1;
483 foreach $meta (@{$doc_obj->get_metadata ($section, $real_field)}) {
484 $meta =~ s/[\cB\cC]//g;
485 $self->{'num_processed_bytes'} += length ($meta);
486 $new_text .= "\cC" unless $first;
487 $new_text .= $meta if $self->{'store_text'};
488 $first = 0;
489 }
490 }
491
492 # filter the text
493 $self->filter_text ($field, $new_text);
494
495 $text .= "$new_text\cC";
496 }
497 }
498 }
499
500 if ($level eq "document") { $text_extra .= "\cB"; }
501 else { $text .= "\cB"; }
502
503 $section = $doc_obj->get_next_section($section);
504 }
505
506 print $handle "$text$text_extra";
507}
508
5091;
510
Note: See TracBrowser for help on using the repository browser.