source: trunk/gsdl/perllib/mgbuildproc.pm@ 1251

Last change on this file since 1251 was 1251, checked in by sjboddie, 24 years ago

Added some stat reporting and a warning message to the build code.
Now warns when very little or no text is to be processed for a given
index (as mg craps out in these situations). Will hopefully be useful
in realizing when an attempt is made to create an index of metadata that
is never set etc.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 12.2 KB
Line 
1###########################################################################
2#
3# mgbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mg to process
28
29
30package mgbuildproc;
31
32use classify;
33use doc;
34use docproc;
35use util;
36
37BEGIN {
38 @ISA = ('docproc');
39}
40
41
42sub new {
43 my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;
44 my $self = new docproc ();
45
46 $self->{'collection'} = $collection;
47 $self->{'source_dir'} = $source_dir;
48 $self->{'build_dir'} = $build_dir;
49 $self->{'verbosity'} = $verbosity;
50 $self->{'classifiers'} = [];
51 $self->{'mode'} = "text";
52 $self->{'assocdir'} = $build_dir;
53 $self->{'dontgdbm'} = {};
54 $self->{'index'} = "section:text";
55 $self->{'indexexparr'} = [];
56 $self->{'output_handle'} = "STDOUT";
57 $self->{'num_docs'} = 0;
58 $self->{'num_sections'} = 0;
59 $self->{'num_bytes'} = 0;
60 $self->{'num_processed_bytes'} = 0;
61
62 $self->{'indexing_text'} = 0;
63
64 return bless $self, $class;
65}
66
67sub reset {
68 my $self = shift (@_);
69
70 $self->{'num_docs'} = 0;
71 $self->{'num_sections'} = 0;
72 $self->{'num_processed_bytes'} = 0;
73 $self->{'num_bytes'} = 0;
74}
75
76sub get_num_docs {
77 my $self = shift (@_);
78
79 return $self->{'num_docs'};
80}
81
82sub get_num_sections {
83 my $self = shift (@_);
84
85 return $self->{'num_sections'};
86}
87
88# num_bytes is the actual number of bytes in the collection
89# this is normally the same as what's processed during text compression
90sub get_num_bytes {
91 my $self = shift (@_);
92
93 return $self->{'num_bytes'};
94}
95
96# num_processed_bytes is the number of bytes actually passed
97# to mg for the current index
98sub get_num_processed_bytes {
99 my $self = shift (@_);
100
101 return $self->{'num_processed_bytes'};
102}
103
104sub set_output_handle {
105 my $self = shift (@_);
106 my ($handle) = @_;
107
108 $self->{'output_handle'} = $handle;
109}
110
111sub set_mode {
112 my $self = shift (@_);
113 my ($mode) = @_;
114
115 $self->{'mode'} = $mode;
116}
117
118sub set_assocdir {
119 my $self = shift (@_);
120 my ($assocdir) = @_;
121
122 $self->{'assocdir'} = $assocdir;
123}
124
125sub set_dontgdbm {
126 my $self = shift (@_);
127 my ($dontgdbm) = @_;
128
129 $self->{'dontgdbm'} = $dontgdbm;
130}
131
132sub set_index {
133 my $self = shift (@_);
134 my ($index, $indexexparr) = @_;
135
136 $self->{'index'} = $index;
137 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
138}
139
140sub get_index {
141 my $self = shift (@_);
142
143 return $self->{'index'};
144}
145
146sub set_classifiers {
147 my $self = shift (@_);
148 my ($classifiers) = @_;
149
150 $self->{'classifiers'} = $classifiers;
151}
152
153sub set_indexing_text {
154 my $self = shift (@_);
155 my ($indexing_text) = @_;
156
157 $self->{'indexing_text'} = $indexing_text;
158}
159
160sub get_indexing_text {
161 my $self = shift (@_);
162
163 return $self->{'indexing_text'};
164}
165
166sub process {
167 my $self = shift (@_);
168 my $method = $self->{'mode'};
169
170 $self->$method(@_);
171}
172
173# use 'Paged' if document has no more than 2 levels
174# and each section at second level has a number for
175# Title metadata
176sub get_document_type {
177 my $self = shift (@_);
178 my ($doc_obj) = @_;
179
180 my $thistype = "VList";
181 my $childtype = "VList";
182 my $title;
183 my @tmp = ();
184
185 my $section = $doc_obj->get_top_section ();
186 my $first = 1;
187 while (defined $section) {
188 @tmp = split /\./, $section;
189 if (scalar(@tmp) > 1) {
190 return ($thistype, $childtype);
191 }
192 if (!$first) {
193 $title = $doc_obj->get_metadata_element ($section, "Title");
194 if (!defined $title || $title !~ /^\d+$/) {
195 return ($thistype, $childtype);
196 }
197 }
198 $first = 0;
199 $section = $doc_obj->get_next_section($section);
200 }
201 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
202 $thistype = "Paged";
203 } else {
204 $thistype = "Invisible";
205 }
206 $childtype = "Paged";
207 return ($thistype, $childtype);
208}
209
210sub assoc_files {
211 my $self = shift (@_);
212 my ($doc_obj, $archivedir) = @_;
213 my ($afile);
214
215 foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
216 # if assoc file contains directory structure of
217 # its own use it, otherwise use HASH... directory
218 if ($assoc_file->[1] =~ /[\/\\]/) {
219 $afile = &util::filename_cat($self->{'assocdir'}, $assoc_file->[1]);
220 } else {
221 $afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
222 }
223 &util::hard_link ($assoc_file->[0], $afile);
224 }
225}
226
227sub infodb {
228 my $self = shift (@_);
229 my ($doc_obj, $filename) = @_;
230 my $handle = $self->{'output_handle'};
231# $handle = "main::STDOUT";
232
233 my $doctype = $doc_obj->get_doc_type();
234
235 # only output this document if it is one to be indexed
236 return if ($doctype ne "indexed_doc");
237
238 my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
239 $archivedir = "" unless defined $archivedir;
240 $archivedir =~ s/\\/\//g;
241 $archivedir =~ s/^\/+//;
242 $archivedir =~ s/\/+$//;
243
244 $self->assoc_files ($doc_obj, $archivedir);
245
246 # this is another document
247 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
248
249 # is this a paged or a hierarchical document
250 my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
251
252 my $section = $doc_obj->get_top_section ();
253 my $doc_OID = $doc_obj->get_OID();
254 my $first = 1;
255 my $url = "";
256 while (defined $section) {
257 # update a few statistics
258 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
259 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
260
261 # output the section name
262 if ($section eq "") { print $handle "[$doc_OID]\n"; }
263 else { print $handle "[$doc_OID.$section]\n"; }
264
265 # output the fact that this document is a document (unless doctype
266 # has been set to something else from within a plugin
267 my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
268 if (!defined $dtype || $dtype !~ /\w/) {
269 print $handle "<doctype>doc\n";
270 }
271
272 # output whether this node contains text
273 if ($doc_obj->get_text_length($section) > 0) {
274 print $handle "<hastxt>1\n";
275 } else {
276 print $handle "<hastxt>0\n";
277 }
278
279 # output all the section metadata
280 my $metadata = $doc_obj->get_all_metadata ($section);
281 foreach my $pair (@$metadata) {
282 my ($field, $value) = (@$pair);
283
284 if ($field ne "Identifier" && $field !~ /^gsdl/ &&
285 defined $value && $value ne "") {
286
287 # escape problematic stuff
288 $value =~ s/\\/\\\\/g;
289 $value =~ s/\n/\\n/g;
290 $value =~ s/\r/\\r/g;
291
292 # special case for URL metadata
293 if ($field =~ /^URL$/i) {
294 $url .= "[$value]\n";
295 if ($section eq "") {$url .= "<section>$doc_OID\n";}
296 else {$url .= "<section>$doc_OID.$section\n";}
297 $url .= '-' x 70 . "\n";
298 }
299
300 if (!defined $self->{'dontgdbm'}->{$field}) {
301 print $handle "<$field>$value\n";
302 }
303 }
304 }
305
306 # output archivedir if at top level
307 if ($section eq $doc_obj->get_top_section()) {
308 print $handle "<archivedir>$archivedir\n";
309 }
310
311 # output document display type
312 if ($first) {
313 print $handle "<thistype>$thistype\n";
314 }
315
316 # output a list of children
317 my $children = $doc_obj->get_children ($section);
318 if (scalar(@$children) > 0) {
319 print $handle "<childtype>$childtype\n";
320 print $handle "<contains>";
321 my $firstchild = 1;
322 foreach my $child (@$children) {
323 print $handle ";" unless $firstchild;
324 $firstchild = 0;
325 if ($child =~ /^.*?\.(\d+)$/) {
326 print $handle "\".$1";
327 } else {
328 print $handle "\".$child";
329 }
330# if ($child eq "") { print $handle "$doc_OID"; }
331# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
332# else { print $handle "$doc_OID.$section.$child"; }
333 }
334 print $handle "\n";
335 }
336
337 # output the matching document number
338 print $handle "<docnum>$self->{'num_sections'}\n";
339
340 print $handle '-' x 70, "\n";
341
342
343 # output a database entry for the document number
344 print $handle "[$self->{'num_sections'}]\n";
345 if ($section eq "") { print $handle "<section>$doc_OID\n"; }
346 else { print $handle "<section>$doc_OID.$section\n"; }
347 print $handle '-' x 70, "\n";
348
349 # output entry for url
350 if ($url ne "") {
351 print $handle $url;
352 }
353
354 $first = 0;
355 $section = $doc_obj->get_next_section($section);
356 }
357
358 # classify this document
359 &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
360
361}
362
363sub find_paragraphs {
364 $_[1] =~ s/(<p\b)/\cC$1/gi;
365}
366
367sub filter_text {
368 # $self->filter_text ($field, $new_text);
369 # don't want to do anything for this version, however,
370 # in a particular collection you might want to override
371 # this method to post-process certain fields depending on
372 # the field, or whether we are outputting it for indexing
373}
374
375sub text {
376 my $self = shift (@_);
377 my ($doc_obj) = @_;
378 my $handle = $self->{'output_handle'};
379 my $indexed_doc = 1;
380
381 # only output this document if it is one to be indexed
382 return if ($doc_obj->get_doc_type() ne "indexed_doc");
383
384 # see if this document belongs to this subcollection
385 foreach my $indexexp (@{$self->{'indexexparr'}}) {
386 $indexed_doc = 0;
387 my ($field, $exp, $options) = split /\//, $indexexp;
388 if (defined ($field) && defined ($exp)) {
389 my ($bool) = $field =~ /^(.)/;
390 $field =~ s/^.// if $bool eq '!';
391 if ($field =~ /^filename$/i) {
392 $field = $doc_obj->get_source_filename();
393 } else {
394 $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
395 }
396 next unless defined $field;
397 if ($bool eq '!') {
398 if ($options =~ /^i$/i) {
399 if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
400 } else {
401 if ($field !~ /$exp/) {$indexed_doc = 1; last;}
402 }
403 } else {
404 if ($options =~ /^i$/i) {
405 if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
406 } else {
407 if ($field =~ /$exp/) {$indexed_doc = 1; last;}
408 }
409 }
410 }
411 }
412
413 # this is another document
414 $self->{'num_docs'} += 1;
415
416 # get the parameters for the output
417 my ($level, $fields) = split (/:/, $self->{'index'});
418 $fields =~ s/\ball\b/Title,Creator,text/;
419 $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
420
421 my $doc_section = 0; # just for this document
422 my $text = "";
423 my $text_extra = "";
424
425 # get the text for this document
426 my $section = $doc_obj->get_top_section();
427 while (defined $section) {
428 # update a few statistics
429 $doc_section++;
430 $self->{'num_sections'} += 1;
431
432 if ($indexed_doc) {
433 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
434 foreach my $field (split (/,/, $fields)) {
435 # only deal with this field if it doesn't start with top or
436 # this is the first section
437 my $real_field = $field;
438 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
439 my $new_text = "";
440 if ($real_field eq "text") {
441 $new_text = $doc_obj->get_text ($section);
442 $self->{'num_processed_bytes'} += length ($new_text);
443 $new_text =~ s/[\cB\cC]//g;
444 $self->find_paragraphs($new_text);
445
446 } else {
447 my $first = 1;
448 foreach $meta (@{$doc_obj->get_metadata ($section, $real_field)}) {
449 $meta =~ s/[\cB\cC]//g;
450 $self->{'num_processed_bytes'} += length ($meta);
451 $new_text .= "\cC" unless $first;
452 $new_text .= $meta;
453 $first = 0;
454 }
455 }
456
457 # filter the text
458 $self->filter_text ($field, $new_text);
459
460 $text .= "$new_text\cC";
461 }
462 }
463 }
464
465 if ($level eq "document") { $text_extra .= "\cB"; }
466 else { $text .= "\cB"; }
467
468 $section = $doc_obj->get_next_section($section);
469 }
470
471 print $handle "$text$text_extra";
472}
473
4741;
475
Note: See TracBrowser for help on using the repository browser.