source: trunk/gsdl/perllib/mgbuildproc.pm@ 1072

Last change on this file since 1072 was 1072, checked in by sjboddie, 24 years ago

Fixed bug - Control B's and C's were only being removed from body of text
and not from metadata values. This caused problems for mg when indexing
metadata values containing Control B's or C's. They're now removed from
both text and metadata.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 11.5 KB
Line 
1###########################################################################
2#
3# mgbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mg to process
28
29
30package mgbuildproc;
31
32use classify;
33use doc;
34use docproc;
35use util;
36
37BEGIN {
38 @ISA = ('docproc');
39}
40
41
42sub new {
43 my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;
44 my $self = new docproc ();
45
46 $self->{'collection'} = $collection;
47 $self->{'source_dir'} = $source_dir;
48 $self->{'build_dir'} = $build_dir;
49 $self->{'verbosity'} = $verbosity;
50 $self->{'classifiers'} = [];
51 $self->{'mode'} = "text";
52 $self->{'assocdir'} = $build_dir;
53 $self->{'dontgdbm'} = {};
54 $self->{'index'} = "section:text";
55 $self->{'indexexparr'} = [];
56 $self->{'output_handle'} = "STDOUT";
57 $self->{'num_docs'} = 0;
58 $self->{'num_sections'} = 0;
59 $self->{'num_bytes'} = 0;
60
61 $self->{'indexing_text'} = 0;
62
63 return bless $self, $class;
64}
65
66sub reset {
67 my $self = shift (@_);
68
69 $self->{'num_docs'} = 0;
70 $self->{'num_sections'} = 0;
71 $self->{'num_bytes'} = 0;
72}
73
74sub get_num_docs {
75 my $self = shift (@_);
76
77 return $self->{'num_docs'};
78}
79
80sub get_num_sections {
81 my $self = shift (@_);
82
83 return $self->{'num_sections'};
84}
85
86sub get_num_bytes {
87 my $self = shift (@_);
88
89 return $self->{'num_bytes'};
90}
91
92sub set_output_handle {
93 my $self = shift (@_);
94 my ($handle) = @_;
95
96 $self->{'output_handle'} = $handle;
97}
98
99sub set_mode {
100 my $self = shift (@_);
101 my ($mode) = @_;
102
103 $self->{'mode'} = $mode;
104}
105
106sub set_assocdir {
107 my $self = shift (@_);
108 my ($assocdir) = @_;
109
110 $self->{'assocdir'} = $assocdir;
111}
112
113sub set_dontgdbm {
114 my $self = shift (@_);
115 my ($dontgdbm) = @_;
116
117 $self->{'dontgdbm'} = $dontgdbm;
118}
119
120sub set_index {
121 my $self = shift (@_);
122 my ($index, $indexexparr) = @_;
123
124 $self->{'index'} = $index;
125 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
126}
127
128sub set_classifiers {
129 my $self = shift (@_);
130 my ($classifiers) = @_;
131
132 $self->{'classifiers'} = $classifiers;
133}
134
135sub set_indexing_text {
136 my $self = shift (@_);
137 my ($indexing_text) = @_;
138
139 $self->{'indexing_text'} = $indexing_text;
140}
141
142sub process {
143 my $self = shift (@_);
144 my $method = $self->{'mode'};
145
146 $self->$method(@_);
147}
148
149# use 'Paged' if document has no more than 2 levels
150# and each section at second level has a number for
151# Title metadata
152sub get_document_type {
153 my $self = shift (@_);
154 my ($doc_obj) = @_;
155
156 my $thistype = "VList";
157 my $childtype = "VList";
158 my $title;
159 my @tmp = ();
160
161 my $section = $doc_obj->get_top_section ();
162 my $first = 1;
163 while (defined $section) {
164 @tmp = split /\./, $section;
165 if (scalar(@tmp) > 1) {
166 return ($thistype, $childtype);
167 }
168 if (!$first) {
169 $title = $doc_obj->get_metadata_element ($section, "Title");
170 if (!defined $title || $title !~ /^\d+$/) {
171 return ($thistype, $childtype);
172 }
173 }
174 $first = 0;
175 $section = $doc_obj->get_next_section($section);
176 }
177 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
178 $thistype = "Paged";
179 } else {
180 $thistype = "Invisible";
181 }
182 $childtype = "Paged";
183 return ($thistype, $childtype);
184}
185
186sub assoc_files {
187 my $self = shift (@_);
188 my ($doc_obj, $archivedir) = @_;
189 my ($afile);
190
191 foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
192 # if assoc file contains directory structure of
193 # its own use it, otherwise use HASH... directory
194 if ($assoc_file->[1] =~ /[\/\\]/) {
195 $afile = &util::filename_cat($self->{'assocdir'}, $assoc_file->[1]);
196 } else {
197 $afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
198 }
199 &util::hard_link ($assoc_file->[0], $afile);
200 }
201}
202
203sub infodb {
204 my $self = shift (@_);
205 my ($doc_obj, $filename) = @_;
206 my $handle = $self->{'output_handle'};
207# $handle = "main::STDOUT";
208
209 my $doctype = $doc_obj->get_doc_type();
210
211 # only output this document if it is one to be indexed
212 return if ($doctype ne "indexed_doc");
213
214 my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
215 $archivedir = "" unless defined $archivedir;
216 $archivedir =~ s/\\/\//g;
217 $archivedir =~ s/^\/+//;
218 $archivedir =~ s/\/+$//;
219
220 $self->assoc_files ($doc_obj, $archivedir);
221
222 # this is another document
223 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
224
225 # is this a paged or a hierarchical document
226 my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
227
228 my $section = $doc_obj->get_top_section ();
229 my $doc_OID = $doc_obj->get_OID();
230 my $first = 1;
231 my $url = "";
232 while (defined $section) {
233 # update a few statistics
234 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
235 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
236
237 # output the section name
238 if ($section eq "") { print $handle "[$doc_OID]\n"; }
239 else { print $handle "[$doc_OID.$section]\n"; }
240
241 # output the fact that this document is a document (unless doctype
242 # has been set to something else from within a plugin
243 my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
244 if (!defined $dtype || $dtype !~ /\w/) {
245 print $handle "<doctype>doc\n";
246 }
247
248 # output whether this node contains text
249 if ($doc_obj->get_text_length($section) > 0) {
250 print $handle "<hastxt>1\n";
251 } else {
252 print $handle "<hastxt>0\n";
253 }
254
255 # output all the section metadata
256 my $metadata = $doc_obj->get_all_metadata ($section);
257 foreach my $pair (@$metadata) {
258 my ($field, $value) = (@$pair);
259
260 if ($field ne "Identifier" && $field !~ /^gsdl/ &&
261 defined $value && $value ne "") {
262
263 # escape problematic stuff
264 $value =~ s/\\/\\\\/g;
265 $value =~ s/\n/\\n/g;
266 $value =~ s/\r/\\r/g;
267
268 # special case for URL metadata
269 if ($field =~ /^URL$/i) {
270 $url .= "[$value]\n";
271 if ($section eq "") {$url .= "<section>$doc_OID\n";}
272 else {$url .= "<section>$doc_OID.$section\n";}
273 $url .= '-' x 70 . "\n";
274 }
275
276 if (!defined $self->{'dontgdbm'}->{$field}) {
277 print $handle "<$field>$value\n";
278 }
279 }
280 }
281
282 # output archivedir if at top level
283 if ($section eq $doc_obj->get_top_section()) {
284 print $handle "<archivedir>$archivedir\n";
285 }
286
287 # output document display type
288 if ($first) {
289 print $handle "<thistype>$thistype\n";
290 }
291
292 # output a list of children
293 my $children = $doc_obj->get_children ($section);
294 if (scalar(@$children) > 0) {
295 print $handle "<childtype>$childtype\n";
296 print $handle "<contains>";
297 my $firstchild = 1;
298 foreach my $child (@$children) {
299 print $handle ";" unless $firstchild;
300 $firstchild = 0;
301 if ($child =~ /^.*?\.(\d+)$/) {
302 print $handle "\".$1";
303 } else {
304 print $handle "\".$child";
305 }
306# if ($child eq "") { print $handle "$doc_OID"; }
307# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
308# else { print $handle "$doc_OID.$section.$child"; }
309 }
310 print $handle "\n";
311 }
312
313 # output the matching document number
314 print $handle "<docnum>$self->{'num_sections'}\n";
315
316 print $handle '-' x 70, "\n";
317
318
319 # output a database entry for the document number
320 print $handle "[$self->{'num_sections'}]\n";
321 if ($section eq "") { print $handle "<section>$doc_OID\n"; }
322 else { print $handle "<section>$doc_OID.$section\n"; }
323 print $handle '-' x 70, "\n";
324
325 # output entry for url
326 if ($url ne "") {
327 print $handle $url;
328 }
329
330 $first = 0;
331 $section = $doc_obj->get_next_section($section);
332 }
333
334 # classify this document
335 &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
336
337}
338
339sub find_paragraphs {
340 $_[1] =~ s/(<p\b)/\cC$1/gi;
341}
342
343sub filter_text {
344 # $self->filter_text ($field, $new_text);
345 # don't want to do anything for this version, however,
346 # in a particular collection you might want to override
347 # this method to post-process certain fields depending on
348 # the field, or whether we are outputting it for indexing
349}
350
351sub text {
352 my $self = shift (@_);
353 my ($doc_obj) = @_;
354 my $handle = $self->{'output_handle'};
355 my $indexed_doc = 1;
356
357 # only output this document if it is one to be indexed
358 return if ($doc_obj->get_doc_type() ne "indexed_doc");
359
360 # see if this document belongs to this subcollection
361 foreach my $indexexp (@{$self->{'indexexparr'}}) {
362 $indexed_doc = 0;
363 my ($field, $exp, $options) = split /\//, $indexexp;
364 if (defined ($field) && defined ($exp)) {
365 my ($bool) = $field =~ /^(.)/;
366 $field =~ s/^.// if $bool eq '!';
367 if ($field =~ /^filename$/i) {
368 $field = $doc_obj->get_source_filename();
369 } else {
370 $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
371 }
372 next unless defined $field;
373 if ($bool eq '!') {
374 if ($options =~ /^i$/i) {
375 if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
376 } else {
377 if ($field !~ /$exp/) {$indexed_doc = 1; last;}
378 }
379 } else {
380 if ($options =~ /^i$/i) {
381 if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
382 } else {
383 if ($field =~ /$exp/) {$indexed_doc = 1; last;}
384 }
385 }
386 }
387 }
388
389 # this is another document
390 $self->{'num_docs'} += 1;
391
392 # get the parameters for the output
393 my ($level, $fields) = split (/:/, $self->{'index'});
394 $fields =~ s/\ball\b/Title,Creator,text/;
395 $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
396
397 my $doc_section = 0; # just for this document
398 my $text = "";
399 my $text_extra = "";
400
401 # get the text for this document
402 my $section = $doc_obj->get_top_section();
403 while (defined $section) {
404 # update a few statistics
405 $doc_section++;
406 $self->{'num_sections'} += 1;
407
408 if ($indexed_doc) {
409 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
410 foreach my $field (split (/,/, $fields)) {
411 # only deal with this field if it doesn't start with top or
412 # this is the first section
413 my $real_field = $field;
414 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
415 my $new_text = "";
416 if ($real_field eq "text") {
417 $new_text = $doc_obj->get_text ($section);
418 $new_text =~ s/[\cB\cC]//g;
419 $self->find_paragraphs($new_text);
420
421 } else {
422 my $first = 1;
423 foreach $meta (@{$doc_obj->get_metadata ($section, $real_field)}) {
424 $meta =~ s/[\cB\cC]//g;
425 $new_text .= "\cC" unless $first;
426 $new_text .= $meta;
427 $first = 0;
428 }
429 }
430
431 # filter the text
432 $self->filter_text ($field, $new_text);
433
434 $text .= "$new_text\cC";
435 }
436 }
437 }
438
439 if ($level eq "document") { $text_extra .= "\cB"; }
440 else { $text .= "\cB"; }
441
442 $section = $doc_obj->get_next_section($section);
443 }
444
445 print $handle "$text$text_extra";
446}
447
4481;
449
Note: See TracBrowser for help on using the repository browser.