source: trunk/gsdl/perllib/mgbuildproc.pm@ 706

Last change on this file since 706 was 706, checked in by sjboddie, 25 years ago

fixed a kind of bug...

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 9.9 KB
Line 
1###########################################################################
2#
3# mgbuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document
27# for mg to process
28
29
30package mgbuildproc;
31
32use classify;
33use doc;
34use docproc;
35use util;
36
37
38BEGIN {
39 @ISA = ('docproc');
40}
41
42
43sub new {
44 my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;
45 my $self = new docproc ();
46
47 $self->{'collection'} = $collection;
48 $self->{'source_dir'} = $source_dir;
49 $self->{'build_dir'} = $build_dir;
50 $self->{'verbosity'} = $verbosity;
51 $self->{'classifiers'} = [];
52 $self->{'mode'} = "text";
53 $self->{'index'} = "section:text";
54 $self->{'indexexparr'} = [];
55 $self->{'output_handle'} = "STDOUT";
56 $self->{'num_docs'} = 0;
57 $self->{'num_sections'} = 0;
58 $self->{'num_bytes'} = 0;
59
60 $self->{'indexing_text'} = 0;
61
62 return bless $self, $class;
63}
64
65sub reset {
66 my $self = shift (@_);
67
68 $self->{'num_docs'} = 0;
69 $self->{'num_sections'} = 0;
70 $self->{'num_bytes'} = 0;
71}
72
73sub get_num_docs {
74 my $self = shift (@_);
75
76 return $self->{'num_docs'};
77}
78
79sub get_num_sections {
80 my $self = shift (@_);
81
82 return $self->{'num_sections'};
83}
84
85sub get_num_bytes {
86 my $self = shift (@_);
87
88 return $self->{'num_bytes'};
89}
90
91sub set_output_handle {
92 my $self = shift (@_);
93 my ($handle) = @_;
94
95 $self->{'output_handle'} = $handle;
96}
97
98sub set_mode {
99 my $self = shift (@_);
100 my ($mode) = @_;
101
102 $self->{'mode'} = $mode;
103}
104
105sub set_index {
106 my $self = shift (@_);
107 my ($index, $indexexparr) = @_;
108
109 $self->{'index'} = $index;
110 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
111}
112
113sub set_classifiers {
114 my $self = shift (@_);
115 my ($classifiers) = @_;
116
117 $self->{'classifiers'} = $classifiers;
118}
119
120sub set_indexing_text {
121 my $self = shift (@_);
122 my ($indexing_text) = @_;
123
124 $self->{'indexing_text'} = $indexing_text;
125}
126
127sub process {
128 my $self = shift (@_);
129 my $method = $self->{'mode'};
130
131 $self->$method(@_);
132}
133
134# use 'Paged' if document has no more than 2 levels
135# and each section at second level has a number for
136# Title metadata
137sub get_document_type {
138 my $self = shift (@_);
139 my ($doc_obj) = @_;
140
141 my $thistype = "VList";
142 my $childtype = "VList";
143 my $title;
144 my @tmp = ();
145
146 my $section = $doc_obj->get_top_section ();
147 my $first = 1;
148 while (defined $section) {
149 @tmp = split /\./, $section;
150 if (scalar(@tmp) > 1) {
151 return ($thistype, $childtype);
152 }
153 if (!$first) {
154 $title = $doc_obj->get_metadata_element ($section, "Title");
155 if (!defined $title || $title !~ /^\d+$/) {
156 return ($thistype, $childtype);
157 }
158 }
159 $first = 0;
160 $section = $doc_obj->get_next_section($section);
161 }
162 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
163 $thistype = "Paged";
164 } else {
165 $thistype = "Invisible";
166 }
167 $childtype = "Paged";
168 return ($thistype, $childtype);
169}
170
171sub infodb {
172 my $self = shift (@_);
173 my ($doc_obj, $filename) = @_;
174 my $handle = $self->{'output_handle'};
175# $handle = "main::STDOUT";
176
177 my $doctype = $doc_obj->get_doc_type();
178
179 # only output this document if it is one to be indexed
180 return if ($doctype ne "indexed_doc");
181
182 # this is another document
183 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
184
185 # is this a paged or a hierarchical document
186 my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
187
188 my $section = $doc_obj->get_top_section ();
189 my $doc_OID = $doc_obj->get_OID();
190 my $first = 1;
191 while (defined $section) {
192 # update a few statistics
193 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
194 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
195
196 # output the section name
197 if ($section eq "") { print $handle "[$doc_OID]\n"; }
198 else { print $handle "[$doc_OID.$section]\n"; }
199
200 # output the fact that this document is a document
201 print $handle "<doctype>doc\n";
202
203 # output whether this node contains text
204 if ($doc_obj->get_text_length($section) > 0) {
205 print $handle "<hastxt>1\n";
206 } else {
207 print $handle "<hastxt>0\n";
208 }
209
210 # output all the section metadata
211 my $metadata = $doc_obj->get_all_metadata ($section);
212 foreach $pair (@$metadata) {
213 my ($field, $value) = (@$pair);
214
215 if ($field ne "Identifier" && $field !~ /^gsdl/ &&
216 defined $value && $value ne "") {
217 # escape problematic stuff
218 $value =~ s/\\/\\\\/g;
219 $value =~ s/\n/\\n/g;
220 $value =~ s/\r/\\r/g;
221
222 print $handle "<$field>$value\n";
223 }
224 }
225
226 # output archivedir if at top level
227 if ($section eq $doc_obj->get_top_section()) {
228 my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
229 $archivedir = "" unless defined $archivedir;
230 $archivedir =~ s/^(\/|\\)*//;
231 $archivedir =~ s/(\/|\\)*$//;
232 print $handle "<archivedir>$archivedir\n";
233 }
234
235 # output document display type
236 if ($first) {
237 print $handle "<thistype>$thistype\n";
238 }
239
240 # output a list of children
241 my $children = $doc_obj->get_children ($section);
242 if (scalar(@$children) > 0) {
243 print $handle "<childtype>$childtype\n";
244 print $handle "<contains>";
245 my $firstchild = 1;
246 foreach $child (@$children) {
247 print $handle ";" unless $firstchild;
248 $firstchild = 0;
249 if ($child =~ /^.*?\.(\d+)$/) {
250 print $handle "\".$1";
251 } else {
252 print $handle "\".$child";
253 }
254# if ($child eq "") { print $handle "$doc_OID"; }
255# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
256# else { print $handle "$doc_OID.$section.$child"; }
257 }
258 print $handle "\n";
259 }
260
261 # output the matching document number
262 print $handle "<docnum>$self->{'num_sections'}\n";
263
264 print $handle '-' x 70, "\n";
265
266
267 # output a database entry for the document number
268 print $handle "[$self->{'num_sections'}]\n";
269 if ($section eq "") { print $handle "<section>$doc_OID\n"; }
270 else { print $handle "<section>$doc_OID.$section\n"; }
271 print $handle '-' x 70, "\n";
272
273 $first = 0;
274 $section = $doc_obj->get_next_section($section);
275 }
276
277 # classify this document
278 &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
279
280}
281
282sub find_paragraphs {
283 $_[1] =~ s/(<p\b)/\cC$1/gi;
284}
285
286sub filter_text {
287 # $self->filter_text ($field, $new_text);
288 # don't want to do anything for this version, however,
289 # in a particular collection you might want to override
290 # this method to post-process certain fields depending on
291 # the field, or whether we are outputting it for indexing
292}
293
294sub text {
295 my $self = shift (@_);
296 my ($doc_obj) = @_;
297 my $handle = $self->{'output_handle'};
298 my $indexed_doc = 1;
299
300 # only output this document if it is one to be indexed
301 return if ($doc_obj->get_doc_type() ne "indexed_doc");
302
303 # see if this document belongs to this subcollection
304 foreach $indexexp (@{$self->{'indexexparr'}}) {
305 $indexed_doc = 0;
306 my ($field, $exp, $options) = split /\//, $indexexp;
307 if (defined ($field) && defined ($exp)) {
308 my ($bool) = $field =~ /^(.)/;
309 $field =~ s/^.// if $bool eq '!';
310 if ($field =~ /^filename$/i) {
311 $field = $doc_obj->get_source_filename();
312 } else {
313 $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
314 }
315 next unless defined $field;
316 if ($bool eq '!') {
317 if ($options =~ /^i$/i) {
318 if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
319 } else {
320 if ($field !~ /$exp/) {$indexed_doc = 1; last;}
321 }
322 } else {
323 if ($options =~ /^i$/i) {
324 if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
325 } else {
326 if ($field =~ /$exp/) {$indexed_doc = 1; last;}
327 }
328 }
329 }
330 }
331
332 # this is another document
333 $self->{'num_docs'} += 1;
334
335 # get the parameters for the output
336 my ($level, $fields) = split (/:/, $self->{'index'});
337 $fields =~ s/\ball\b/Title,Creator,text/;
338 $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
339
340 my $doc_section = 0; # just for this document
341 my $text = "";
342 my $text_extra = "";
343
344 # get the text for this document
345 my $section = $doc_obj->get_top_section();
346 while (defined $section) {
347 # update a few statistics
348 $doc_section++;
349 $self->{'num_sections'} += 1;
350
351 if ($indexed_doc) {
352 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
353 foreach $field (split (/,/, $fields)) {
354 # only deal with this field if it doesn't start with top or
355 # this is the first section
356 my $real_field = $field;
357 if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
358 my $new_text = "";
359 if ($real_field eq "text") {
360 $new_text = $doc_obj->get_text ($section);
361 $new_text =~ s/[\cB\cC]//g;
362 $self->find_paragraphs($new_text);
363
364 } else {
365 $new_text = join ("\cC", @{$doc_obj->get_metadata ($section, $real_field)});
366 }
367
368 # filter the text
369 $self->filter_text ($field, $new_text);
370
371 if ($self->{'indexing_text'} &&
372 $new_text =~ /[\(\)\{\}]/) {
373 }
374
375 $text .= "$new_text\cC";
376 }
377 }
378 }
379
380 if ($level eq "document") { $text_extra .= "\cB"; }
381 else { $text .= "\cB"; }
382
383 $section = $doc_obj->get_next_section($section);
384 }
385
386 print $handle "$text$text_extra";
387}
388
3891;
390
Note: See TracBrowser for help on using the repository browser.