Context Navigation

source: trunk/gsdl/perllib/mgbuildproc.pm@ 810

Last change on this file since 810 was 810, checked in by sjboddie, 24 years ago
plugins now take options, files are associated at build time as well as import time
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 11.0 KB

Line
1	###########################################################################
2	#
3	# mgbuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document
27	# for mg to process
28
29
30	package mgbuildproc;
31
32	use classify;
33	use doc;
34	use docproc;
35	use util;
36
37
38	BEGIN {
39	@ISA = ('docproc');
40	}
41
42
43	sub new {
44	my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;
45	my $self = new docproc ();
46
47	$self->{'collection'} = $collection;
48	$self->{'source_dir'} = $source_dir;
49	$self->{'build_dir'} = $build_dir;
50	$self->{'verbosity'} = $verbosity;
51	$self->{'classifiers'} = [];
52	$self->{'mode'} = "text";
53	$self->{'assocdir'} = $build_dir;
54	$self->{'dontgdbm'} = {};
55	$self->{'index'} = "section:text";
56	$self->{'indexexparr'} = [];
57	$self->{'output_handle'} = "STDOUT";
58	$self->{'num_docs'} = 0;
59	$self->{'num_sections'} = 0;
60	$self->{'num_bytes'} = 0;
61
62	$self->{'indexing_text'} = 0;
63
64	return bless $self, $class;
65	}
66
67	sub reset {
68	my $self = shift (@_);
69
70	$self->{'num_docs'} = 0;
71	$self->{'num_sections'} = 0;
72	$self->{'num_bytes'} = 0;
73	}
74
75	sub get_num_docs {
76	my $self = shift (@_);
77
78	return $self->{'num_docs'};
79	}
80
81	sub get_num_sections {
82	my $self = shift (@_);
83
84	return $self->{'num_sections'};
85	}
86
87	sub get_num_bytes {
88	my $self = shift (@_);
89
90	return $self->{'num_bytes'};
91	}
92
93	sub set_output_handle {
94	my $self = shift (@_);
95	my ($handle) = @_;
96
97	$self->{'output_handle'} = $handle;
98	}
99
100	sub set_mode {
101	my $self = shift (@_);
102	my ($mode) = @_;
103
104	$self->{'mode'} = $mode;
105	}
106
107	sub set_assocdir {
108	my $self = shift (@_);
109	my ($assocdir) = @_;
110
111	$self->{'assocdir'} = $assocdir;
112	}
113
114	sub set_dontgdbm {
115	my $self = shift (@_);
116	my ($dontgdbm) = @_;
117
118	$self->{'dontgdbm'} = $dontgdbm;
119	}
120
121	sub set_index {
122	my $self = shift (@_);
123	my ($index, $indexexparr) = @_;
124
125	$self->{'index'} = $index;
126	$self->{'indexexparr'} = $indexexparr if defined $indexexparr;
127	}
128
129	sub set_classifiers {
130	my $self = shift (@_);
131	my ($classifiers) = @_;
132
133	$self->{'classifiers'} = $classifiers;
134	}
135
136	sub set_indexing_text {
137	my $self = shift (@_);
138	my ($indexing_text) = @_;
139
140	$self->{'indexing_text'} = $indexing_text;
141	}
142
143	sub process {
144	my $self = shift (@_);
145	my $method = $self->{'mode'};
146
147	$self->$method(@_);
148	}
149
150	# use 'Paged' if document has no more than 2 levels
151	# and each section at second level has a number for
152	# Title metadata
153	sub get_document_type {
154	my $self = shift (@_);
155	my ($doc_obj) = @_;
156
157	my $thistype = "VList";
158	my $childtype = "VList";
159	my $title;
160	my @tmp = ();
161
162	my $section = $doc_obj->get_top_section ();
163	my $first = 1;
164	while (defined $section) {
165	@tmp = split /\./, $section;
166	if (scalar(@tmp) > 1) {
167	return ($thistype, $childtype);
168	}
169	if (!$first) {
170	$title = $doc_obj->get_metadata_element ($section, "Title");
171	if (!defined $title \|\| $title !~ /^\d+$/) {
172	return ($thistype, $childtype);
173	}
174	}
175	$first = 0;
176	$section = $doc_obj->get_next_section($section);
177	}
178	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
179	$thistype = "Paged";
180	} else {
181	$thistype = "Invisible";
182	}
183	$childtype = "Paged";
184	return ($thistype, $childtype);
185	}
186
187	sub assoc_files {
188	my $self = shift (@_);
189	my ($doc_obj) = @_;
190
191	foreach $assoc_file (@{$doc_obj->get_assoc_files()}) {
192	$afile = &util::filename_cat($self->{'assocdir'}, $assoc_file->[1]);
193	&util::hard_link ($assoc_file->[0], $afile);
194	}
195	}
196
197	sub infodb {
198	my $self = shift (@_);
199	my ($doc_obj, $filename) = @_;
200	my $handle = $self->{'output_handle'};
201	# $handle = "main::STDOUT";
202
203	$self->assoc_files ($doc_obj);
204
205	my $doctype = $doc_obj->get_doc_type();
206
207	# only output this document if it is one to be indexed
208	return if ($doctype ne "indexed_doc");
209
210	# this is another document
211	$self->{'num_docs'} += 1 unless ($doctype eq "classification");
212
213	# is this a paged or a hierarchical document
214	my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
215
216	my $section = $doc_obj->get_top_section ();
217	my $doc_OID = $doc_obj->get_OID();
218	my $first = 1;
219	my $url = "";
220	while (defined $section) {
221	# update a few statistics
222	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
223	$self->{'num_sections'} += 1 unless ($doctype eq "classification");
224
225	# output the section name
226	if ($section eq "") { print $handle "[$doc_OID]\n"; }
227	else { print $handle "[$doc_OID.$section]\n"; }
228
229	# output the fact that this document is a document
230	print $handle "<doctype>doc\n";
231
232	# output whether this node contains text
233	if ($doc_obj->get_text_length($section) > 0) {
234	print $handle "<hastxt>1\n";
235	} else {
236	print $handle "<hastxt>0\n";
237	}
238
239	# output all the section metadata
240	my $metadata = $doc_obj->get_all_metadata ($section);
241	foreach $pair (@$metadata) {
242	my ($field, $value) = (@$pair);
243
244	if ($field ne "Identifier" && $field !~ /^gsdl/ &&
245	defined $value && $value ne "") {
246
247	# escape problematic stuff
248	$value =~ s/\\/\\\\/g;
249	$value =~ s/\n/\\n/g;
250	$value =~ s/\r/\\r/g;
251
252	# special case for URL metadata
253	if ($field =~ /^URL$/i) {
254	$url .= "[$value]\n";
255	if ($section eq "") {$url .= "<section>$doc_OID\n";}
256	else {$url .= "<section>$doc_OID.$section\n";}
257	$url .= '-' x 70 . "\n";
258	}
259
260	if (!defined $self->{'dontgdbm'}->{$field}) {
261	print $handle "<$field>$value\n";
262	}
263	}
264	}
265
266	# output archivedir if at top level
267	if ($section eq $doc_obj->get_top_section()) {
268	my ($archivedir) = $filename =~ /^(.?)(?:\/\|\\)[^\/\\]$/;
269	$archivedir = "" unless defined $archivedir;
270	$archivedir =~ s/\\/\//g;
271	$archivedir =~ s/^\/+//;
272	$archivedir =~ s/\/+$//;
273	print $handle "<archivedir>$archivedir\n";
274	}
275
276	# output document display type
277	if ($first) {
278	print $handle "<thistype>$thistype\n";
279	}
280
281	# output a list of children
282	my $children = $doc_obj->get_children ($section);
283	if (scalar(@$children) > 0) {
284	print $handle "<childtype>$childtype\n";
285	print $handle "<contains>";
286	my $firstchild = 1;
287	foreach $child (@$children) {
288	print $handle ";" unless $firstchild;
289	$firstchild = 0;
290	if ($child =~ /^.*?\.(\d+)$/) {
291	print $handle "\".$1";
292	} else {
293	print $handle "\".$child";
294	}
295	# if ($child eq "") { print $handle "$doc_OID"; }
296	# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
297	# else { print $handle "$doc_OID.$section.$child"; }
298	}
299	print $handle "\n";
300	}
301
302	# output the matching document number
303	print $handle "<docnum>$self->{'num_sections'}\n";
304
305	print $handle '-' x 70, "\n";
306
307
308	# output a database entry for the document number
309	print $handle "[$self->{'num_sections'}]\n";
310	if ($section eq "") { print $handle "<section>$doc_OID\n"; }
311	else { print $handle "<section>$doc_OID.$section\n"; }
312	print $handle '-' x 70, "\n";
313
314	# output entry for url
315	if ($url ne "") {
316	print $handle $url;
317	}
318
319	$first = 0;
320	$section = $doc_obj->get_next_section($section);
321	}
322
323	# classify this document
324	&classify::classify_doc ($self->{'classifiers'}, $doc_obj);
325
326	}
327
328	sub find_paragraphs {
329	$_[1] =~ s/(<p\b)/\cC$1/gi;
330	}
331
332	sub filter_text {
333	# $self->filter_text ($field, $new_text);
334	# don't want to do anything for this version, however,
335	# in a particular collection you might want to override
336	# this method to post-process certain fields depending on
337	# the field, or whether we are outputting it for indexing
338	}
339
340	sub text {
341	my $self = shift (@_);
342	my ($doc_obj) = @_;
343	my $handle = $self->{'output_handle'};
344	my $indexed_doc = 1;
345
346	# only output this document if it is one to be indexed
347	return if ($doc_obj->get_doc_type() ne "indexed_doc");
348
349	# see if this document belongs to this subcollection
350	foreach $indexexp (@{$self->{'indexexparr'}}) {
351	$indexed_doc = 0;
352	my ($field, $exp, $options) = split /\//, $indexexp;
353	if (defined ($field) && defined ($exp)) {
354	my ($bool) = $field =~ /^(.)/;
355	$field =~ s/^.// if $bool eq '!';
356	if ($field =~ /^filename$/i) {
357	$field = $doc_obj->get_source_filename();
358	} else {
359	$field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
360	}
361	next unless defined $field;
362	if ($bool eq '!') {
363	if ($options =~ /^i$/i) {
364	if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
365	} else {
366	if ($field !~ /$exp/) {$indexed_doc = 1; last;}
367	}
368	} else {
369	if ($options =~ /^i$/i) {
370	if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
371	} else {
372	if ($field =~ /$exp/) {$indexed_doc = 1; last;}
373	}
374	}
375	}
376	}
377
378	# this is another document
379	$self->{'num_docs'} += 1;
380
381	# get the parameters for the output
382	my ($level, $fields) = split (/:/, $self->{'index'});
383	$fields =~ s/\ball\b/Title,Creator,text/;
384	$fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
385
386	my $doc_section = 0; # just for this document
387	my $text = "";
388	my $text_extra = "";
389
390	# get the text for this document
391	my $section = $doc_obj->get_top_section();
392	while (defined $section) {
393	# update a few statistics
394	$doc_section++;
395	$self->{'num_sections'} += 1;
396
397	if ($indexed_doc) {
398	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
399	foreach $field (split (/,/, $fields)) {
400	# only deal with this field if it doesn't start with top or
401	# this is the first section
402	my $real_field = $field;
403	if (!($real_field =~ s/^top//) \|\| ($doc_section == 1)) {
404	my $new_text = "";
405	if ($real_field eq "text") {
406	$new_text = $doc_obj->get_text ($section);
407	$new_text =~ s/[\cB\cC]//g;
408	$self->find_paragraphs($new_text);
409
410	} else {
411	$new_text = join ("\cC", @{$doc_obj->get_metadata ($section, $real_field)});
412	}
413
414	# filter the text
415	$self->filter_text ($field, $new_text);
416
417	if ($self->{'indexing_text'} &&
418	$new_text =~ /[\(\)\{\}]/) {
419	}
420
421	$text .= "$new_text\cC";
422	}
423	}
424	}
425
426	if ($level eq "document") { $text_extra .= "\cB"; }
427	else { $text .= "\cB"; }
428
429	$section = $doc_obj->get_next_section($section);
430	}
431
432	print $handle "$text$text_extra";
433	}
434
435	1;
436

Note: See TracBrowser for help on using the repository browser.

Download in other formats: