Context Navigation

source: trunk/gsdl/perllib/mgbuildproc.pm@ 2450

Last change on this file since 2450 was 2336, checked in by sjboddie, 23 years ago
added a -no_text option to buildcol.pl to allow collections to be built without storing compressed text (intended for use in collections where original documents (PDFs or Word docs maybe) are returned instead of the compressed text)
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 12.6 KB

Line
1	###########################################################################
2	#
3	# mgbuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document
27	# for mg to process
28
29
30	package mgbuildproc;
31
32	use classify;
33	use doc;
34	use docproc;
35	use util;
36
37	BEGIN {
38	@ISA = ('docproc');
39	}
40
41
42	sub new {
43	my ($class, $collection, $source_dir, $build_dir,
44	$verbosity, $outhandle) = @_;
45	my $self = new docproc ();
46
47	# outhandle is where all the debugging info goes
48	# output_handle is where the output of the plugins is piped
49	# to (i.e. mg, gdbm etc.)
50	$outhandle = STDERR unless defined $outhandle;
51
52	$self->{'collection'} = $collection;
53	$self->{'source_dir'} = $source_dir;
54	$self->{'build_dir'} = $build_dir;
55	$self->{'verbosity'} = $verbosity;
56	$self->{'classifiers'} = [];
57	$self->{'mode'} = "text";
58	$self->{'assocdir'} = $build_dir;
59	$self->{'dontgdbm'} = {};
60	$self->{'index'} = "section:text";
61	$self->{'indexexparr'} = [];
62	$self->{'output_handle'} = "STDOUT";
63	$self->{'num_docs'} = 0;
64	$self->{'num_sections'} = 0;
65	$self->{'num_bytes'} = 0;
66	$self->{'num_processed_bytes'} = 0;
67	$self->{'store_text'} = 1;
68	$self->{'outhandle'} = $outhandle;
69
70	$self->{'indexing_text'} = 0;
71
72	return bless $self, $class;
73	}
74
75	sub reset {
76	my $self = shift (@_);
77
78	$self->{'num_docs'} = 0;
79	$self->{'num_sections'} = 0;
80	$self->{'num_processed_bytes'} = 0;
81	$self->{'num_bytes'} = 0;
82	}
83
84	sub get_num_docs {
85	my $self = shift (@_);
86
87	return $self->{'num_docs'};
88	}
89
90	sub get_num_sections {
91	my $self = shift (@_);
92
93	return $self->{'num_sections'};
94	}
95
96	# num_bytes is the actual number of bytes in the collection
97	# this is normally the same as what's processed during text compression
98	sub get_num_bytes {
99	my $self = shift (@_);
100
101	return $self->{'num_bytes'};
102	}
103
104	# num_processed_bytes is the number of bytes actually passed
105	# to mg for the current index
106	sub get_num_processed_bytes {
107	my $self = shift (@_);
108
109	return $self->{'num_processed_bytes'};
110	}
111
112	sub set_output_handle {
113	my $self = shift (@_);
114	my ($handle) = @_;
115
116	$self->{'output_handle'} = $handle;
117	}
118
119	sub set_mode {
120	my $self = shift (@_);
121	my ($mode) = @_;
122
123	$self->{'mode'} = $mode;
124	}
125
126	sub set_assocdir {
127	my $self = shift (@_);
128	my ($assocdir) = @_;
129
130	$self->{'assocdir'} = $assocdir;
131	}
132
133	sub set_dontgdbm {
134	my $self = shift (@_);
135	my ($dontgdbm) = @_;
136
137	$self->{'dontgdbm'} = $dontgdbm;
138	}
139
140	sub set_index {
141	my $self = shift (@_);
142	my ($index, $indexexparr) = @_;
143
144	$self->{'index'} = $index;
145	$self->{'indexexparr'} = $indexexparr if defined $indexexparr;
146	}
147
148	sub get_index {
149	my $self = shift (@_);
150
151	return $self->{'index'};
152	}
153
154	sub set_classifiers {
155	my $self = shift (@_);
156	my ($classifiers) = @_;
157
158	$self->{'classifiers'} = $classifiers;
159	}
160
161	sub set_indexing_text {
162	my $self = shift (@_);
163	my ($indexing_text) = @_;
164
165	$self->{'indexing_text'} = $indexing_text;
166	}
167
168	sub get_indexing_text {
169	my $self = shift (@_);
170
171	return $self->{'indexing_text'};
172	}
173
174	sub set_store_text {
175	my $self = shift (@_);
176	my ($store_text) = @_;
177
178	$self->{'store_text'} = $store_text;
179	}
180
181
182	sub process {
183	my $self = shift (@_);
184	my $method = $self->{'mode'};
185
186	$self->$method(@_);
187	}
188
189	# use 'Paged' if document has no more than 2 levels
190	# and each section at second level has a number for
191	# Title metadata
192	sub get_document_type {
193	my $self = shift (@_);
194	my ($doc_obj) = @_;
195
196	my $thistype = "VList";
197	my $childtype = "VList";
198	my $title;
199	my @tmp = ();
200
201	my $section = $doc_obj->get_top_section ();
202	my $first = 1;
203	while (defined $section) {
204	@tmp = split /\./, $section;
205	if (scalar(@tmp) > 1) {
206	return ($thistype, $childtype);
207	}
208	if (!$first) {
209	$title = $doc_obj->get_metadata_element ($section, "Title");
210	if (!defined $title \|\| $title !~ /^\d+$/) {
211	return ($thistype, $childtype);
212	}
213	}
214	$first = 0;
215	$section = $doc_obj->get_next_section($section);
216	}
217	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
218	$thistype = "Paged";
219	} else {
220	$thistype = "Invisible";
221	}
222	$childtype = "Paged";
223	return ($thistype, $childtype);
224	}
225
226	sub assoc_files {
227	my $self = shift (@_);
228	my ($doc_obj, $archivedir) = @_;
229	my ($afile);
230
231	foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
232	# if assoc file contains directory structure of
233	# its own use it, otherwise use HASH... directory
234	if ($assoc_file->[1] =~ /[\/\\]/) {
235	$afile = &util::filename_cat($self->{'assocdir'}, $assoc_file->[1]);
236	} else {
237	$afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
238	}
239	&util::hard_link ($assoc_file->[0], $afile);
240	}
241	}
242
243	sub infodb {
244	my $self = shift (@_);
245	my ($doc_obj, $filename) = @_;
246	my $handle = $self->{'output_handle'};
247	# $handle = "main::STDOUT";
248
249	my $doctype = $doc_obj->get_doc_type();
250
251	# only output this document if it is one to be indexed
252	return if ($doctype ne "indexed_doc");
253
254	my ($archivedir) = $filename =~ /^(.?)(?:\/\|\\)[^\/\\]$/;
255	$archivedir = "" unless defined $archivedir;
256	$archivedir =~ s/\\/\//g;
257	$archivedir =~ s/^\/+//;
258	$archivedir =~ s/\/+$//;
259
260	$self->assoc_files ($doc_obj, $archivedir);
261
262	# this is another document
263	$self->{'num_docs'} += 1 unless ($doctype eq "classification");
264
265	# is this a paged or a hierarchical document
266	my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
267
268	my $section = $doc_obj->get_top_section ();
269	my $doc_OID = $doc_obj->get_OID();
270	my $first = 1;
271	my $url = "";
272	while (defined $section) {
273	# update a few statistics
274	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
275	$self->{'num_sections'} += 1 unless ($doctype eq "classification");
276
277	# output the section name
278	if ($section eq "") { print $handle "[$doc_OID]\n"; }
279	else { print $handle "[$doc_OID.$section]\n"; }
280
281	# output the fact that this document is a document (unless doctype
282	# has been set to something else from within a plugin
283	my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
284	if (!defined $dtype \|\| $dtype !~ /\w/) {
285	print $handle "<doctype>doc\n";
286	}
287
288	# output whether this node contains text
289	if ($doc_obj->get_text_length($section) > 0) {
290	print $handle "<hastxt>1\n";
291	} else {
292	print $handle "<hastxt>0\n";
293	}
294
295	# output all the section metadata
296	my $metadata = $doc_obj->get_all_metadata ($section);
297	foreach my $pair (@$metadata) {
298	my ($field, $value) = (@$pair);
299
300	if ($field ne "Identifier" && $field !~ /^gsdl/ &&
301	defined $value && $value ne "") {
302
303	# escape problematic stuff
304	$value =~ s/\\/\\\\/g;
305	$value =~ s/\n/\\n/g;
306	$value =~ s/\r/\\r/g;
307
308	# special case for URL metadata
309	if ($field =~ /^URL$/i) {
310	$url .= "[$value]\n";
311	if ($section eq "") {$url .= "<section>$doc_OID\n";}
312	else {$url .= "<section>$doc_OID.$section\n";}
313	$url .= '-' x 70 . "\n";
314	}
315
316	if (!defined $self->{'dontgdbm'}->{$field}) {
317	print $handle "<$field>$value\n";
318	}
319	}
320	}
321
322	# output archivedir if at top level
323	if ($section eq $doc_obj->get_top_section()) {
324	print $handle "<archivedir>$archivedir\n";
325	}
326
327	# output document display type
328	if ($first) {
329	print $handle "<thistype>$thistype\n";
330	}
331
332	# output a list of children
333	my $children = $doc_obj->get_children ($section);
334	if (scalar(@$children) > 0) {
335	print $handle "<childtype>$childtype\n";
336	print $handle "<contains>";
337	my $firstchild = 1;
338	foreach my $child (@$children) {
339	print $handle ";" unless $firstchild;
340	$firstchild = 0;
341	if ($child =~ /^.*?\.(\d+)$/) {
342	print $handle "\".$1";
343	} else {
344	print $handle "\".$child";
345	}
346	# if ($child eq "") { print $handle "$doc_OID"; }
347	# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
348	# else { print $handle "$doc_OID.$section.$child"; }
349	}
350	print $handle "\n";
351	}
352
353	# output the matching document number
354	print $handle "<docnum>$self->{'num_sections'}\n";
355
356	print $handle '-' x 70, "\n";
357
358
359	# output a database entry for the document number
360	print $handle "[$self->{'num_sections'}]\n";
361	if ($section eq "") { print $handle "<section>$doc_OID\n"; }
362	else { print $handle "<section>$doc_OID.$section\n"; }
363	print $handle '-' x 70, "\n";
364
365	# output entry for url
366	if ($url ne "") {
367	print $handle $url;
368	}
369
370	$first = 0;
371	$section = $doc_obj->get_next_section($section);
372	}
373
374	# classify this document
375	&classify::classify_doc ($self->{'classifiers'}, $doc_obj);
376
377	}
378
379	sub find_paragraphs {
380	$_[1] =~ s/(<p\b)/\cC$1/gi;
381	}
382
383	sub filter_text {
384	# $self->filter_text ($field, $new_text);
385	# don't want to do anything for this version, however,
386	# in a particular collection you might want to override
387	# this method to post-process certain fields depending on
388	# the field, or whether we are outputting it for indexing
389	}
390
391	sub text {
392	my $self = shift (@_);
393	my ($doc_obj) = @_;
394	my $handle = $self->{'output_handle'};
395	my $indexed_doc = 1;
396
397	# only output this document if it is one to be indexed
398	return if ($doc_obj->get_doc_type() ne "indexed_doc");
399
400	# see if this document belongs to this subcollection
401	foreach my $indexexp (@{$self->{'indexexparr'}}) {
402	$indexed_doc = 0;
403	my ($field, $exp, $options) = split /\//, $indexexp;
404	if (defined ($field) && defined ($exp)) {
405	my ($bool) = $field =~ /^(.)/;
406	$field =~ s/^.// if $bool eq '!';
407	if ($field =~ /^filename$/i) {
408	$field = $doc_obj->get_source_filename();
409	} else {
410	$field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
411	}
412	next unless defined $field;
413	if ($bool eq '!') {
414	if ($options =~ /^i$/i) {
415	if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
416	} else {
417	if ($field !~ /$exp/) {$indexed_doc = 1; last;}
418	}
419	} else {
420	if ($options =~ /^i$/i) {
421	if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
422	} else {
423	if ($field =~ /$exp/) {$indexed_doc = 1; last;}
424	}
425	}
426	}
427	}
428
429	# this is another document
430	$self->{'num_docs'} += 1;
431
432	# get the parameters for the output
433	my ($level, $fields) = split (/:/, $self->{'index'});
434	$fields =~ s/\ball\b/Title,Creator,text/;
435	$fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
436
437	my $doc_section = 0; # just for this document
438	my $text = "";
439	my $text_extra = "";
440
441	# get the text for this document
442	my $section = $doc_obj->get_top_section();
443	while (defined $section) {
444	# update a few statistics
445	$doc_section++;
446	$self->{'num_sections'} += 1;
447
448	if ($indexed_doc) {
449	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
450	foreach my $field (split (/,/, $fields)) {
451	# only deal with this field if it doesn't start with top or
452	# this is the first section
453	my $real_field = $field;
454	if (!($real_field =~ s/^top//) \|\| ($doc_section == 1)) {
455	my $new_text = "";
456	if ($real_field eq "text") {
457	$new_text = $doc_obj->get_text ($section) if $self->{'store_text'};
458	$self->{'num_processed_bytes'} += length ($new_text);
459	$new_text =~ s/[\cB\cC]//g;
460	$self->find_paragraphs($new_text);
461
462	} else {
463	my $first = 1;
464	foreach $meta (@{$doc_obj->get_metadata ($section, $real_field)}) {
465	$meta =~ s/[\cB\cC]//g;
466	$self->{'num_processed_bytes'} += length ($meta);
467	$new_text .= "\cC" unless $first;
468	$new_text .= $meta if $self->{'store_text'};
469	$first = 0;
470	}
471	}
472
473	# filter the text
474	$self->filter_text ($field, $new_text);
475
476	$text .= "$new_text\cC";
477	}
478	}
479	}
480
481	if ($level eq "document") { $text_extra .= "\cB"; }
482	else { $text .= "\cB"; }
483
484	$section = $doc_obj->get_next_section($section);
485	}
486
487	print $handle "$text$text_extra";
488	}
489
490	1;
491

Note: See TracBrowser for help on using the repository browser.

Download in other formats: