Context Navigation

source: trunk/gsdl/perllib/mgbuildproc.pm@ 1251

Last change on this file since 1251 was 1251, checked in by sjboddie, 24 years ago
Added some stat reporting and a warning message to the build code. Now warns when very little or no text is to be processed for a given index (as mg craps out in these situations). Will hopefully be useful in realizing when an attempt is made to create an index of metadata that is never set etc.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 12.2 KB

Line
1	###########################################################################
2	#
3	# mgbuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document
27	# for mg to process
28
29
30	package mgbuildproc;
31
32	use classify;
33	use doc;
34	use docproc;
35	use util;
36
37	BEGIN {
38	@ISA = ('docproc');
39	}
40
41
42	sub new {
43	my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;
44	my $self = new docproc ();
45
46	$self->{'collection'} = $collection;
47	$self->{'source_dir'} = $source_dir;
48	$self->{'build_dir'} = $build_dir;
49	$self->{'verbosity'} = $verbosity;
50	$self->{'classifiers'} = [];
51	$self->{'mode'} = "text";
52	$self->{'assocdir'} = $build_dir;
53	$self->{'dontgdbm'} = {};
54	$self->{'index'} = "section:text";
55	$self->{'indexexparr'} = [];
56	$self->{'output_handle'} = "STDOUT";
57	$self->{'num_docs'} = 0;
58	$self->{'num_sections'} = 0;
59	$self->{'num_bytes'} = 0;
60	$self->{'num_processed_bytes'} = 0;
61
62	$self->{'indexing_text'} = 0;
63
64	return bless $self, $class;
65	}
66
67	sub reset {
68	my $self = shift (@_);
69
70	$self->{'num_docs'} = 0;
71	$self->{'num_sections'} = 0;
72	$self->{'num_processed_bytes'} = 0;
73	$self->{'num_bytes'} = 0;
74	}
75
76	sub get_num_docs {
77	my $self = shift (@_);
78
79	return $self->{'num_docs'};
80	}
81
82	sub get_num_sections {
83	my $self = shift (@_);
84
85	return $self->{'num_sections'};
86	}
87
88	# num_bytes is the actual number of bytes in the collection
89	# this is normally the same as what's processed during text compression
90	sub get_num_bytes {
91	my $self = shift (@_);
92
93	return $self->{'num_bytes'};
94	}
95
96	# num_processed_bytes is the number of bytes actually passed
97	# to mg for the current index
98	sub get_num_processed_bytes {
99	my $self = shift (@_);
100
101	return $self->{'num_processed_bytes'};
102	}
103
104	sub set_output_handle {
105	my $self = shift (@_);
106	my ($handle) = @_;
107
108	$self->{'output_handle'} = $handle;
109	}
110
111	sub set_mode {
112	my $self = shift (@_);
113	my ($mode) = @_;
114
115	$self->{'mode'} = $mode;
116	}
117
118	sub set_assocdir {
119	my $self = shift (@_);
120	my ($assocdir) = @_;
121
122	$self->{'assocdir'} = $assocdir;
123	}
124
125	sub set_dontgdbm {
126	my $self = shift (@_);
127	my ($dontgdbm) = @_;
128
129	$self->{'dontgdbm'} = $dontgdbm;
130	}
131
132	sub set_index {
133	my $self = shift (@_);
134	my ($index, $indexexparr) = @_;
135
136	$self->{'index'} = $index;
137	$self->{'indexexparr'} = $indexexparr if defined $indexexparr;
138	}
139
140	sub get_index {
141	my $self = shift (@_);
142
143	return $self->{'index'};
144	}
145
146	sub set_classifiers {
147	my $self = shift (@_);
148	my ($classifiers) = @_;
149
150	$self->{'classifiers'} = $classifiers;
151	}
152
153	sub set_indexing_text {
154	my $self = shift (@_);
155	my ($indexing_text) = @_;
156
157	$self->{'indexing_text'} = $indexing_text;
158	}
159
160	sub get_indexing_text {
161	my $self = shift (@_);
162
163	return $self->{'indexing_text'};
164	}
165
166	sub process {
167	my $self = shift (@_);
168	my $method = $self->{'mode'};
169
170	$self->$method(@_);
171	}
172
173	# use 'Paged' if document has no more than 2 levels
174	# and each section at second level has a number for
175	# Title metadata
176	sub get_document_type {
177	my $self = shift (@_);
178	my ($doc_obj) = @_;
179
180	my $thistype = "VList";
181	my $childtype = "VList";
182	my $title;
183	my @tmp = ();
184
185	my $section = $doc_obj->get_top_section ();
186	my $first = 1;
187	while (defined $section) {
188	@tmp = split /\./, $section;
189	if (scalar(@tmp) > 1) {
190	return ($thistype, $childtype);
191	}
192	if (!$first) {
193	$title = $doc_obj->get_metadata_element ($section, "Title");
194	if (!defined $title \|\| $title !~ /^\d+$/) {
195	return ($thistype, $childtype);
196	}
197	}
198	$first = 0;
199	$section = $doc_obj->get_next_section($section);
200	}
201	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
202	$thistype = "Paged";
203	} else {
204	$thistype = "Invisible";
205	}
206	$childtype = "Paged";
207	return ($thistype, $childtype);
208	}
209
210	sub assoc_files {
211	my $self = shift (@_);
212	my ($doc_obj, $archivedir) = @_;
213	my ($afile);
214
215	foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
216	# if assoc file contains directory structure of
217	# its own use it, otherwise use HASH... directory
218	if ($assoc_file->[1] =~ /[\/\\]/) {
219	$afile = &util::filename_cat($self->{'assocdir'}, $assoc_file->[1]);
220	} else {
221	$afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
222	}
223	&util::hard_link ($assoc_file->[0], $afile);
224	}
225	}
226
227	sub infodb {
228	my $self = shift (@_);
229	my ($doc_obj, $filename) = @_;
230	my $handle = $self->{'output_handle'};
231	# $handle = "main::STDOUT";
232
233	my $doctype = $doc_obj->get_doc_type();
234
235	# only output this document if it is one to be indexed
236	return if ($doctype ne "indexed_doc");
237
238	my ($archivedir) = $filename =~ /^(.?)(?:\/\|\\)[^\/\\]$/;
239	$archivedir = "" unless defined $archivedir;
240	$archivedir =~ s/\\/\//g;
241	$archivedir =~ s/^\/+//;
242	$archivedir =~ s/\/+$//;
243
244	$self->assoc_files ($doc_obj, $archivedir);
245
246	# this is another document
247	$self->{'num_docs'} += 1 unless ($doctype eq "classification");
248
249	# is this a paged or a hierarchical document
250	my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
251
252	my $section = $doc_obj->get_top_section ();
253	my $doc_OID = $doc_obj->get_OID();
254	my $first = 1;
255	my $url = "";
256	while (defined $section) {
257	# update a few statistics
258	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
259	$self->{'num_sections'} += 1 unless ($doctype eq "classification");
260
261	# output the section name
262	if ($section eq "") { print $handle "[$doc_OID]\n"; }
263	else { print $handle "[$doc_OID.$section]\n"; }
264
265	# output the fact that this document is a document (unless doctype
266	# has been set to something else from within a plugin
267	my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
268	if (!defined $dtype \|\| $dtype !~ /\w/) {
269	print $handle "<doctype>doc\n";
270	}
271
272	# output whether this node contains text
273	if ($doc_obj->get_text_length($section) > 0) {
274	print $handle "<hastxt>1\n";
275	} else {
276	print $handle "<hastxt>0\n";
277	}
278
279	# output all the section metadata
280	my $metadata = $doc_obj->get_all_metadata ($section);
281	foreach my $pair (@$metadata) {
282	my ($field, $value) = (@$pair);
283
284	if ($field ne "Identifier" && $field !~ /^gsdl/ &&
285	defined $value && $value ne "") {
286
287	# escape problematic stuff
288	$value =~ s/\\/\\\\/g;
289	$value =~ s/\n/\\n/g;
290	$value =~ s/\r/\\r/g;
291
292	# special case for URL metadata
293	if ($field =~ /^URL$/i) {
294	$url .= "[$value]\n";
295	if ($section eq "") {$url .= "<section>$doc_OID\n";}
296	else {$url .= "<section>$doc_OID.$section\n";}
297	$url .= '-' x 70 . "\n";
298	}
299
300	if (!defined $self->{'dontgdbm'}->{$field}) {
301	print $handle "<$field>$value\n";
302	}
303	}
304	}
305
306	# output archivedir if at top level
307	if ($section eq $doc_obj->get_top_section()) {
308	print $handle "<archivedir>$archivedir\n";
309	}
310
311	# output document display type
312	if ($first) {
313	print $handle "<thistype>$thistype\n";
314	}
315
316	# output a list of children
317	my $children = $doc_obj->get_children ($section);
318	if (scalar(@$children) > 0) {
319	print $handle "<childtype>$childtype\n";
320	print $handle "<contains>";
321	my $firstchild = 1;
322	foreach my $child (@$children) {
323	print $handle ";" unless $firstchild;
324	$firstchild = 0;
325	if ($child =~ /^.*?\.(\d+)$/) {
326	print $handle "\".$1";
327	} else {
328	print $handle "\".$child";
329	}
330	# if ($child eq "") { print $handle "$doc_OID"; }
331	# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
332	# else { print $handle "$doc_OID.$section.$child"; }
333	}
334	print $handle "\n";
335	}
336
337	# output the matching document number
338	print $handle "<docnum>$self->{'num_sections'}\n";
339
340	print $handle '-' x 70, "\n";
341
342
343	# output a database entry for the document number
344	print $handle "[$self->{'num_sections'}]\n";
345	if ($section eq "") { print $handle "<section>$doc_OID\n"; }
346	else { print $handle "<section>$doc_OID.$section\n"; }
347	print $handle '-' x 70, "\n";
348
349	# output entry for url
350	if ($url ne "") {
351	print $handle $url;
352	}
353
354	$first = 0;
355	$section = $doc_obj->get_next_section($section);
356	}
357
358	# classify this document
359	&classify::classify_doc ($self->{'classifiers'}, $doc_obj);
360
361	}
362
363	sub find_paragraphs {
364	$_[1] =~ s/(<p\b)/\cC$1/gi;
365	}
366
367	sub filter_text {
368	# $self->filter_text ($field, $new_text);
369	# don't want to do anything for this version, however,
370	# in a particular collection you might want to override
371	# this method to post-process certain fields depending on
372	# the field, or whether we are outputting it for indexing
373	}
374
375	sub text {
376	my $self = shift (@_);
377	my ($doc_obj) = @_;
378	my $handle = $self->{'output_handle'};
379	my $indexed_doc = 1;
380
381	# only output this document if it is one to be indexed
382	return if ($doc_obj->get_doc_type() ne "indexed_doc");
383
384	# see if this document belongs to this subcollection
385	foreach my $indexexp (@{$self->{'indexexparr'}}) {
386	$indexed_doc = 0;
387	my ($field, $exp, $options) = split /\//, $indexexp;
388	if (defined ($field) && defined ($exp)) {
389	my ($bool) = $field =~ /^(.)/;
390	$field =~ s/^.// if $bool eq '!';
391	if ($field =~ /^filename$/i) {
392	$field = $doc_obj->get_source_filename();
393	} else {
394	$field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
395	}
396	next unless defined $field;
397	if ($bool eq '!') {
398	if ($options =~ /^i$/i) {
399	if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
400	} else {
401	if ($field !~ /$exp/) {$indexed_doc = 1; last;}
402	}
403	} else {
404	if ($options =~ /^i$/i) {
405	if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
406	} else {
407	if ($field =~ /$exp/) {$indexed_doc = 1; last;}
408	}
409	}
410	}
411	}
412
413	# this is another document
414	$self->{'num_docs'} += 1;
415
416	# get the parameters for the output
417	my ($level, $fields) = split (/:/, $self->{'index'});
418	$fields =~ s/\ball\b/Title,Creator,text/;
419	$fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
420
421	my $doc_section = 0; # just for this document
422	my $text = "";
423	my $text_extra = "";
424
425	# get the text for this document
426	my $section = $doc_obj->get_top_section();
427	while (defined $section) {
428	# update a few statistics
429	$doc_section++;
430	$self->{'num_sections'} += 1;
431
432	if ($indexed_doc) {
433	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
434	foreach my $field (split (/,/, $fields)) {
435	# only deal with this field if it doesn't start with top or
436	# this is the first section
437	my $real_field = $field;
438	if (!($real_field =~ s/^top//) \|\| ($doc_section == 1)) {
439	my $new_text = "";
440	if ($real_field eq "text") {
441	$new_text = $doc_obj->get_text ($section);
442	$self->{'num_processed_bytes'} += length ($new_text);
443	$new_text =~ s/[\cB\cC]//g;
444	$self->find_paragraphs($new_text);
445
446	} else {
447	my $first = 1;
448	foreach $meta (@{$doc_obj->get_metadata ($section, $real_field)}) {
449	$meta =~ s/[\cB\cC]//g;
450	$self->{'num_processed_bytes'} += length ($meta);
451	$new_text .= "\cC" unless $first;
452	$new_text .= $meta;
453	$first = 0;
454	}
455	}
456
457	# filter the text
458	$self->filter_text ($field, $new_text);
459
460	$text .= "$new_text\cC";
461	}
462	}
463	}
464
465	if ($level eq "document") { $text_extra .= "\cB"; }
466	else { $text .= "\cB"; }
467
468	$section = $doc_obj->get_next_section($section);
469	}
470
471	print $handle "$text$text_extra";
472	}
473
474	1;
475

Note: See TracBrowser for help on using the repository browser.

Download in other formats: