Context Navigation

source: trunk/gsdl/perllib/mgbuildproc.pm@ 4743

Last change on this file since 4743 was 4743, checked in by sjboddie, 21 years ago
Build code changes allowing mg collections containing no indexes to be built (it in fact builds a small "dummy:text" index if none are specified since we need an index for the runtime code to be able to retrieve the compressed text).
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 13.6 KB

Line
1	###########################################################################
2	#
3	# mgbuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document
27	# for mg to process
28
29
30	package mgbuildproc;
31
32	eval {require bytes};
33
34	use classify;
35	use doc;
36	use docproc;
37	use util;
38
39	BEGIN {
40	@ISA = ('docproc');
41	}
42
43
44	sub new {
45	my ($class, $collection, $source_dir, $build_dir,
46	$verbosity, $outhandle) = @_;
47	my $self = new docproc ();
48
49	# outhandle is where all the debugging info goes
50	# output_handle is where the output of the plugins is piped
51	# to (i.e. mg, gdbm etc.)
52	$outhandle = STDERR unless defined $outhandle;
53
54	$self->{'collection'} = $collection;
55	$self->{'source_dir'} = $source_dir;
56	$self->{'build_dir'} = $build_dir;
57	$self->{'verbosity'} = $verbosity;
58	$self->{'classifiers'} = [];
59	$self->{'mode'} = "text";
60	$self->{'assocdir'} = $build_dir;
61	$self->{'dontgdbm'} = {};
62	$self->{'index'} = "section:text";
63	$self->{'indexexparr'} = [];
64	$self->{'output_handle'} = "STDOUT";
65	$self->{'num_docs'} = 0;
66	$self->{'num_sections'} = 0;
67	$self->{'num_bytes'} = 0;
68	$self->{'num_processed_bytes'} = 0;
69	$self->{'store_text'} = 1;
70	$self->{'outhandle'} = $outhandle;
71
72	#used by browse interface
73	$self->{'doclist'} = [];
74
75	$self->{'indexing_text'} = 0;
76
77	return bless $self, $class;
78	}
79
80	sub reset {
81	my $self = shift (@_);
82
83	$self->{'num_docs'} = 0;
84	$self->{'num_sections'} = 0;
85	$self->{'num_processed_bytes'} = 0;
86	$self->{'num_bytes'} = 0;
87	}
88
89	sub get_num_docs {
90	my $self = shift (@_);
91
92	return $self->{'num_docs'};
93	}
94
95	sub get_num_sections {
96	my $self = shift (@_);
97
98	return $self->{'num_sections'};
99	}
100
101	# num_bytes is the actual number of bytes in the collection
102	# this is normally the same as what's processed during text compression
103	sub get_num_bytes {
104	my $self = shift (@_);
105
106	return $self->{'num_bytes'};
107	}
108
109	# num_processed_bytes is the number of bytes actually passed
110	# to mg for the current index
111	sub get_num_processed_bytes {
112	my $self = shift (@_);
113
114	return $self->{'num_processed_bytes'};
115	}
116
117	sub set_output_handle {
118	my $self = shift (@_);
119	my ($handle) = @_;
120
121	$self->{'output_handle'} = $handle;
122	}
123
124	sub set_mode {
125	my $self = shift (@_);
126	my ($mode) = @_;
127
128	$self->{'mode'} = $mode;
129	}
130
131	sub set_assocdir {
132	my $self = shift (@_);
133	my ($assocdir) = @_;
134
135	$self->{'assocdir'} = $assocdir;
136	}
137
138	sub set_dontgdbm {
139	my $self = shift (@_);
140	my ($dontgdbm) = @_;
141
142	$self->{'dontgdbm'} = $dontgdbm;
143	}
144
145	sub set_index {
146	my $self = shift (@_);
147	my ($index, $indexexparr) = @_;
148
149	$self->{'index'} = $index;
150	$self->{'indexexparr'} = $indexexparr if defined $indexexparr;
151	}
152
153	sub get_index {
154	my $self = shift (@_);
155
156	return $self->{'index'};
157	}
158
159	sub set_classifiers {
160	my $self = shift (@_);
161	my ($classifiers) = @_;
162
163	$self->{'classifiers'} = $classifiers;
164	}
165
166	sub set_indexing_text {
167	my $self = shift (@_);
168	my ($indexing_text) = @_;
169
170	$self->{'indexing_text'} = $indexing_text;
171	}
172
173	sub get_indexing_text {
174	my $self = shift (@_);
175
176	return $self->{'indexing_text'};
177	}
178
179	sub set_store_text {
180	my $self = shift (@_);
181	my ($store_text) = @_;
182
183	$self->{'store_text'} = $store_text;
184	}
185
186	sub get_doc_list {
187	my $self = shift(@_);
188
189	return @{$self->{'doclist'}};
190	}
191
192
193	sub process {
194	my $self = shift (@_);
195	my $method = $self->{'mode'};
196
197	$self->$method(@_);
198	}
199
200	# use 'Paged' if document has no more than 2 levels
201	# and each section at second level has a number for
202	# Title metadata
203	sub get_document_type {
204	my $self = shift (@_);
205	my ($doc_obj) = @_;
206
207	my $thistype = "VList";
208	my $childtype = "VList";
209	my $title;
210	my @tmp = ();
211
212	my $section = $doc_obj->get_top_section ();
213	my $first = 1;
214	while (defined $section) {
215	@tmp = split /\./, $section;
216	if (scalar(@tmp) > 1) {
217	return ($thistype, $childtype);
218	}
219	if (!$first) {
220	$title = $doc_obj->get_metadata_element ($section, "Title");
221	if (!defined $title \|\| $title !~ /^\d+$/) {
222	return ($thistype, $childtype);
223	}
224	}
225	$first = 0;
226	$section = $doc_obj->get_next_section($section);
227	}
228	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
229	$thistype = "Paged";
230	} else {
231	$thistype = "Invisible";
232	}
233	$childtype = "Paged";
234	return ($thistype, $childtype);
235	}
236
237	sub assoc_files {
238	my $self = shift (@_);
239	my ($doc_obj, $archivedir) = @_;
240	my ($afile);
241
242	foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
243	# if assoc file starts with a slash, we put it relative to the assoc
244	# dir, otherwise it is relative to the HASH... directory
245	if ($assoc_file->[1] =~ m@^[/\\]@) {
246	$afile = &util::filename_cat($self->{'assocdir'},$assoc_file->[1]);
247	} else {
248	$afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
249	}
250	&util::hard_link ($assoc_file->[0], $afile);
251	}
252	}
253
254	sub infodb {
255	my $self = shift (@_);
256	my ($doc_obj, $filename) = @_;
257	my $handle = $self->{'output_handle'};
258	# $handle = "main::STDOUT";
259
260	my $doctype = $doc_obj->get_doc_type();
261
262	# only output this document if it is one to be indexed
263	return if ($doctype ne "indexed_doc");
264
265	my ($archivedir) = $filename =~ /^(.?)(?:\/\|\\)[^\/\\]$/;
266	$archivedir = "" unless defined $archivedir;
267	$archivedir =~ s/\\/\//g;
268	$archivedir =~ s/^\/+//;
269	$archivedir =~ s/\/+$//;
270
271	$self->assoc_files ($doc_obj, $archivedir);
272
273	# this is another document
274	$self->{'num_docs'} += 1 unless ($doctype eq "classification");
275
276	# is this a paged or a hierarchical document
277	my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
278
279	my $section = $doc_obj->get_top_section ();
280	my $doc_OID = $doc_obj->get_OID();
281	my $first = 1;
282	my $url = "";
283	while (defined $section) {
284	# update a few statistics
285	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
286	$self->{'num_sections'} += 1 unless ($doctype eq "classification");
287
288	# output the section name
289	if ($section eq "") { print $handle "[$doc_OID]\n"; }
290	else { print $handle "[$doc_OID.$section]\n"; }
291
292	# output the fact that this document is a document (unless doctype
293	# has been set to something else from within a plugin
294	my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
295	if (!defined $dtype \|\| $dtype !~ /\w/) {
296	print $handle "<doctype>doc\n";
297	}
298
299	# output whether this node contains text
300	if ($doc_obj->get_text_length($section) > 0) {
301	print $handle "<hastxt>1\n";
302	} else {
303	print $handle "<hastxt>0\n";
304	}
305
306	# output all the section metadata
307	my $metadata = $doc_obj->get_all_metadata ($section);
308	foreach my $pair (@$metadata) {
309	my ($field, $value) = (@$pair);
310
311	if ($field ne "Identifier" && $field !~ /^gsdl/ &&
312	defined $value && $value ne "") {
313
314	# escape problematic stuff
315	$value =~ s/\\/\\\\/g;
316	$value =~ s/\n/\\n/g;
317	$value =~ s/\r/\\r/g;
318	if ($value =~ /-{70,}/) {
319	# if value contains 70 or more hyphens in a row we need
320	# to escape them to prevent txt2db from treating them
321	# as a separator
322	$value =~ s/-/&\#045;/gi;
323	}
324
325	# special case for URL metadata
326	if ($field =~ /^URL$/i) {
327	$url .= "[$value]\n";
328	if ($section eq "") {$url .= "<section>$doc_OID\n";}
329	else {$url .= "<section>$doc_OID.$section\n";}
330	$url .= '-' x 70 . "\n";
331	}
332
333	if (!defined $self->{'dontgdbm'}->{$field}) {
334	print $handle "<$field>$value\n";
335	}
336	}
337	}
338
339	# output archivedir if at top level
340	if ($section eq $doc_obj->get_top_section()) {
341	print $handle "<archivedir>$archivedir\n";
342	}
343
344	# output document display type
345	if ($first) {
346	print $handle "<thistype>$thistype\n";
347	}
348
349	# output a list of children
350	my $children = $doc_obj->get_children ($section);
351	if (scalar(@$children) > 0) {
352	print $handle "<childtype>$childtype\n";
353	print $handle "<contains>";
354	my $firstchild = 1;
355	foreach my $child (@$children) {
356	print $handle ";" unless $firstchild;
357	$firstchild = 0;
358	if ($child =~ /^.*?\.(\d+)$/) {
359	print $handle "\".$1";
360	} else {
361	print $handle "\".$child";
362	}
363	# if ($child eq "") { print $handle "$doc_OID"; }
364	# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
365	# else { print $handle "$doc_OID.$section.$child"; }
366	}
367	print $handle "\n";
368	}
369
370	# output the matching document number
371	print $handle "<docnum>$self->{'num_sections'}\n";
372
373	print $handle '-' x 70, "\n";
374
375
376	# output a database entry for the document number
377	print $handle "[$self->{'num_sections'}]\n";
378	if ($section eq "") { print $handle "<section>$doc_OID\n"; }
379	else { print $handle "<section>$doc_OID.$section\n"; }
380	print $handle '-' x 70, "\n";
381
382	# output entry for url
383	if ($url ne "") {
384	print $handle $url;
385	}
386
387	$first = 0;
388	$section = $doc_obj->get_next_section($section);
389	}
390
391	#add this document to the browse structure
392	push(@{$self->{'doclist'}},$doc_obj->get_OID())
393	unless ($doctype eq "classification");
394
395	# classify this document
396	&classify::classify_doc ($self->{'classifiers'}, $doc_obj);
397
398	}
399
400	sub find_paragraphs {
401	$_[1] =~ s/(<p\b)/\cC$1/gi;
402	}
403
404	sub filter_text {
405	# $self->filter_text ($field, $new_text);
406	# don't want to do anything for this version, however,
407	# in a particular collection you might want to override
408	# this method to post-process certain fields depending on
409	# the field, or whether we are outputting it for indexing
410	}
411
412	sub text {
413	my $self = shift (@_);
414	my ($doc_obj) = @_;
415	my $handle = $self->{'output_handle'};
416	my $indexed_doc = 1;
417
418	# only output this document if it is one to be indexed
419	return if ($doc_obj->get_doc_type() ne "indexed_doc");
420
421	# see if this document belongs to this subcollection
422	foreach my $indexexp (@{$self->{'indexexparr'}}) {
423	$indexed_doc = 0;
424	my ($field, $exp, $options) = split /\//, $indexexp;
425	if (defined ($field) && defined ($exp)) {
426	my ($bool) = $field =~ /^(.)/;
427	$field =~ s/^.// if $bool eq '!';
428	if ($field =~ /^filename$/i) {
429	$field = $doc_obj->get_source_filename();
430	} else {
431	$field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
432	}
433	next unless defined $field;
434	if ($bool eq '!') {
435	if ($options =~ /^i$/i) {
436	if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
437	} else {
438	if ($field !~ /$exp/) {$indexed_doc = 1; last;}
439	}
440	} else {
441	if ($options =~ /^i$/i) {
442	if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
443	} else {
444	if ($field =~ /$exp/) {$indexed_doc = 1; last;}
445	}
446	}
447	}
448	}
449
450	# this is another document
451	$self->{'num_docs'} += 1;
452
453	# get the parameters for the output
454	my ($level, $fields) = split (/:/, $self->{'index'});
455	$fields =~ s/\ball\b/Title,Creator,text/;
456	$fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
457
458	my $doc_section = 0; # just for this document
459	my $text = "";
460	my $text_extra = "";
461
462	# get the text for this document
463	my $section = $doc_obj->get_top_section();
464	while (defined $section) {
465	# update a few statistics
466	$doc_section++;
467	$self->{'num_sections'} += 1;
468
469	if ($indexed_doc) {
470	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
471	foreach my $field (split (/,/, $fields)) {
472	# only deal with this field if it doesn't start with top or
473	# this is the first section
474	my $real_field = $field;
475	if (!($real_field =~ s/^top//) \|\| ($doc_section == 1)) {
476	my $new_text = "";
477	if ($level eq "dummy") {
478	# a dummy index is a special case used when no
479	# indexes are specified (since there must always be
480	# at least one index or we can't retrieve the
481	# compressed text) - we add a small amount of text
482	# to these dummy indexes which will never be seen
483	# but will overcome mg's problems with building
484	# empty indexes
485	$new_text = "this is dummy text to stop mg barfing";
486	$self->{'num_processed_bytes'} += length ($new_text);
487
488	} elsif ($real_field eq "text") {
489	$new_text = $doc_obj->get_text ($section) if $self->{'store_text'};
490	$self->{'num_processed_bytes'} += length ($new_text);
491	$new_text =~ s/[\cB\cC]//g;
492	$self->find_paragraphs($new_text);
493
494	} else {
495	my $first = 1;
496	foreach $meta (@{$doc_obj->get_metadata ($section, $real_field)}) {
497	$meta =~ s/[\cB\cC]//g;
498	$self->{'num_processed_bytes'} += length ($meta);
499	$new_text .= "\cC" unless $first;
500	$new_text .= $meta if $self->{'store_text'};
501	$first = 0;
502	}
503	}
504
505	# filter the text
506	$self->filter_text ($field, $new_text);
507
508	$text .= "$new_text\cC";
509	}
510	}
511	}
512
513	if ($level eq "document") { $text_extra .= "\cB"; }
514	else { $text .= "\cB"; }
515
516	$section = $doc_obj->get_next_section($section);
517	}
518
519	print $handle "$text$text_extra";
520	}
521
522	1;
523

Note: See TracBrowser for help on using the repository browser.

Download in other formats: