Context Navigation

source: trunk/gsdl/perllib/mgbuildproc.pm@ 7909

Last change on this file since 7909 was 6767, checked in by kjdon, 20 years ago
small change in the use of gsdlthistype to determine document type (paged vs Vlist)
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 14.0 KB

Line
1	###########################################################################
2	#
3	# mgbuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document
27	# for mg to process
28
29
30	package mgbuildproc;
31
32	eval {require bytes};
33
34	use classify;
35	use doc;
36	use docproc;
37	use util;
38
39	BEGIN {
40	@ISA = ('docproc');
41	}
42
43
44	sub new {
45	my ($class, $collection, $source_dir, $build_dir,
46	$verbosity, $outhandle) = @_;
47	my $self = new docproc ();
48
49	# outhandle is where all the debugging info goes
50	# output_handle is where the output of the plugins is piped
51	# to (i.e. mg, gdbm etc.)
52	$outhandle = STDERR unless defined $outhandle;
53
54	$self->{'collection'} = $collection;
55	$self->{'source_dir'} = $source_dir;
56	$self->{'build_dir'} = $build_dir;
57	$self->{'verbosity'} = $verbosity;
58	$self->{'classifiers'} = [];
59	$self->{'mode'} = "text";
60	$self->{'assocdir'} = $build_dir;
61	$self->{'dontgdbm'} = {};
62	$self->{'index'} = "section:text";
63	$self->{'indexexparr'} = [];
64	$self->{'output_handle'} = "STDOUT";
65	$self->{'num_docs'} = 0;
66	$self->{'num_sections'} = 0;
67	$self->{'num_bytes'} = 0;
68	$self->{'num_processed_bytes'} = 0;
69	$self->{'store_text'} = 1;
70	$self->{'outhandle'} = $outhandle;
71
72	#used by browse interface
73	$self->{'doclist'} = [];
74
75	$self->{'indexing_text'} = 0;
76
77	return bless $self, $class;
78	}
79
80	sub reset {
81	my $self = shift (@_);
82
83	$self->{'num_docs'} = 0;
84	$self->{'num_sections'} = 0;
85	$self->{'num_processed_bytes'} = 0;
86	$self->{'num_bytes'} = 0;
87	}
88
89	sub get_num_docs {
90	my $self = shift (@_);
91
92	return $self->{'num_docs'};
93	}
94
95	sub get_num_sections {
96	my $self = shift (@_);
97
98	return $self->{'num_sections'};
99	}
100
101	# num_bytes is the actual number of bytes in the collection
102	# this is normally the same as what's processed during text compression
103	sub get_num_bytes {
104	my $self = shift (@_);
105
106	return $self->{'num_bytes'};
107	}
108
109	# num_processed_bytes is the number of bytes actually passed
110	# to mg for the current index
111	sub get_num_processed_bytes {
112	my $self = shift (@_);
113
114	return $self->{'num_processed_bytes'};
115	}
116
117	sub set_output_handle {
118	my $self = shift (@_);
119	my ($handle) = @_;
120
121	$self->{'output_handle'} = $handle;
122	}
123
124	sub set_mode {
125	my $self = shift (@_);
126	my ($mode) = @_;
127
128	$self->{'mode'} = $mode;
129	}
130
131	sub set_assocdir {
132	my $self = shift (@_);
133	my ($assocdir) = @_;
134
135	$self->{'assocdir'} = $assocdir;
136	}
137
138	sub set_dontgdbm {
139	my $self = shift (@_);
140	my ($dontgdbm) = @_;
141
142	$self->{'dontgdbm'} = $dontgdbm;
143	}
144
145	sub set_index {
146	my $self = shift (@_);
147	my ($index, $indexexparr) = @_;
148
149	$self->{'index'} = $index;
150	$self->{'indexexparr'} = $indexexparr if defined $indexexparr;
151	}
152
153	sub get_index {
154	my $self = shift (@_);
155
156	return $self->{'index'};
157	}
158
159	sub set_classifiers {
160	my $self = shift (@_);
161	my ($classifiers) = @_;
162
163	$self->{'classifiers'} = $classifiers;
164	}
165
166	sub set_indexing_text {
167	my $self = shift (@_);
168	my ($indexing_text) = @_;
169
170	$self->{'indexing_text'} = $indexing_text;
171	}
172
173	sub get_indexing_text {
174	my $self = shift (@_);
175
176	return $self->{'indexing_text'};
177	}
178
179	sub set_store_text {
180	my $self = shift (@_);
181	my ($store_text) = @_;
182
183	$self->{'store_text'} = $store_text;
184	}
185
186	sub get_doc_list {
187	my $self = shift(@_);
188
189	return @{$self->{'doclist'}};
190	}
191
192
193	sub process {
194	my $self = shift (@_);
195	my $method = $self->{'mode'};
196
197	$self->$method(@_);
198	}
199
200	# use 'Paged' if document has no more than 2 levels
201	# and each section at second level has a number for
202	# Title metadata
203	#also use Paged if gsdlthistype metadata is set to Paged
204	sub get_document_type {
205	my $self = shift (@_);
206	my ($doc_obj) = @_;
207
208	my $thistype = "VList";
209	my $childtype = "VList";
210	my $title;
211	my @tmp = ();
212
213	my $section = $doc_obj->get_top_section ();
214
215	my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
216	if (defined $gsdlthistype) {
217	if ($gsdlthistype eq "Paged") {
218	$thistype = "Paged";
219	$childtype = "Paged";
220	return ($thistype, $childtype);
221	} elsif ($gsdlthistype eq "Hierarchy") {
222	return ($thistype, $childtype); # use VList, VList
223	}
224	}
225	my $first = 1;
226	while (defined $section) {
227	@tmp = split /\./, $section;
228	if (scalar(@tmp) > 1) {
229	return ($thistype, $childtype);
230	}
231	if (!$first) {
232	$title = $doc_obj->get_metadata_element ($section, "Title");
233	if (!defined $title \|\| $title !~ /^\d+$/) {
234	return ($thistype, $childtype);
235	}
236	}
237	$first = 0;
238	$section = $doc_obj->get_next_section($section);
239	}
240	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
241	$thistype = "Paged";
242	} else {
243	$thistype = "Invisible";
244	}
245	$childtype = "Paged";
246	return ($thistype, $childtype);
247	}
248
249	sub assoc_files {
250	my $self = shift (@_);
251	my ($doc_obj, $archivedir) = @_;
252	my ($afile);
253
254	foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
255	# if assoc file starts with a slash, we put it relative to the assoc
256	# dir, otherwise it is relative to the HASH... directory
257	if ($assoc_file->[1] =~ m@^[/\\]@) {
258	$afile = &util::filename_cat($self->{'assocdir'},$assoc_file->[1]);
259	} else {
260	$afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
261	}
262	&util::hard_link ($assoc_file->[0], $afile);
263	}
264	}
265
266	sub infodb {
267	my $self = shift (@_);
268	my ($doc_obj, $filename) = @_;
269	my $handle = $self->{'output_handle'};
270	# $handle = "main::STDOUT";
271
272	my $doctype = $doc_obj->get_doc_type();
273
274	# only output this document if it is one to be indexed
275	return if ($doctype ne "indexed_doc");
276
277	my ($archivedir) = $filename =~ /^(.?)(?:\/\|\\)[^\/\\]$/;
278	$archivedir = "" unless defined $archivedir;
279	$archivedir =~ s/\\/\//g;
280	$archivedir =~ s/^\/+//;
281	$archivedir =~ s/\/+$//;
282
283	$self->assoc_files ($doc_obj, $archivedir);
284
285	# this is another document
286	$self->{'num_docs'} += 1 unless ($doctype eq "classification");
287
288	# is this a paged or a hierarchical document
289	my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
290
291	my $section = $doc_obj->get_top_section ();
292	my $doc_OID = $doc_obj->get_OID();
293	my $first = 1;
294	my $url = "";
295	while (defined $section) {
296	# update a few statistics
297	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
298	$self->{'num_sections'} += 1 unless ($doctype eq "classification");
299
300	# output the section name
301	if ($section eq "") { print $handle "[$doc_OID]\n"; }
302	else { print $handle "[$doc_OID.$section]\n"; }
303
304	# output the fact that this document is a document (unless doctype
305	# has been set to something else from within a plugin
306	my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
307	if (!defined $dtype \|\| $dtype !~ /\w/) {
308	print $handle "<doctype>doc\n";
309	}
310
311	# output whether this node contains text
312	if ($doc_obj->get_text_length($section) > 0) {
313	print $handle "<hastxt>1\n";
314	} else {
315	print $handle "<hastxt>0\n";
316	}
317
318	# output all the section metadata
319	my $metadata = $doc_obj->get_all_metadata ($section);
320	foreach my $pair (@$metadata) {
321	my ($field, $value) = (@$pair);
322
323	if ($field ne "Identifier" && $field !~ /^gsdl/ &&
324	defined $value && $value ne "") {
325
326	# escape problematic stuff
327	$value =~ s/\\/\\\\/g;
328	$value =~ s/\n/\\n/g;
329	$value =~ s/\r/\\r/g;
330	if ($value =~ /-{70,}/) {
331	# if value contains 70 or more hyphens in a row we need
332	# to escape them to prevent txt2db from treating them
333	# as a separator
334	$value =~ s/-/&\#045;/gi;
335	}
336
337	# special case for URL metadata
338	if ($field =~ /^URL$/i) {
339	$url .= "[$value]\n";
340	if ($section eq "") {$url .= "<section>$doc_OID\n";}
341	else {$url .= "<section>$doc_OID.$section\n";}
342	$url .= '-' x 70 . "\n";
343	}
344
345	if (!defined $self->{'dontgdbm'}->{$field}) {
346	print $handle "<$field>$value\n";
347	}
348	}
349	}
350
351	# output archivedir if at top level
352	if ($section eq $doc_obj->get_top_section()) {
353	print $handle "<archivedir>$archivedir\n";
354	}
355
356	# output document display type
357	if ($first) {
358	print $handle "<thistype>$thistype\n";
359	}
360
361	# output a list of children
362	my $children = $doc_obj->get_children ($section);
363	if (scalar(@$children) > 0) {
364	print $handle "<childtype>$childtype\n";
365	print $handle "<contains>";
366	my $firstchild = 1;
367	foreach my $child (@$children) {
368	print $handle ";" unless $firstchild;
369	$firstchild = 0;
370	if ($child =~ /^.*?\.(\d+)$/) {
371	print $handle "\".$1";
372	} else {
373	print $handle "\".$child";
374	}
375	# if ($child eq "") { print $handle "$doc_OID"; }
376	# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
377	# else { print $handle "$doc_OID.$section.$child"; }
378	}
379	print $handle "\n";
380	}
381
382	# output the matching document number
383	print $handle "<docnum>$self->{'num_sections'}\n";
384
385	print $handle '-' x 70, "\n";
386
387
388	# output a database entry for the document number
389	print $handle "[$self->{'num_sections'}]\n";
390	if ($section eq "") { print $handle "<section>$doc_OID\n"; }
391	else { print $handle "<section>$doc_OID.$section\n"; }
392	print $handle '-' x 70, "\n";
393
394	# output entry for url
395	if ($url ne "") {
396	print $handle $url;
397	}
398
399	$first = 0;
400	$section = $doc_obj->get_next_section($section);
401	}
402
403	#add this document to the browse structure
404	push(@{$self->{'doclist'}},$doc_obj->get_OID())
405	unless ($doctype eq "classification");
406
407	# classify this document
408	&classify::classify_doc ($self->{'classifiers'}, $doc_obj);
409
410	}
411
412	sub find_paragraphs {
413	$_[1] =~ s/(<p\b)/\cC$1/gi;
414	}
415
416	sub filter_text {
417	# $self->filter_text ($field, $new_text);
418	# don't want to do anything for this version, however,
419	# in a particular collection you might want to override
420	# this method to post-process certain fields depending on
421	# the field, or whether we are outputting it for indexing
422	}
423
424	sub text {
425	my $self = shift (@_);
426	my ($doc_obj) = @_;
427	my $handle = $self->{'output_handle'};
428	my $indexed_doc = 1;
429
430	# only output this document if it is one to be indexed
431	return if ($doc_obj->get_doc_type() ne "indexed_doc");
432
433	# see if this document belongs to this subcollection
434	foreach my $indexexp (@{$self->{'indexexparr'}}) {
435	$indexed_doc = 0;
436	my ($field, $exp, $options) = split /\//, $indexexp;
437	if (defined ($field) && defined ($exp)) {
438	my ($bool) = $field =~ /^(.)/;
439	$field =~ s/^.// if $bool eq '!';
440	if ($field =~ /^filename$/i) {
441	$field = $doc_obj->get_source_filename();
442	} else {
443	$field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
444	}
445	next unless defined $field;
446	if ($bool eq '!') {
447	if ($options =~ /^i$/i) {
448	if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
449	} else {
450	if ($field !~ /$exp/) {$indexed_doc = 1; last;}
451	}
452	} else {
453	if ($options =~ /^i$/i) {
454	if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
455	} else {
456	if ($field =~ /$exp/) {$indexed_doc = 1; last;}
457	}
458	}
459	}
460	}
461
462	# this is another document
463	$self->{'num_docs'} += 1;
464
465	# get the parameters for the output
466	my ($level, $fields) = split (/:/, $self->{'index'});
467	$fields =~ s/\ball\b/Title,Creator,text/;
468	$fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
469
470	my $doc_section = 0; # just for this document
471	my $text = "";
472	my $text_extra = "";
473
474	# get the text for this document
475	my $section = $doc_obj->get_top_section();
476	while (defined $section) {
477	# update a few statistics
478	$doc_section++;
479	$self->{'num_sections'} += 1;
480
481	if ($indexed_doc) {
482	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
483	foreach my $field (split (/,/, $fields)) {
484	# only deal with this field if it doesn't start with top or
485	# this is the first section
486	my $real_field = $field;
487	if (!($real_field =~ s/^top//) \|\| ($doc_section == 1)) {
488	my $new_text = "";
489	if ($level eq "dummy") {
490	# a dummy index is a special case used when no
491	# indexes are specified (since there must always be
492	# at least one index or we can't retrieve the
493	# compressed text) - we add a small amount of text
494	# to these dummy indexes which will never be seen
495	# but will overcome mg's problems with building
496	# empty indexes
497	$new_text = "this is dummy text to stop mg barfing";
498	$self->{'num_processed_bytes'} += length ($new_text);
499
500	} elsif ($real_field eq "text") {
501	$new_text = $doc_obj->get_text ($section) if $self->{'store_text'};
502	$self->{'num_processed_bytes'} += length ($new_text);
503	$new_text =~ s/[\cB\cC]//g;
504	$self->find_paragraphs($new_text);
505
506	} else {
507	my $first = 1;
508	foreach $meta (@{$doc_obj->get_metadata ($section, $real_field)}) {
509	$meta =~ s/[\cB\cC]//g;
510	$self->{'num_processed_bytes'} += length ($meta);
511	$new_text .= "\cC" unless $first;
512	$new_text .= $meta if $self->{'store_text'};
513	$first = 0;
514	}
515	}
516
517	# filter the text
518	$self->filter_text ($field, $new_text);
519
520	$text .= "$new_text\cC";
521	}
522	}
523	}
524
525	if ($level eq "document") { $text_extra .= "\cB"; }
526	else { $text .= "\cB"; }
527
528	$section = $doc_obj->get_next_section($section);
529	}
530
531	print $handle "$text$text_extra";
532	}
533
534	1;
535

Note: See TracBrowser for help on using the repository browser.

Download in other formats: