Context Navigation

source: trunk/gsdl/perllib/mgbuildproc.pm@ 3665

Last change on this file since 3665 was 3665, checked in by sjboddie, 21 years ago
Prevent occurances of 70 or more hyphens in metadata values from causing txt2db to wrongly treat them as a separator.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 13.1 KB

Line
1	###########################################################################
2	#
3	# mgbuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document
27	# for mg to process
28
29
30	package mgbuildproc;
31
32	use classify;
33	use doc;
34	use docproc;
35	use util;
36
37	BEGIN {
38	@ISA = ('docproc');
39	}
40
41
42	sub new {
43	my ($class, $collection, $source_dir, $build_dir,
44	$verbosity, $outhandle) = @_;
45	my $self = new docproc ();
46
47	# outhandle is where all the debugging info goes
48	# output_handle is where the output of the plugins is piped
49	# to (i.e. mg, gdbm etc.)
50	$outhandle = STDERR unless defined $outhandle;
51
52	$self->{'collection'} = $collection;
53	$self->{'source_dir'} = $source_dir;
54	$self->{'build_dir'} = $build_dir;
55	$self->{'verbosity'} = $verbosity;
56	$self->{'classifiers'} = [];
57	$self->{'mode'} = "text";
58	$self->{'assocdir'} = $build_dir;
59	$self->{'dontgdbm'} = {};
60	$self->{'index'} = "section:text";
61	$self->{'indexexparr'} = [];
62	$self->{'output_handle'} = "STDOUT";
63	$self->{'num_docs'} = 0;
64	$self->{'num_sections'} = 0;
65	$self->{'num_bytes'} = 0;
66	$self->{'num_processed_bytes'} = 0;
67	$self->{'store_text'} = 1;
68	$self->{'outhandle'} = $outhandle;
69
70	#used by browse interface
71	$self->{'doclist'} = [];
72
73	$self->{'indexing_text'} = 0;
74
75	return bless $self, $class;
76	}
77
78	sub reset {
79	my $self = shift (@_);
80
81	$self->{'num_docs'} = 0;
82	$self->{'num_sections'} = 0;
83	$self->{'num_processed_bytes'} = 0;
84	$self->{'num_bytes'} = 0;
85	}
86
87	sub get_num_docs {
88	my $self = shift (@_);
89
90	return $self->{'num_docs'};
91	}
92
93	sub get_num_sections {
94	my $self = shift (@_);
95
96	return $self->{'num_sections'};
97	}
98
99	# num_bytes is the actual number of bytes in the collection
100	# this is normally the same as what's processed during text compression
101	sub get_num_bytes {
102	my $self = shift (@_);
103
104	return $self->{'num_bytes'};
105	}
106
107	# num_processed_bytes is the number of bytes actually passed
108	# to mg for the current index
109	sub get_num_processed_bytes {
110	my $self = shift (@_);
111
112	return $self->{'num_processed_bytes'};
113	}
114
115	sub set_output_handle {
116	my $self = shift (@_);
117	my ($handle) = @_;
118
119	$self->{'output_handle'} = $handle;
120	}
121
122	sub set_mode {
123	my $self = shift (@_);
124	my ($mode) = @_;
125
126	$self->{'mode'} = $mode;
127	}
128
129	sub set_assocdir {
130	my $self = shift (@_);
131	my ($assocdir) = @_;
132
133	$self->{'assocdir'} = $assocdir;
134	}
135
136	sub set_dontgdbm {
137	my $self = shift (@_);
138	my ($dontgdbm) = @_;
139
140	$self->{'dontgdbm'} = $dontgdbm;
141	}
142
143	sub set_index {
144	my $self = shift (@_);
145	my ($index, $indexexparr) = @_;
146
147	$self->{'index'} = $index;
148	$self->{'indexexparr'} = $indexexparr if defined $indexexparr;
149	}
150
151	sub get_index {
152	my $self = shift (@_);
153
154	return $self->{'index'};
155	}
156
157	sub set_classifiers {
158	my $self = shift (@_);
159	my ($classifiers) = @_;
160
161	$self->{'classifiers'} = $classifiers;
162	}
163
164	sub set_indexing_text {
165	my $self = shift (@_);
166	my ($indexing_text) = @_;
167
168	$self->{'indexing_text'} = $indexing_text;
169	}
170
171	sub get_indexing_text {
172	my $self = shift (@_);
173
174	return $self->{'indexing_text'};
175	}
176
177	sub set_store_text {
178	my $self = shift (@_);
179	my ($store_text) = @_;
180
181	$self->{'store_text'} = $store_text;
182	}
183
184	sub get_doc_list {
185	my $self = shift(@_);
186
187	return @{$self->{'doclist'}};
188	}
189
190
191	sub process {
192	my $self = shift (@_);
193	my $method = $self->{'mode'};
194
195	$self->$method(@_);
196	}
197
198	# use 'Paged' if document has no more than 2 levels
199	# and each section at second level has a number for
200	# Title metadata
201	sub get_document_type {
202	my $self = shift (@_);
203	my ($doc_obj) = @_;
204
205	my $thistype = "VList";
206	my $childtype = "VList";
207	my $title;
208	my @tmp = ();
209
210	my $section = $doc_obj->get_top_section ();
211	my $first = 1;
212	while (defined $section) {
213	@tmp = split /\./, $section;
214	if (scalar(@tmp) > 1) {
215	return ($thistype, $childtype);
216	}
217	if (!$first) {
218	$title = $doc_obj->get_metadata_element ($section, "Title");
219	if (!defined $title \|\| $title !~ /^\d+$/) {
220	return ($thistype, $childtype);
221	}
222	}
223	$first = 0;
224	$section = $doc_obj->get_next_section($section);
225	}
226	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
227	$thistype = "Paged";
228	} else {
229	$thistype = "Invisible";
230	}
231	$childtype = "Paged";
232	return ($thistype, $childtype);
233	}
234
235	sub assoc_files {
236	my $self = shift (@_);
237	my ($doc_obj, $archivedir) = @_;
238	my ($afile);
239
240	foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
241	# if assoc file starts with a slash, we put it relative to the assoc
242	# dir, otherwise it is relative to the HASH... directory
243	if ($assoc_file->[1] =~ m@^[/\\]@) {
244	$afile = &util::filename_cat($self->{'assocdir'},$assoc_file->[1]);
245	} else {
246	$afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
247	}
248	&util::hard_link ($assoc_file->[0], $afile);
249	}
250	}
251
252	sub infodb {
253	my $self = shift (@_);
254	my ($doc_obj, $filename) = @_;
255	my $handle = $self->{'output_handle'};
256	# $handle = "main::STDOUT";
257
258	my $doctype = $doc_obj->get_doc_type();
259
260	# only output this document if it is one to be indexed
261	return if ($doctype ne "indexed_doc");
262
263	my ($archivedir) = $filename =~ /^(.?)(?:\/\|\\)[^\/\\]$/;
264	$archivedir = "" unless defined $archivedir;
265	$archivedir =~ s/\\/\//g;
266	$archivedir =~ s/^\/+//;
267	$archivedir =~ s/\/+$//;
268
269	$self->assoc_files ($doc_obj, $archivedir);
270
271	# this is another document
272	$self->{'num_docs'} += 1 unless ($doctype eq "classification");
273
274	# is this a paged or a hierarchical document
275	my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
276
277	my $section = $doc_obj->get_top_section ();
278	my $doc_OID = $doc_obj->get_OID();
279	my $first = 1;
280	my $url = "";
281	while (defined $section) {
282	# update a few statistics
283	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
284	$self->{'num_sections'} += 1 unless ($doctype eq "classification");
285
286	# output the section name
287	if ($section eq "") { print $handle "[$doc_OID]\n"; }
288	else { print $handle "[$doc_OID.$section]\n"; }
289
290	# output the fact that this document is a document (unless doctype
291	# has been set to something else from within a plugin
292	my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
293	if (!defined $dtype \|\| $dtype !~ /\w/) {
294	print $handle "<doctype>doc\n";
295	}
296
297	# output whether this node contains text
298	if ($doc_obj->get_text_length($section) > 0) {
299	print $handle "<hastxt>1\n";
300	} else {
301	print $handle "<hastxt>0\n";
302	}
303
304	# output all the section metadata
305	my $metadata = $doc_obj->get_all_metadata ($section);
306	foreach my $pair (@$metadata) {
307	my ($field, $value) = (@$pair);
308
309	if ($field ne "Identifier" && $field !~ /^gsdl/ &&
310	defined $value && $value ne "") {
311
312	# escape problematic stuff
313	$value =~ s/\\/\\\\/g;
314	$value =~ s/\n/\\n/g;
315	$value =~ s/\r/\\r/g;
316	if ($value =~ /-{70,}/) {
317	# if value contains 70 or more hyphens in a row we need
318	# to escape them to prevent txt2db from treating them
319	# as a separator
320	$value =~ s/-/&\#045;/gi;
321	}
322
323	# special case for URL metadata
324	if ($field =~ /^URL$/i) {
325	$url .= "[$value]\n";
326	if ($section eq "") {$url .= "<section>$doc_OID\n";}
327	else {$url .= "<section>$doc_OID.$section\n";}
328	$url .= '-' x 70 . "\n";
329	}
330
331	if (!defined $self->{'dontgdbm'}->{$field}) {
332	print $handle "<$field>$value\n";
333	}
334	}
335	}
336
337	# output archivedir if at top level
338	if ($section eq $doc_obj->get_top_section()) {
339	print $handle "<archivedir>$archivedir\n";
340	}
341
342	# output document display type
343	if ($first) {
344	print $handle "<thistype>$thistype\n";
345	}
346
347	# output a list of children
348	my $children = $doc_obj->get_children ($section);
349	if (scalar(@$children) > 0) {
350	print $handle "<childtype>$childtype\n";
351	print $handle "<contains>";
352	my $firstchild = 1;
353	foreach my $child (@$children) {
354	print $handle ";" unless $firstchild;
355	$firstchild = 0;
356	if ($child =~ /^.*?\.(\d+)$/) {
357	print $handle "\".$1";
358	} else {
359	print $handle "\".$child";
360	}
361	# if ($child eq "") { print $handle "$doc_OID"; }
362	# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
363	# else { print $handle "$doc_OID.$section.$child"; }
364	}
365	print $handle "\n";
366	}
367
368	# output the matching document number
369	print $handle "<docnum>$self->{'num_sections'}\n";
370
371	print $handle '-' x 70, "\n";
372
373
374	# output a database entry for the document number
375	print $handle "[$self->{'num_sections'}]\n";
376	if ($section eq "") { print $handle "<section>$doc_OID\n"; }
377	else { print $handle "<section>$doc_OID.$section\n"; }
378	print $handle '-' x 70, "\n";
379
380	# output entry for url
381	if ($url ne "") {
382	print $handle $url;
383	}
384
385	$first = 0;
386	$section = $doc_obj->get_next_section($section);
387	}
388
389	#add this document to the browse structure
390	push(@{$self->{'doclist'}},$doc_obj->get_OID())
391	unless ($doctype eq "classification");
392
393	# classify this document
394	&classify::classify_doc ($self->{'classifiers'}, $doc_obj);
395
396	}
397
398	sub find_paragraphs {
399	$_[1] =~ s/(<p\b)/\cC$1/gi;
400	}
401
402	sub filter_text {
403	# $self->filter_text ($field, $new_text);
404	# don't want to do anything for this version, however,
405	# in a particular collection you might want to override
406	# this method to post-process certain fields depending on
407	# the field, or whether we are outputting it for indexing
408	}
409
410	sub text {
411	my $self = shift (@_);
412	my ($doc_obj) = @_;
413	my $handle = $self->{'output_handle'};
414	my $indexed_doc = 1;
415
416	# only output this document if it is one to be indexed
417	return if ($doc_obj->get_doc_type() ne "indexed_doc");
418
419	# see if this document belongs to this subcollection
420	foreach my $indexexp (@{$self->{'indexexparr'}}) {
421	$indexed_doc = 0;
422	my ($field, $exp, $options) = split /\//, $indexexp;
423	if (defined ($field) && defined ($exp)) {
424	my ($bool) = $field =~ /^(.)/;
425	$field =~ s/^.// if $bool eq '!';
426	if ($field =~ /^filename$/i) {
427	$field = $doc_obj->get_source_filename();
428	} else {
429	$field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
430	}
431	next unless defined $field;
432	if ($bool eq '!') {
433	if ($options =~ /^i$/i) {
434	if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
435	} else {
436	if ($field !~ /$exp/) {$indexed_doc = 1; last;}
437	}
438	} else {
439	if ($options =~ /^i$/i) {
440	if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
441	} else {
442	if ($field =~ /$exp/) {$indexed_doc = 1; last;}
443	}
444	}
445	}
446	}
447
448	# this is another document
449	$self->{'num_docs'} += 1;
450
451	# get the parameters for the output
452	my ($level, $fields) = split (/:/, $self->{'index'});
453	$fields =~ s/\ball\b/Title,Creator,text/;
454	$fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
455
456	my $doc_section = 0; # just for this document
457	my $text = "";
458	my $text_extra = "";
459
460	# get the text for this document
461	my $section = $doc_obj->get_top_section();
462	while (defined $section) {
463	# update a few statistics
464	$doc_section++;
465	$self->{'num_sections'} += 1;
466
467	if ($indexed_doc) {
468	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
469	foreach my $field (split (/,/, $fields)) {
470	# only deal with this field if it doesn't start with top or
471	# this is the first section
472	my $real_field = $field;
473	if (!($real_field =~ s/^top//) \|\| ($doc_section == 1)) {
474	my $new_text = "";
475	if ($real_field eq "text") {
476	$new_text = $doc_obj->get_text ($section) if $self->{'store_text'};
477	$self->{'num_processed_bytes'} += length ($new_text);
478	$new_text =~ s/[\cB\cC]//g;
479	$self->find_paragraphs($new_text);
480
481	} else {
482	my $first = 1;
483	foreach $meta (@{$doc_obj->get_metadata ($section, $real_field)}) {
484	$meta =~ s/[\cB\cC]//g;
485	$self->{'num_processed_bytes'} += length ($meta);
486	$new_text .= "\cC" unless $first;
487	$new_text .= $meta if $self->{'store_text'};
488	$first = 0;
489	}
490	}
491
492	# filter the text
493	$self->filter_text ($field, $new_text);
494
495	$text .= "$new_text\cC";
496	}
497	}
498	}
499
500	if ($level eq "document") { $text_extra .= "\cB"; }
501	else { $text .= "\cB"; }
502
503	$section = $doc_obj->get_next_section($section);
504	}
505
506	print $handle "$text$text_extra";
507	}
508
509	1;
510

Note: See TracBrowser for help on using the repository browser.

Download in other formats: