Context Navigation

source: trunk/gsdl/perllib/mgbuildproc.pm@ 1072

Last change on this file since 1072 was 1072, checked in by sjboddie, 24 years ago
Fixed bug - Control B's and C's were only being removed from body of text and not from metadata values. This caused problems for mg when indexing metadata values containing Control B's or C's. They're now removed from both text and metadata.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 11.5 KB

Line
1	###########################################################################
2	#
3	# mgbuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document
27	# for mg to process
28
29
30	package mgbuildproc;
31
32	use classify;
33	use doc;
34	use docproc;
35	use util;
36
37	BEGIN {
38	@ISA = ('docproc');
39	}
40
41
42	sub new {
43	my ($class, $collection, $source_dir, $build_dir, $verbosity) = @_;
44	my $self = new docproc ();
45
46	$self->{'collection'} = $collection;
47	$self->{'source_dir'} = $source_dir;
48	$self->{'build_dir'} = $build_dir;
49	$self->{'verbosity'} = $verbosity;
50	$self->{'classifiers'} = [];
51	$self->{'mode'} = "text";
52	$self->{'assocdir'} = $build_dir;
53	$self->{'dontgdbm'} = {};
54	$self->{'index'} = "section:text";
55	$self->{'indexexparr'} = [];
56	$self->{'output_handle'} = "STDOUT";
57	$self->{'num_docs'} = 0;
58	$self->{'num_sections'} = 0;
59	$self->{'num_bytes'} = 0;
60
61	$self->{'indexing_text'} = 0;
62
63	return bless $self, $class;
64	}
65
66	sub reset {
67	my $self = shift (@_);
68
69	$self->{'num_docs'} = 0;
70	$self->{'num_sections'} = 0;
71	$self->{'num_bytes'} = 0;
72	}
73
74	sub get_num_docs {
75	my $self = shift (@_);
76
77	return $self->{'num_docs'};
78	}
79
80	sub get_num_sections {
81	my $self = shift (@_);
82
83	return $self->{'num_sections'};
84	}
85
86	sub get_num_bytes {
87	my $self = shift (@_);
88
89	return $self->{'num_bytes'};
90	}
91
92	sub set_output_handle {
93	my $self = shift (@_);
94	my ($handle) = @_;
95
96	$self->{'output_handle'} = $handle;
97	}
98
99	sub set_mode {
100	my $self = shift (@_);
101	my ($mode) = @_;
102
103	$self->{'mode'} = $mode;
104	}
105
106	sub set_assocdir {
107	my $self = shift (@_);
108	my ($assocdir) = @_;
109
110	$self->{'assocdir'} = $assocdir;
111	}
112
113	sub set_dontgdbm {
114	my $self = shift (@_);
115	my ($dontgdbm) = @_;
116
117	$self->{'dontgdbm'} = $dontgdbm;
118	}
119
120	sub set_index {
121	my $self = shift (@_);
122	my ($index, $indexexparr) = @_;
123
124	$self->{'index'} = $index;
125	$self->{'indexexparr'} = $indexexparr if defined $indexexparr;
126	}
127
128	sub set_classifiers {
129	my $self = shift (@_);
130	my ($classifiers) = @_;
131
132	$self->{'classifiers'} = $classifiers;
133	}
134
135	sub set_indexing_text {
136	my $self = shift (@_);
137	my ($indexing_text) = @_;
138
139	$self->{'indexing_text'} = $indexing_text;
140	}
141
142	sub process {
143	my $self = shift (@_);
144	my $method = $self->{'mode'};
145
146	$self->$method(@_);
147	}
148
149	# use 'Paged' if document has no more than 2 levels
150	# and each section at second level has a number for
151	# Title metadata
152	sub get_document_type {
153	my $self = shift (@_);
154	my ($doc_obj) = @_;
155
156	my $thistype = "VList";
157	my $childtype = "VList";
158	my $title;
159	my @tmp = ();
160
161	my $section = $doc_obj->get_top_section ();
162	my $first = 1;
163	while (defined $section) {
164	@tmp = split /\./, $section;
165	if (scalar(@tmp) > 1) {
166	return ($thistype, $childtype);
167	}
168	if (!$first) {
169	$title = $doc_obj->get_metadata_element ($section, "Title");
170	if (!defined $title \|\| $title !~ /^\d+$/) {
171	return ($thistype, $childtype);
172	}
173	}
174	$first = 0;
175	$section = $doc_obj->get_next_section($section);
176	}
177	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
178	$thistype = "Paged";
179	} else {
180	$thistype = "Invisible";
181	}
182	$childtype = "Paged";
183	return ($thistype, $childtype);
184	}
185
186	sub assoc_files {
187	my $self = shift (@_);
188	my ($doc_obj, $archivedir) = @_;
189	my ($afile);
190
191	foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
192	# if assoc file contains directory structure of
193	# its own use it, otherwise use HASH... directory
194	if ($assoc_file->[1] =~ /[\/\\]/) {
195	$afile = &util::filename_cat($self->{'assocdir'}, $assoc_file->[1]);
196	} else {
197	$afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
198	}
199	&util::hard_link ($assoc_file->[0], $afile);
200	}
201	}
202
203	sub infodb {
204	my $self = shift (@_);
205	my ($doc_obj, $filename) = @_;
206	my $handle = $self->{'output_handle'};
207	# $handle = "main::STDOUT";
208
209	my $doctype = $doc_obj->get_doc_type();
210
211	# only output this document if it is one to be indexed
212	return if ($doctype ne "indexed_doc");
213
214	my ($archivedir) = $filename =~ /^(.?)(?:\/\|\\)[^\/\\]$/;
215	$archivedir = "" unless defined $archivedir;
216	$archivedir =~ s/\\/\//g;
217	$archivedir =~ s/^\/+//;
218	$archivedir =~ s/\/+$//;
219
220	$self->assoc_files ($doc_obj, $archivedir);
221
222	# this is another document
223	$self->{'num_docs'} += 1 unless ($doctype eq "classification");
224
225	# is this a paged or a hierarchical document
226	my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
227
228	my $section = $doc_obj->get_top_section ();
229	my $doc_OID = $doc_obj->get_OID();
230	my $first = 1;
231	my $url = "";
232	while (defined $section) {
233	# update a few statistics
234	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
235	$self->{'num_sections'} += 1 unless ($doctype eq "classification");
236
237	# output the section name
238	if ($section eq "") { print $handle "[$doc_OID]\n"; }
239	else { print $handle "[$doc_OID.$section]\n"; }
240
241	# output the fact that this document is a document (unless doctype
242	# has been set to something else from within a plugin
243	my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
244	if (!defined $dtype \|\| $dtype !~ /\w/) {
245	print $handle "<doctype>doc\n";
246	}
247
248	# output whether this node contains text
249	if ($doc_obj->get_text_length($section) > 0) {
250	print $handle "<hastxt>1\n";
251	} else {
252	print $handle "<hastxt>0\n";
253	}
254
255	# output all the section metadata
256	my $metadata = $doc_obj->get_all_metadata ($section);
257	foreach my $pair (@$metadata) {
258	my ($field, $value) = (@$pair);
259
260	if ($field ne "Identifier" && $field !~ /^gsdl/ &&
261	defined $value && $value ne "") {
262
263	# escape problematic stuff
264	$value =~ s/\\/\\\\/g;
265	$value =~ s/\n/\\n/g;
266	$value =~ s/\r/\\r/g;
267
268	# special case for URL metadata
269	if ($field =~ /^URL$/i) {
270	$url .= "[$value]\n";
271	if ($section eq "") {$url .= "<section>$doc_OID\n";}
272	else {$url .= "<section>$doc_OID.$section\n";}
273	$url .= '-' x 70 . "\n";
274	}
275
276	if (!defined $self->{'dontgdbm'}->{$field}) {
277	print $handle "<$field>$value\n";
278	}
279	}
280	}
281
282	# output archivedir if at top level
283	if ($section eq $doc_obj->get_top_section()) {
284	print $handle "<archivedir>$archivedir\n";
285	}
286
287	# output document display type
288	if ($first) {
289	print $handle "<thistype>$thistype\n";
290	}
291
292	# output a list of children
293	my $children = $doc_obj->get_children ($section);
294	if (scalar(@$children) > 0) {
295	print $handle "<childtype>$childtype\n";
296	print $handle "<contains>";
297	my $firstchild = 1;
298	foreach my $child (@$children) {
299	print $handle ";" unless $firstchild;
300	$firstchild = 0;
301	if ($child =~ /^.*?\.(\d+)$/) {
302	print $handle "\".$1";
303	} else {
304	print $handle "\".$child";
305	}
306	# if ($child eq "") { print $handle "$doc_OID"; }
307	# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
308	# else { print $handle "$doc_OID.$section.$child"; }
309	}
310	print $handle "\n";
311	}
312
313	# output the matching document number
314	print $handle "<docnum>$self->{'num_sections'}\n";
315
316	print $handle '-' x 70, "\n";
317
318
319	# output a database entry for the document number
320	print $handle "[$self->{'num_sections'}]\n";
321	if ($section eq "") { print $handle "<section>$doc_OID\n"; }
322	else { print $handle "<section>$doc_OID.$section\n"; }
323	print $handle '-' x 70, "\n";
324
325	# output entry for url
326	if ($url ne "") {
327	print $handle $url;
328	}
329
330	$first = 0;
331	$section = $doc_obj->get_next_section($section);
332	}
333
334	# classify this document
335	&classify::classify_doc ($self->{'classifiers'}, $doc_obj);
336
337	}
338
339	sub find_paragraphs {
340	$_[1] =~ s/(<p\b)/\cC$1/gi;
341	}
342
343	sub filter_text {
344	# $self->filter_text ($field, $new_text);
345	# don't want to do anything for this version, however,
346	# in a particular collection you might want to override
347	# this method to post-process certain fields depending on
348	# the field, or whether we are outputting it for indexing
349	}
350
351	sub text {
352	my $self = shift (@_);
353	my ($doc_obj) = @_;
354	my $handle = $self->{'output_handle'};
355	my $indexed_doc = 1;
356
357	# only output this document if it is one to be indexed
358	return if ($doc_obj->get_doc_type() ne "indexed_doc");
359
360	# see if this document belongs to this subcollection
361	foreach my $indexexp (@{$self->{'indexexparr'}}) {
362	$indexed_doc = 0;
363	my ($field, $exp, $options) = split /\//, $indexexp;
364	if (defined ($field) && defined ($exp)) {
365	my ($bool) = $field =~ /^(.)/;
366	$field =~ s/^.// if $bool eq '!';
367	if ($field =~ /^filename$/i) {
368	$field = $doc_obj->get_source_filename();
369	} else {
370	$field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
371	}
372	next unless defined $field;
373	if ($bool eq '!') {
374	if ($options =~ /^i$/i) {
375	if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
376	} else {
377	if ($field !~ /$exp/) {$indexed_doc = 1; last;}
378	}
379	} else {
380	if ($options =~ /^i$/i) {
381	if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
382	} else {
383	if ($field =~ /$exp/) {$indexed_doc = 1; last;}
384	}
385	}
386	}
387	}
388
389	# this is another document
390	$self->{'num_docs'} += 1;
391
392	# get the parameters for the output
393	my ($level, $fields) = split (/:/, $self->{'index'});
394	$fields =~ s/\ball\b/Title,Creator,text/;
395	$fields =~ s/\btopall\b/topTitle,topCreator,toptext/;
396
397	my $doc_section = 0; # just for this document
398	my $text = "";
399	my $text_extra = "";
400
401	# get the text for this document
402	my $section = $doc_obj->get_top_section();
403	while (defined $section) {
404	# update a few statistics
405	$doc_section++;
406	$self->{'num_sections'} += 1;
407
408	if ($indexed_doc) {
409	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
410	foreach my $field (split (/,/, $fields)) {
411	# only deal with this field if it doesn't start with top or
412	# this is the first section
413	my $real_field = $field;
414	if (!($real_field =~ s/^top//) \|\| ($doc_section == 1)) {
415	my $new_text = "";
416	if ($real_field eq "text") {
417	$new_text = $doc_obj->get_text ($section);
418	$new_text =~ s/[\cB\cC]//g;
419	$self->find_paragraphs($new_text);
420
421	} else {
422	my $first = 1;
423	foreach $meta (@{$doc_obj->get_metadata ($section, $real_field)}) {
424	$meta =~ s/[\cB\cC]//g;
425	$new_text .= "\cC" unless $first;
426	$new_text .= $meta;
427	$first = 0;
428	}
429	}
430
431	# filter the text
432	$self->filter_text ($field, $new_text);
433
434	$text .= "$new_text\cC";
435	}
436	}
437	}
438
439	if ($level eq "document") { $text_extra .= "\cB"; }
440	else { $text .= "\cB"; }
441
442	$section = $doc_obj->get_next_section($section);
443	}
444
445	print $handle "$text$text_extra";
446	}
447
448	1;
449

Note: See TracBrowser for help on using the repository browser.

Download in other formats: