Context Navigation

source: main/tags/2.51/gsdl/perllib/mgppbuildproc.pm

Last change on this file was 7090, checked in by sjboddie, 20 years ago
Fixed some minor perl syntax problems that were causing warning messages when building mgpp collections.
Property svn:keywords set to `Author Date Id Revision`
File size: 19.4 KB

Line
1	###########################################################################
2	#
3	# mgppbuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document
27	# for mgpp to process
28
29
30	package mgppbuildproc;
31
32	eval {require bytes};
33
34	use classify;
35	use doc;
36	use docproc;
37	use util;
38
39
40	BEGIN {
41	@ISA = ('docproc');
42	}
43
44	#this must be the same as in mgppbuilder
45	%level_map = ('document'=>'Doc',
46	'section'=>'Sec',
47	'paragraph'=>'Para');
48
49	sub new {
50	my ($class, $collection, $source_dir, $build_dir,
51	$verbosity, $outhandle) = @_;
52	my $self = new docproc ();
53
54	# outhandle is where all the debugging info goes
55	# output_handle is where the output of the plugins is piped
56	# to (i.e. mg, gdbm etc.)
57	$outhandle = STDERR unless defined $outhandle;
58
59	$self->{'collection'} = $collection;
60	$self->{'source_dir'} = $source_dir;
61	$self->{'build_dir'} = $build_dir;
62	$self->{'verbosity'} = $verbosity;
63	$self->{'classifiers'} = [];
64	$self->{'mode'} = "text";
65	$self->{'assocdir'} = $build_dir;
66	$self->{'dontgdbm'} = {};
67	$self->{'index'} = "text";
68	$self->{'indexexparr'} = [];
69	$self->{'output_handle'} = "STDOUT";
70	$self->{'num_docs'} = 0;
71	$self->{'num_sections'} = 0;
72	$self->{'num_bytes'} = 0;
73	$self->{'num_processed_bytes'} = 0;
74	$self->{'store_text'} = 1;
75	$self->{'outhandle'} = $outhandle;
76
77	#used by browse interface
78	$self->{'doclist'} = [];
79
80	$self->{'indexing_text'} = 0;
81
82	#new ones for mgpp
83	$self->{'dontindex'} = {};
84	$self->{'indexfieldmap'} = {};
85	$self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index
86	$self->{'strip_html'}=1;
87
88
89	return bless $self, $class;
90	}
91
92	sub reset {
93	my $self = shift (@_);
94
95	$self->{'num_docs'} = 0;
96	$self->{'num_sections'} = 0;
97	$self->{'num_processed_bytes'} = 0;
98	$self->{'num_bytes'} = 0;
99	}
100
101	sub get_num_docs {
102	my $self = shift (@_);
103
104	return $self->{'num_docs'};
105	}
106
107	sub get_num_sections {
108	my $self = shift (@_);
109
110	return $self->{'num_sections'};
111	}
112
113	# num_bytes is the actual number of bytes in the collection
114	# this is normally the same as what's processed during text compression
115	sub get_num_bytes {
116	my $self = shift (@_);
117
118	return $self->{'num_bytes'};
119	}
120
121	# num_processed_bytes is the number of bytes actually passed
122	# to mgpp for the current index
123	sub get_num_processed_bytes {
124	my $self = shift (@_);
125
126	return $self->{'num_processed_bytes'};
127	}
128
129	sub set_output_handle {
130	my $self = shift (@_);
131	my ($handle) = @_;
132
133	$self->{'output_handle'} = $handle;
134	}
135
136	sub set_mode {
137	my $self = shift (@_);
138	my ($mode) = @_;
139
140	$self->{'mode'} = $mode;
141	}
142
143	sub set_assocdir {
144	my $self = shift (@_);
145	my ($assocdir) = @_;
146
147	$self->{'assocdir'} = $assocdir;
148	}
149
150	sub set_dontgdbm {
151	my $self = shift (@_);
152	my ($dontgdbm) = @_;
153
154	$self->{'dontgdbm'} = $dontgdbm;
155	}
156
157	sub set_index {
158	my $self = shift (@_);
159	my ($index, $indexexparr) = @_;
160
161	$self->{'index'} = $index;
162	$self->{'indexexparr'} = $indexexparr if defined $indexexparr;
163	}
164
165	sub get_index {
166	my $self = shift (@_);
167
168	return $self->{'index'};
169	}
170
171	sub set_classifiers {
172	my $self = shift (@_);
173	my ($classifiers) = @_;
174
175	$self->{'classifiers'} = $classifiers;
176	}
177
178	sub set_indexing_text {
179	my $self = shift (@_);
180	my ($indexing_text) = @_;
181
182	$self->{'indexing_text'} = $indexing_text;
183	}
184
185	sub get_indexing_text {
186	my $self = shift (@_);
187
188	return $self->{'indexing_text'};
189	}
190
191	sub set_store_text {
192	my $self = shift (@_);
193	my ($store_text) = @_;
194
195	$self->{'store_text'} = $store_text;
196	}
197
198	sub get_doc_list {
199	my $self = shift(@_);
200
201	return @{$self->{'doclist'}};
202	}
203
204	sub set_indexfieldmap {
205	my $self = shift (@_);
206	my ($indexmap) = @_;
207
208	$self->{'indexfieldmap'} = $indexmap;
209	}
210
211	sub get_indexfieldmap {
212	my $self = shift (@_);
213
214	return $self->{'indexfieldmap'};
215	}
216
217	sub set_levels {
218	my $self = shift (@_);
219	my ($levels) = @_;
220
221	$self->{'levels'} = $levels;
222	}
223
224	sub set_strip_html {
225	my $self = shift (@_);
226	my ($strip) = @_;
227	$self->{'strip_html'}=$strip;
228	}
229
230	sub process {
231	my $self = shift (@_);
232	my $method = $self->{'mode'};
233
234	$self->$method(@_);
235	}
236
237	# use 'Paged' if document has no more than 2 levels
238	# and each section at second level has a number for
239	# Title metadata
240	# also use Paged if gsdlthistype metadata is set to Paged
241	sub get_document_type {
242	my $self = shift (@_);
243	my ($doc_obj) = @_;
244
245	my $thistype = "VList";
246	my $childtype = "VList";
247	my $title;
248	my @tmp = ();
249
250	my $section = $doc_obj->get_top_section ();
251
252	my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
253	if (defined $gsdlthistype) {
254	if ($gsdlthistype eq "Paged") {
255	$thistype = "Paged";
256	$childtype = "Paged";
257	return ($thistype, $childtype);
258	} elsif ($gsdlthistype eq "Hierarchy") {
259	return ($thistype, $childtype); # use VList, VList
260	}
261	}
262	my $first = 1;
263	while (defined $section) {
264	@tmp = split /\./, $section;
265	if (scalar(@tmp) > 1) {
266	return ($thistype, $childtype);
267	}
268	if (!$first) {
269	$title = $doc_obj->get_metadata_element ($section, "Title");
270	if (!defined $title \|\| $title !~ /^\d+$/) {
271	return ($thistype, $childtype);
272	}
273	}
274	$first = 0;
275	$section = $doc_obj->get_next_section($section);
276	}
277	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
278	$thistype = "Paged";
279	} else {
280	$thistype = "Invisible";
281	}
282	$childtype = "Paged";
283	return ($thistype, $childtype);
284	}
285
286	sub assoc_files {
287	my $self = shift (@_);
288	my ($doc_obj, $archivedir) = @_;
289	my ($afile);
290
291	foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
292	# if assoc file starts with a slash, we put it relative to the assoc
293	# dir, otherwise it is relative to the HASH... directory
294	if ($assoc_file->[1] =~ m@^[/\\]@) {
295	$afile = &util::filename_cat($self->{'assocdir'},$assoc_file->[1]);
296	} else {
297	$afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
298	}
299	&util::hard_link ($assoc_file->[0], $afile);
300	}
301	}
302
303	sub infodb {
304	my $self = shift (@_);
305	my ($doc_obj, $filename) = @_;
306	my $handle = $self->{'output_handle'};
307
308	my $doctype = $doc_obj->get_doc_type();
309
310	# only output this document if it is one to be indexed
311	return if ($doctype ne "indexed_doc");
312
313	#if a Section level index is not built, the gdbm file should be at doc
314	#level not Section
315	my $docs_only = 1;
316	if ($self->{'levels'}->{'section'}) {
317	$docs_only = 0;
318	}
319
320	my ($archivedir) = $filename =~ /^(.?)(?:\/\|\\)[^\/\\]$/;
321	$archivedir = "" unless defined $archivedir;
322	$archivedir =~ s/\\/\//g;
323	$archivedir =~ s/^\/+//;
324	$archivedir =~ s/\/+$//;
325
326	$self->assoc_files ($doc_obj, $archivedir);
327
328	# this is another document
329	$self->{'num_docs'} += 1 unless ($doctype eq "classification");
330
331	# is this a paged or a hierarchical document
332	my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
333
334	my $section = $doc_obj->get_top_section ();
335	my $doc_OID = $doc_obj->get_OID();
336	my $first = 1;
337	my $url = "";
338	while (defined $section) {
339	# update a few statistics
340	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
341	$self->{'num_sections'} += 1 unless ($doctype eq "classification");
342
343	# output the section name
344	if ($section eq "") { print $handle "[$doc_OID]\n"; }
345	else { print $handle "[$doc_OID.$section]\n"; }
346
347	# output the fact that this document is a document (unless doctype
348	# has been set to something else from within a plugin
349	my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
350	if (!defined $dtype \|\| $dtype !~ /\w/) {
351	print $handle "<doctype>doc\n";
352	}
353
354	# output whether this node contains text
355	if ($doc_obj->get_text_length($section) > 0) {
356	print $handle "<hastxt>1\n";
357	} else {
358	print $handle "<hastxt>0\n";
359	}
360
361	# output all the section metadata
362	my $metadata = $doc_obj->get_all_metadata ($section);
363	foreach my $pair (@$metadata) {
364	my ($field, $value) = (@$pair);
365
366	if ($field ne "Identifier" && $field !~ /^gsdl/ &&
367	defined $value && $value ne "") {
368
369	# escape problematic stuff
370	$value =~ s/\\/\\\\/g;
371	$value =~ s/\n/\\n/g;
372	$value =~ s/\r/\\r/g;
373
374	# special case for URL metadata
375	if ($field =~ /^URL$/i) {
376	$url .= "[$value]\n";
377	if ($section eq "") {$url .= "<section>$doc_OID\n";}
378	else {$url .= "<section>$doc_OID.$section\n";}
379	$url .= '-' x 70 . "\n";
380	}
381
382	if (!defined $self->{'dontgdbm'}->{$field}) {
383	print $handle "<$field>$value\n";
384	}
385	}
386	}
387
388	# output archivedir if at top level
389	if ($section eq $doc_obj->get_top_section()) {
390	print $handle "<archivedir>$archivedir\n";
391	}
392
393	# output document display type
394	if ($first) {
395	print $handle "<thistype>$thistype\n";
396	}
397
398	if (!$docs_only) {
399	# output a list of children
400	my $children = $doc_obj->get_children ($section);
401	if (scalar(@$children) > 0) {
402	print $handle "<childtype>$childtype\n";
403	print $handle "<contains>";
404	my $firstchild = 1;
405	foreach $child (@$children) {
406	print $handle ";" unless $firstchild;
407	$firstchild = 0;
408	if ($child =~ /^.*?\.(\d+)$/) {
409	print $handle "\".$1";
410	} else {
411	print $handle "\".$child";
412	}
413	# if ($child eq "") { print $handle "$doc_OID"; }
414	# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
415	# else { print $handle "$doc_OID.$section.$child"; }
416	}
417	print $handle "\n";
418	}
419	#output the matching doc number
420	print $handle "<docnum>$self->{'num_sections'}\n";
421
422	} # if (!$docs_only)
423	else { #docs only, doc num is num_docs not num_sections
424	# output the matching document number
425	print $handle "<docnum>$self->{'num_docs'}\n";
426	}
427
428	print $handle '-' x 70, "\n";
429
430
431	# output a database entry for the document number
432	if ($docs_only) {
433	print $handle "[$self->{'num_docs'}]\n";
434	print $handle "<section>$doc_OID\n";
435	}
436	else {
437	print $handle "[$self->{'num_sections'}]\n";
438	if ($section eq "") { print $handle "<section>$doc_OID\n"; }
439	else { print $handle "<section>$doc_OID.$section\n"; }
440	}
441	print $handle '-' x 70, "\n";
442
443	# output entry for url
444	if ($url ne "") {
445	print $handle $url;
446	}
447
448	$first = 0;
449	$section = $doc_obj->get_next_section($section);
450	last if ($docs_only); # if no sections wanted, only gdbm the docs
451	}
452
453	#add this document to the browse structure
454	push(@{$self->{'doclist'}},$doc_obj->get_OID())
455	unless ($doctype eq "classification");
456
457	# classify this document
458	&classify::classify_doc ($self->{'classifiers'}, $doc_obj);
459
460	}
461
462	#sub find_paragraphs {
463	# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
464	#}
465
466	#this function strips the html tags from the doc if ($strip_html) and
467	# if ($para) replaces <p> with <Paragraph> tags.
468	# if both are false, the original text is returned
469	#assumes that <pre> and </pre> have no spaces, and removes all < and > inside
470	#these tags
471	sub preprocess_text {
472	my $self = shift (@_);
473	my ($text, $strip_html, $para) = @_;
474	my ($outtext) = "";
475	if ($strip_html) {
476	while ($text =~ /<([^>]*)>/ && $text ne "") {
477
478	$tag = $1;
479	$outtext .= $`." "; #add everything before the matched tag
480	$text = $'; #everything after the matched tag
481	if ($para && $tag =~ /^\s*p\s/i) {
482	$outtext .= $para;
483	}
484	elsif ($tag =~ /^pre$/) { # a pre tag
485	$text =~ /<\/pre>/; # find the closing pre tag
486	my $tmp_text = $`; #everything before the closing pre tag
487	$text = $'; #everything after the </pre>
488	$tmp_text =~ s/[<>]//g; # remove all < and >
489	$outtext.= $tmp_text . " ";
490	}
491	}
492
493	$outtext .= $text; # add any remaining text
494	return $outtext;
495	} #if strip_html
496
497	#if ($para) {
498	#$text =~ s/(<p\b)/$para$1/gi;
499	#return $text;
500	# }
501	return $text;
502	}
503
504
505
506	sub filter_text {
507	# $self->filter_text ($field, $new_text);
508	# don't want to do anything for this version, however,
509	# in a particular collection you might want to override
510	# this method to post-process certain fields depending on
511	# the field, or whether we are outputting it for indexing
512	}
513
514	sub text {
515	my $self = shift (@_);
516	my ($doc_obj) = @_;
517	my $handle = $self->{'output_handle'};
518	my $outhandle = $self->{'outhandle'};
519	my $indexed_doc = 1;
520
521	# only output this document if it is one to be indexed
522	return if ($doc_obj->get_doc_type() ne "indexed_doc");
523
524	# see if this document belongs to this subcollection
525	foreach my $indexexp (@{$self->{'indexexparr'}}) {
526	$indexed_doc = 0;
527	my ($field, $exp, $options) = split /\//, $indexexp;
528	if (defined ($field) && defined ($exp)) {
529	my ($bool) = $field =~ /^(.)/;
530	$field =~ s/^.// if $bool eq '!';
531	if ($field =~ /^filename$/i) {
532	$field = $doc_obj->get_source_filename();
533	} else {
534	$field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
535	}
536	next unless defined $field;
537	if ($bool eq '!') {
538	if ($options =~ /^i$/i) {
539	if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
540	} else {
541	if ($field !~ /$exp/) {$indexed_doc = 1; last;}
542	}
543	} else {
544	if ($options =~ /^i$/i) {
545	if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
546	} else {
547	if ($field =~ /$exp/) {$indexed_doc = 1; last;}
548	}
549	}
550	}
551	}
552
553	# this is another document
554	$self->{'num_docs'} += 1;
555
556	# get the parameters for the output
557	# split on : just in case there is subcoll and lang stuff
558	my ($fields) = split (/:/, $self->{'index'});
559
560	my ($documenttag) = "";
561	my($documentendtag) = "";
562	if ($self->{'levels'}->{'document'}) {
563	$documenttag = "\n<". $level_map{'document'} . ">\n";
564	$documentendtag = "\n</". $level_map{'document'} . ">\n";
565	}
566	my ($sectiontag) = "";
567	if ($self->{'levels'}->{'section'}) {
568	$sectiontag = "\n<". $level_map{'section'} . ">\n";
569	}
570	my ($paratag) = "";
571	if ($self->{'levels'}->{'paragraph'}) {
572	if ($self->{'strip_html'}) {
573	$paratag = "<". $level_map{'paragraph'} . ">";
574	} else {
575	print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
576	}
577	}
578
579	my $doc_section = 0; # just for this document
580
581	my $text = $documenttag;
582
583	# get the text for this document
584	my $section = $doc_obj->get_top_section();
585	while (defined $section) {
586	# update a few statistics
587	$doc_section++;
588	$self->{'num_sections'} += 1;
589	$text .= "$sectiontag";
590
591	if ($indexed_doc) {
592	if ($self->{'indexing_text'}) {
593	$text .= "$paratag"; # only add para tags for indexing
594	# note that we assume that metadata will not be asked for for the compressed text, so we add para tags without checking for indexing_text
595	}
596	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
597	foreach my $field (split (/,/, $fields)) {
598	# only deal with this field if it doesn't start with top or
599	# this is the first section
600	my $real_field = $field;
601	if (!($real_field =~ s/^top//) \|\| ($doc_section == 1)) {
602	my $new_text = "";
603	my $tmp_text = "";
604	if ($real_field eq "text") {
605	if ($self->{'indexing_text'}) { #tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
606	$new_text .= "$paratag<TX>\n";
607	$tmp_text .= $doc_obj->get_text ($section);
608	$tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</TX>$paratag<TX>");
609
610	$new_text .= "$tmp_text</TX>\n";
611	#if (!defined $self->{'indexfields'}->{'TextOnly'}) {
612	#$self->{'indexfields'}->{'TextOnly'} = 1;
613	#}
614	}
615	else { # leave html stuff in, and dont add Paragraph tags - never retrieve paras at the moment
616	$new_text .= $doc_obj->get_text ($section) if $self->{'store_text'};
617	}
618	} else { # metadata field
619	if ($real_field eq "allfields") { #ignore
620	}
621	elsif ($real_field eq "metadata") { # insert all metadata
622	#except gsdl stuff
623	my $shortname = "";
624	my $metadata = $doc_obj->get_all_metadata ($section);
625	foreach $pair (@$metadata) {
626	my ($mfield, $mvalue) = (@$pair);
627	# check fields here, maybe others dont want - change to use dontindex!!
628	if ($mfield ne "Identifier"
629	&& $mfield !~ /^gsdl/
630	&& $mfield ne "classifytype"
631	&& $mfield ne "assocfilepath"
632	&& defined $mvalue && $mvalue ne "") {
633
634	if (defined $self->{'indexfieldmap'}->{$mfield}) {
635	$shortname = $self->{'indexfieldmap'}->{$mfield};
636	}
637	else {
638	$shortname = $self->create_shortname($mfield);
639	$self->{'indexfieldmap'}->{$mfield} = $shortname;
640	$self->{'indexfieldmap'}->{$shortname} = 1;
641	}
642	$new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
643	if (!defined $self->{'indexfields'}->{$mfield}) {
644	$self->{'indexfields'}->{$mfield} = 1;
645	}
646	}
647	}
648
649	}
650	else { #individual metadata specified
651	my $shortname="";
652	#if (!defined $self->{'indexfields'}->{$real_field}) {
653	#$self->{'indexfields'}->{$real_field} = 1;
654	#}
655	if (defined $self->{'indexfieldmap'}->{$real_field}) {
656	$shortname = $self->{'indexfieldmap'}->{$real_field};
657	}
658	else {
659	$shortname = $self->create_shortname($real_field);
660	$self->{'indexfieldmap'}->{$real_field} = $shortname;
661	$self->{'indexfieldmap'}->{$shortname} = 1;
662	}
663	foreach $item (@{$doc_obj->get_metadata ($section, $real_field)}) {
664	$new_text .= "$paratag<$shortname>$item</$shortname>\n";
665	}
666	}
667
668	}
669
670	# filter the text
671	$self->filter_text ($field, $new_text);
672
673	$self->{'num_processed_bytes'} += length ($new_text);
674	$text .= "$new_text";
675	}
676	}
677	} # if (indexed_doc)
678
679	$section = $doc_obj->get_next_section($section);
680	} #while defined section
681	print $handle "$text\n$documentendtag";
682
683	}
684
685	#chooses the first two letters or digits for the shortname
686	#now ignores non-letdig characters
687	sub create_shortname {
688	$self = shift(@_);
689
690	my ($realname) = @_;
691	#take the first two chars
692	my $shortname;
693	if ($realname =~ /^[^\w](\w)[^\w](\w)/) {
694	$shortname = "$1$2";
695	} else {
696	# there aren't two letdig's in the field - try arbitrary combinations
697	$realname = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
698	$shortname = "AB";
699	}
700	$shortname =~ tr/a-z/A-Z/;
701
702	#if already used, take the first and third letdigs and so on
703	$count = 1;
704	while (defined $self->{'indexfieldmap'}->{$shortname}) {
705	if ($realname =~ /^[^\w](\w)([^\w]\w){$count}[^\w]*(\w)/) {
706	$shortname = "$1$3";
707	$count++;
708	$shortname =~ tr/a-z/A-Z/;
709
710	}
711	else {
712	#remove up to and incl the first letdig
713	$realname =~ s/^[^\w]*\w//;
714	$count = 0;
715	}
716	}
717
718	return $shortname;
719	}
720
721	1;
722

Note: See TracBrowser for help on using the repository browser.

Download in other formats: