Context Navigation

source: trunk/gsdl/perllib/mgppbuildproc.pm@ 5608

Last change on this file since 5608 was 4811, checked in by kjdon, 21 years ago
levels are now specified using upper or lower case, eg Section or section. if levels aren't specified, use document, otherwise use only what is specified eg levels section will only give section level. we use Doc, Sec, Para when passing the stuff to mgpp. the build.cfg file now contains indexlevels and textlevel entries - these give the actual names used by mgpp, and mean that the c++ code no longer has to assume them. collection meta can be specified for the levels, otherwise _textdocument_, _textsection_ and _textparagraph_ will be used.
Property svn:keywords set to `Author Date Id Revision`
File size: 19.0 KB

Line
1	###########################################################################
2	#
3	# mgppbuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document
27	# for mgpp to process
28
29
30	package mgppbuildproc;
31
32	eval {require bytes};
33
34	use classify;
35	use doc;
36	use docproc;
37	use util;
38
39
40	BEGIN {
41	@ISA = ('docproc');
42	}
43
44	#this must be the same as in mgppbuilder
45	%level_map = ('document'=>'Doc',
46	'section'=>'Sec',
47	'paragraph'=>'Para');
48
49	sub new {
50	my ($class, $collection, $source_dir, $build_dir,
51	$verbosity, $outhandle) = @_;
52	my $self = new docproc ();
53
54	# outhandle is where all the debugging info goes
55	# output_handle is where the output of the plugins is piped
56	# to (i.e. mg, gdbm etc.)
57	$outhandle = STDERR unless defined $outhandle;
58
59	$self->{'collection'} = $collection;
60	$self->{'source_dir'} = $source_dir;
61	$self->{'build_dir'} = $build_dir;
62	$self->{'verbosity'} = $verbosity;
63	$self->{'classifiers'} = [];
64	$self->{'mode'} = "text";
65	$self->{'assocdir'} = $build_dir;
66	$self->{'dontgdbm'} = {};
67	$self->{'index'} = "text";
68	$self->{'indexexparr'} = [];
69	$self->{'output_handle'} = "STDOUT";
70	$self->{'num_docs'} = 0;
71	$self->{'num_sections'} = 0;
72	$self->{'num_bytes'} = 0;
73	$self->{'num_processed_bytes'} = 0;
74	$self->{'store_text'} = 1;
75	$self->{'outhandle'} = $outhandle;
76
77	#used by browse interface
78	$self->{'doclist'} = [];
79
80	$self->{'indexing_text'} = 0;
81
82	#new ones for mgpp
83	$self->{'dontindex'} = {};
84	$self->{'indexfieldmap'} = {};
85	$self->{'indexfields'} = {}; # only put in the ones that are not specified directly in the index
86	$self->{'strip_html'}=1;
87
88
89	return bless $self, $class;
90	}
91
92	sub reset {
93	my $self = shift (@_);
94
95	$self->{'num_docs'} = 0;
96	$self->{'num_sections'} = 0;
97	$self->{'num_processed_bytes'} = 0;
98	$self->{'num_bytes'} = 0;
99	}
100
101	sub get_num_docs {
102	my $self = shift (@_);
103
104	return $self->{'num_docs'};
105	}
106
107	sub get_num_sections {
108	my $self = shift (@_);
109
110	return $self->{'num_sections'};
111	}
112
113	# num_bytes is the actual number of bytes in the collection
114	# this is normally the same as what's processed during text compression
115	sub get_num_bytes {
116	my $self = shift (@_);
117
118	return $self->{'num_bytes'};
119	}
120
121	# num_processed_bytes is the number of bytes actually passed
122	# to mgpp for the current index
123	sub get_num_processed_bytes {
124	my $self = shift (@_);
125
126	return $self->{'num_processed_bytes'};
127	}
128
129	sub set_output_handle {
130	my $self = shift (@_);
131	my ($handle) = @_;
132
133	$self->{'output_handle'} = $handle;
134	}
135
136	sub set_mode {
137	my $self = shift (@_);
138	my ($mode) = @_;
139
140	$self->{'mode'} = $mode;
141	}
142
143	sub set_assocdir {
144	my $self = shift (@_);
145	my ($assocdir) = @_;
146
147	$self->{'assocdir'} = $assocdir;
148	}
149
150	sub set_dontgdbm {
151	my $self = shift (@_);
152	my ($dontgdbm) = @_;
153
154	$self->{'dontgdbm'} = $dontgdbm;
155	}
156
157	sub set_index {
158	my $self = shift (@_);
159	my ($index, $indexexparr) = @_;
160
161	$self->{'index'} = $index;
162	$self->{'indexexparr'} = $indexexparr if defined $indexexparr;
163	}
164
165	sub get_index {
166	my $self = shift (@_);
167
168	return $self->{'index'};
169	}
170
171	sub set_classifiers {
172	my $self = shift (@_);
173	my ($classifiers) = @_;
174
175	$self->{'classifiers'} = $classifiers;
176	}
177
178	sub set_indexing_text {
179	my $self = shift (@_);
180	my ($indexing_text) = @_;
181
182	$self->{'indexing_text'} = $indexing_text;
183	}
184
185	sub get_indexing_text {
186	my $self = shift (@_);
187
188	return $self->{'indexing_text'};
189	}
190
191	sub set_store_text {
192	my $self = shift (@_);
193	my ($store_text) = @_;
194
195	$self->{'store_text'} = $store_text;
196	}
197
198	sub get_doc_list {
199	my $self = shift(@_);
200
201	return @{$self->{'doclist'}};
202	}
203
204	sub set_indexfieldmap {
205	my $self = shift (@_);
206	my ($indexmap) = @_;
207
208	$self->{'indexfieldmap'} = $indexmap;
209	}
210
211	sub get_indexfieldmap {
212	my $self = shift (@_);
213
214	return $self->{'indexfieldmap'};
215	}
216
217	sub set_levels {
218	my $self = shift (@_);
219	my ($levels) = @_;
220
221	$self->{'levels'} = $levels;
222	}
223
224	sub set_strip_html {
225	my $self = shift (@_);
226	my ($strip) = @_;
227	$self->{'strip_html'}=$strip;
228	}
229
230	sub process {
231	my $self = shift (@_);
232	my $method = $self->{'mode'};
233
234	$self->$method(@_);
235	}
236
237	# use 'Paged' if document has no more than 2 levels
238	# and each section at second level has a number for
239	# Title metadata
240	sub get_document_type {
241	my $self = shift (@_);
242	my ($doc_obj) = @_;
243
244	my $thistype = "VList";
245	my $childtype = "VList";
246	my $title;
247	my @tmp = ();
248
249	my $section = $doc_obj->get_top_section ();
250	my $first = 1;
251	while (defined $section) {
252	@tmp = split /\./, $section;
253	if (scalar(@tmp) > 1) {
254	return ($thistype, $childtype);
255	}
256	if (!$first) {
257	$title = $doc_obj->get_metadata_element ($section, "Title");
258	if (!defined $title \|\| $title !~ /^\d+$/) {
259	return ($thistype, $childtype);
260	}
261	}
262	$first = 0;
263	$section = $doc_obj->get_next_section($section);
264	}
265	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
266	$thistype = "Paged";
267	} else {
268	$thistype = "Invisible";
269	}
270	$childtype = "Paged";
271	return ($thistype, $childtype);
272	}
273
274	sub assoc_files {
275	my $self = shift (@_);
276	my ($doc_obj, $archivedir) = @_;
277	my ($afile);
278
279	foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
280	# if assoc file starts with a slash, we put it relative to the assoc
281	# dir, otherwise it is relative to the HASH... directory
282	if ($assoc_file->[1] =~ m@^[/\\]@) {
283	$afile = &util::filename_cat($self->{'assocdir'},$assoc_file->[1]);
284	} else {
285	$afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
286	}
287	&util::hard_link ($assoc_file->[0], $afile);
288	}
289	}
290
291	sub infodb {
292	my $self = shift (@_);
293	my ($doc_obj, $filename) = @_;
294	my $handle = $self->{'output_handle'};
295
296	my $doctype = $doc_obj->get_doc_type();
297
298	# only output this document if it is one to be indexed
299	return if ($doctype ne "indexed_doc");
300
301	#if a Section level index is not built, the gdbm file should be at doc
302	#level not Section
303	my $docs_only = 1;
304	if ($self->{'levels'}->{'section'}) {
305	$docs_only = 0;
306	}
307
308	my ($archivedir) = $filename =~ /^(.?)(?:\/\|\\)[^\/\\]$/;
309	$archivedir = "" unless defined $archivedir;
310	$archivedir =~ s/\\/\//g;
311	$archivedir =~ s/^\/+//;
312	$archivedir =~ s/\/+$//;
313
314	$self->assoc_files ($doc_obj, $archivedir);
315
316	# this is another document
317	$self->{'num_docs'} += 1 unless ($doctype eq "classification");
318
319	# is this a paged or a hierarchical document
320	my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
321
322	my $section = $doc_obj->get_top_section ();
323	my $doc_OID = $doc_obj->get_OID();
324	my $first = 1;
325	my $url = "";
326	while (defined $section) {
327	# update a few statistics
328	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
329	$self->{'num_sections'} += 1 unless ($doctype eq "classification");
330
331	# output the section name
332	if ($section eq "") { print $handle "[$doc_OID]\n"; }
333	else { print $handle "[$doc_OID.$section]\n"; }
334
335	# output the fact that this document is a document (unless doctype
336	# has been set to something else from within a plugin
337	my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
338	if (!defined $dtype \|\| $dtype !~ /\w/) {
339	print $handle "<doctype>doc\n";
340	}
341
342	# output whether this node contains text
343	if ($doc_obj->get_text_length($section) > 0) {
344	print $handle "<hastxt>1\n";
345	} else {
346	print $handle "<hastxt>0\n";
347	}
348
349	# output all the section metadata
350	my $metadata = $doc_obj->get_all_metadata ($section);
351	foreach my $pair (@$metadata) {
352	my ($field, $value) = (@$pair);
353
354	if ($field !~ /^(?:Identifier\|Encoding\|Language\|gsdl)/
355	&& defined $value && $value ne "") {
356
357	# escape problematic stuff
358	$value =~ s/\\/\\\\/g;
359	$value =~ s/\n/\\n/g;
360	$value =~ s/\r/\\r/g;
361
362	# special case for URL metadata
363	if ($field =~ /^URL$/i) {
364	$url .= "[$value]\n";
365	if ($section eq "") {$url .= "<section>$doc_OID\n";}
366	else {$url .= "<section>$doc_OID.$section\n";}
367	$url .= '-' x 70 . "\n";
368	}
369
370	if (!defined $self->{'dontgdbm'}->{$field}) {
371	print $handle "<$field>$value\n";
372	}
373	}
374	}
375
376	# output archivedir if at top level
377	if ($section eq $doc_obj->get_top_section()) {
378	print $handle "<archivedir>$archivedir\n";
379	}
380
381	# output document display type
382	if ($first) {
383	print $handle "<thistype>$thistype\n";
384	}
385
386	if (!$docs_only) {
387	# output a list of children
388	my $children = $doc_obj->get_children ($section);
389	if (scalar(@$children) > 0) {
390	print $handle "<childtype>$childtype\n";
391	print $handle "<contains>";
392	my $firstchild = 1;
393	foreach $child (@$children) {
394	print $handle ";" unless $firstchild;
395	$firstchild = 0;
396	if ($child =~ /^.*?\.(\d+)$/) {
397	print $handle "\".$1";
398	} else {
399	print $handle "\".$child";
400	}
401	# if ($child eq "") { print $handle "$doc_OID"; }
402	# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
403	# else { print $handle "$doc_OID.$section.$child"; }
404	}
405	print $handle "\n";
406	}
407	#output the matching doc number
408	print $handle "<docnum>$self->{'num_sections'}\n";
409
410	} # if (!$docs_only)
411	else { #docs only, doc num is num_docs not num_sections
412	# output the matching document number
413	print $handle "<docnum>$self->{'num_docs'}\n";
414	}
415
416	print $handle '-' x 70, "\n";
417
418
419	# output a database entry for the document number
420	if ($docs_only) {
421	print $handle "[$self->{'num_docs'}]\n";
422	print $handle "<section>$doc_OID\n";
423	}
424	else {
425	print $handle "[$self->{'num_sections'}]\n";
426	if ($section eq "") { print $handle "<section>$doc_OID\n"; }
427	else { print $handle "<section>$doc_OID.$section\n"; }
428	}
429	print $handle '-' x 70, "\n";
430
431	# output entry for url
432	if ($url ne "") {
433	print $handle $url;
434	}
435
436	$first = 0;
437	$section = $doc_obj->get_next_section($section);
438	last if ($docs_only); # if no sections wanted, only gdbm the docs
439	}
440
441	#add this document to the browse structure
442	push(@{$self->{'doclist'}},$doc_obj->get_OID())
443	unless ($doctype eq "classification");
444
445	# classify this document
446	&classify::classify_doc ($self->{'classifiers'}, $doc_obj);
447
448	}
449
450	#sub find_paragraphs {
451	# $_[1] =~ s/(<p\b)/<Paragraph>$1/gi;
452	#}
453
454	#this function strips the html tags from the doc if ($strip_html) and
455	# if ($para) replaces <p> with <Paragraph> tags.
456	# if both are false, the original text is returned
457	#assumes that <pre> and </pre> have no spaces, and removes all < and > inside
458	#these tags
459	sub preprocess_text {
460	my $self = shift (@_);
461	my ($text, $strip_html, $para) = @_;
462	my ($outtext) = "";
463	if ($strip_html) {
464	while ($text =~ /<([^>]*)>/ && $text ne "") {
465
466	$tag = $1;
467	$outtext .= $`." "; #add everything before the matched tag
468	$text = $'; #everything after the matched tag
469	if ($para && $tag =~ /^\s*p\s/i) {
470	$outtext .= $para;
471	}
472	elsif ($tag =~ /^pre$/) { # a pre tag
473	$text =~ /<\/pre>/; # find the closing pre tag
474	my $tmp_text = $`; #everything before the closing pre tag
475	$text = $'; #everything after the </pre>
476	$tmp_text =~ s/[<>]//g; # remove all < and >
477	$outtext.= $tmp_text . " ";
478	}
479	}
480
481	$outtext .= $text; # add any remaining text
482	return $outtext;
483	} #if strip_html
484
485	#if ($para) {
486	#$text =~ s/(<p\b)/$para$1/gi;
487	#return $text;
488	# }
489	return $text;
490	}
491
492
493
494	sub filter_text {
495	# $self->filter_text ($field, $new_text);
496	# don't want to do anything for this version, however,
497	# in a particular collection you might want to override
498	# this method to post-process certain fields depending on
499	# the field, or whether we are outputting it for indexing
500	}
501
502	sub text {
503	my $self = shift (@_);
504	my ($doc_obj) = @_;
505	my $handle = $self->{'output_handle'};
506	my $outhandle = $self->{'outhandle'};
507	my $indexed_doc = 1;
508
509	# only output this document if it is one to be indexed
510	return if ($doc_obj->get_doc_type() ne "indexed_doc");
511
512	# see if this document belongs to this subcollection
513	foreach my $indexexp (@{$self->{'indexexparr'}}) {
514	$indexed_doc = 0;
515	my ($field, $exp, $options) = split /\//, $indexexp;
516	if (defined ($field) && defined ($exp)) {
517	my ($bool) = $field =~ /^(.)/;
518	$field =~ s/^.// if $bool eq '!';
519	if ($field =~ /^filename$/i) {
520	$field = $doc_obj->get_source_filename();
521	} else {
522	$field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
523	}
524	next unless defined $field;
525	if ($bool eq '!') {
526	if ($options =~ /^i$/i) {
527	if ($field !~ /$exp/i) {$indexed_doc = 1; last;}
528	} else {
529	if ($field !~ /$exp/) {$indexed_doc = 1; last;}
530	}
531	} else {
532	if ($options =~ /^i$/i) {
533	if ($field =~ /$exp/i) {$indexed_doc = 1; last;}
534	} else {
535	if ($field =~ /$exp/) {$indexed_doc = 1; last;}
536	}
537	}
538	}
539	}
540
541	# this is another document
542	$self->{'num_docs'} += 1;
543
544	# get the parameters for the output
545	my ($fields) = $self->{'index'};
546
547	my ($documenttag) = "";
548	my($documentendtag) = "";
549	if ($self->{'levels'}->{'document'}) {
550	$documenttag = "\n<". %level_map->{'document'} . ">\n";
551	$documentendtag = "\n</". %level_map->{'document'} . ">\n";
552	}
553	my ($sectiontag) = "";
554	if ($self->{'levels'}->{'section'}) {
555	$sectiontag = "\n<". %level_map->{'section'} . ">\n";
556	}
557	my ($paratag) = "";
558	if ($self->{'levels'}->{'paragraph'}) {
559	if ($self->{'strip_html'}) {
560	$paratag = "<". %level_map->{'paragraph'} . ">";
561	} else {
562	print $outhandle "Paragraph level can not be used with no_strip_html!. Not indexing Paragraphs.\n";
563	}
564	}
565
566	my $doc_section = 0; # just for this document
567
568	my $text = $documenttag;
569
570	# get the text for this document
571	my $section = $doc_obj->get_top_section();
572	while (defined $section) {
573	# update a few statistics
574	$doc_section++;
575	$self->{'num_sections'} += 1;
576	$text .= "$sectiontag";
577
578	if ($indexed_doc) {
579	if ($self->{'indexing_text'}) {
580	$text .= "$paratag"; # only add para tags for indexing
581	# note that we assume that metadata will not be asked for for the compressed text, so we add para tags without checking for indexing_text
582	}
583	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
584	foreach my $field (split (/,/, $fields)) {
585	# only deal with this field if it doesn't start with top or
586	# this is the first section
587	my $real_field = $field;
588	if (!($real_field =~ s/^top//) \|\| ($doc_section == 1)) {
589	my $new_text = "";
590	my $tmp_text = "";
591	if ($real_field eq "text") {
592	if ($self->{'indexing_text'}) { #tag the text with <Text>...</Text>, add the <Paragraph> tags and strip out html if needed
593	$new_text .= "$paratag<TX>\n";
594	$tmp_text .= $doc_obj->get_text ($section);
595	$tmp_text = $self->preprocess_text($tmp_text, $self->{'strip_html'}, "</TX>$paratag<TX>");
596
597	$new_text .= "$tmp_text</TX>\n";
598	#if (!defined $self->{'indexfields'}->{'TextOnly'}) {
599	#$self->{'indexfields'}->{'TextOnly'} = 1;
600	#}
601	}
602	else { # leave html stuff in, and dont add Paragraph tags - never retrieve paras at the moment
603	$new_text .= $doc_obj->get_text ($section) if $self->{'store_text'};
604	}
605	} else { # metadata field
606	if ($real_field eq "allfields") { #ignore
607	}
608	elsif ($real_field eq "metadata") { # insert all metadata
609	#except gsdl stuff
610	my $shortname = "";
611	my $metadata = $doc_obj->get_all_metadata ($section);
612	foreach $pair (@$metadata) {
613	my ($mfield, $mvalue) = (@$pair);
614	# check fields here, maybe others dont want - change to use dontindex!!
615	if ($mfield !~ /^(?:Identifier\|Encoding\|Language\|gsdl)/
616	&& $mfield ne "classifytype"
617	&& $mfield ne "assocfilepath"
618	&& defined $mvalue && $mvalue ne "") {
619
620	if (defined $self->{'indexfieldmap'}->{$mfield}) {
621	$shortname = $self->{'indexfieldmap'}->{$mfield};
622	}
623	else {
624	$shortname = $self->create_shortname($mfield);
625	$self->{'indexfieldmap'}->{$mfield} = $shortname;
626	$self->{'indexfieldmap'}->{$shortname} = 1;
627	}
628	$new_text .= "$paratag<$shortname>$mvalue</$shortname>\n";
629	if (!defined $self->{'indexfields'}->{$mfield}) {
630	$self->{'indexfields'}->{$mfield} = 1;
631	}
632	}
633	}
634
635	}
636	else { #individual metadata specified
637	my $shortname="";
638	#if (!defined $self->{'indexfields'}->{$real_field}) {
639	#$self->{'indexfields'}->{$real_field} = 1;
640	#}
641	if (defined $self->{'indexfieldmap'}->{$real_field}) {
642	$shortname = $self->{'indexfieldmap'}->{$real_field};
643	}
644	else {
645	$shortname = $self->create_shortname($real_field);
646	$self->{'indexfieldmap'}->{$real_field} = $shortname;
647	$self->{'indexfieldmap'}->{$shortname} = 1;
648	}
649	foreach $item (@{$doc_obj->get_metadata ($section, $real_field)}) {
650	$new_text .= "$paratag<$shortname>$item</$shortname>\n";
651	}
652	}
653
654	}
655
656	# filter the text
657	$self->filter_text ($field, $new_text);
658
659	$self->{'num_processed_bytes'} += length ($new_text);
660	$text .= "$new_text";
661	}
662	}
663	} # if (indexed_doc)
664
665	$section = $doc_obj->get_next_section($section);
666	} #while defined section
667	print $handle "$text\n$documentendtag";
668
669	}
670
671	#chooses the first two letters or digits for the shortname
672	#now ignores non-letdig characters
673	sub create_shortname {
674	$self = shift(@_);
675
676	my ($realname) = @_;
677	#take the first two chars
678	my $shortname;
679	if ($realname =~ /^[^\w](\w)[^\w](\w)/) {
680	$shortname = "$1$2";
681	} else {
682	# there aren't two letdig's in the field - try arbitrary combinations
683	$realname = "ABCDEFGHIJKLMNOPQRSTUVWXYZ";
684	$shortname = "AB";
685	}
686	$shortname =~ tr/a-z/A-Z/;
687
688	#if already used, take the first and third letdigs and so on
689	$count = 1;
690	while (defined $self->{'indexfieldmap'}->{$shortname}) {
691	if ($realname =~ /^[^\w](\w)([^\w]\w){$count}[^\w]*(\w)/) {
692	$shortname = "$1$3";
693	$count++;
694	$shortname =~ tr/a-z/A-Z/;
695
696	}
697	else {
698	#remove up to and incl the first letdig
699	$realname =~ s/^[^\w]*\w//;
700	$count = 0;
701	}
702	}
703
704	return $shortname;
705	}
706
707	1;
708

Note: See TracBrowser for help on using the repository browser.

Download in other formats: