Context Navigation

source: gsdl/trunk/perllib/basebuildproc.pm@ 17564

Last change on this file since 17564 was 17564, checked in by kjdon, 16 years ago
fixed up some stuff to do with indexfieldmap. still working on it, but want to commit what I've done
Property svn:keywords set to `Author Date Id Revision`
File size: 18.7 KB

Line
1	###########################################################################
2	#
3	# basebuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document for indexing (should be
27	# implemented by subclass) and storing in the database
28
29	package basebuildproc;
30
31	eval {require bytes};
32
33	use classify;
34	use dbutil;
35	use doc;
36	use docproc;
37	use strict; no strict 'subs';
38	use util;
39
# Set up inheritance at compile time: basebuildproc is a subclass of docproc.
BEGIN {
    @basebuildproc::ISA = qw(docproc);
}
43
# Constructor.
#
# Positional arguments:
#   $collection - stored as-is under 'collection'
#   $source_dir - stored as-is under 'source_dir'
#   $build_dir  - build directory; also the initial value of 'assocdir'
#   $keepold    - when true, the document/section/byte counters are seeded
#                 from an existing build.cfg (looked for in $build_dir first,
#                 then in $ENV{GSDLCOLLECTDIR}/index)
#   $verbosity  - stored as-is under 'verbosity'
#   $outhandle  - handle for debugging output; defaults to "STDERR"
#
# Returns the docproc-created object reblessed into $class.
#
# Declared without a prototype: prototypes are ignored on method calls, so
# the old "()" only misdescribed the argument list.
sub new
{
    my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;
    my $self = docproc->new();

    # outhandle is where all the debugging info goes
    # output_handle is where the output of the plugins is piped
    # to (i.e. mg, database etc.)
    $outhandle = "STDERR" unless defined $outhandle;

    $self->{'collection'} = $collection;
    $self->{'source_dir'} = $source_dir;
    $self->{'build_dir'} = $build_dir;
    $self->{'keepold'} = $keepold;
    $self->{'verbosity'} = $verbosity;
    $self->{'outhandle'} = $outhandle;

    $self->{'classifiers'} = [];
    $self->{'mode'} = "text";
    $self->{'assocdir'} = $build_dir;
    $self->{'dontdb'} = {};
    $self->{'store_metadata_coverage'} = "false";

    $self->{'index'} = "section:text";
    $self->{'indexexparr'} = [];

    $self->{'separate_cjk'} = 0;

    my $found_num_data = 0;
    my $buildconfigfile = undef;

    if ($keepold) {
        # For incremental building need to seed num_docs etc from values
        # stored in build.cfg (if present)
        $buildconfigfile = &util::filename_cat($build_dir, "build.cfg");
        if (-e $buildconfigfile) {
            $found_num_data = 1;
        }
        else {
            # try the index dir
            $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
                                                   "index", "build.cfg");
            if (-e $buildconfigfile) {
                $found_num_data = 1;
            }
        }
    }

    if ($found_num_data)
    {
        # This file does not load colcfg at the top, so make sure the reader
        # exists before calling it. Nothing is loaded if some other module
        # has already defined it.
        require colcfg unless defined &colcfg::read_build_cfg;

        my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
        $self->{'starting_num_docs'} = $buildcfg->{'numdocs'};
        $self->{'starting_num_sections'} = $buildcfg->{'numsections'};
        $self->{'starting_num_bytes'} = $buildcfg->{'numbytes'};
    }
    else
    {
        $self->{'starting_num_docs'} = 0;
        $self->{'starting_num_sections'} = 0;
        $self->{'starting_num_bytes'} = 0;
    }

    $self->{'output_handle'} = "STDOUT";
    $self->{'num_docs'} = $self->{'starting_num_docs'};
    $self->{'num_sections'} = $self->{'starting_num_sections'};
    $self->{'num_bytes'} = $self->{'starting_num_bytes'};

    $self->{'num_processed_bytes'} = 0;
    $self->{'store_text'} = 1;

    # what level (section/document) the database - indexer intersection is
    $self->{'db_level'} = "section";
    # used by browse interface
    $self->{'doclist'} = [];

    $self->{'indexing_text'} = 0;

    return bless $self, $class;
}
130
# Put the running counters back to their 'starting_*' values and clear the
# count of bytes processed for the current index.
sub reset {
    my ($self) = @_;

    foreach my $counter (qw(num_docs num_sections num_bytes)) {
        $self->{$counter} = $self->{"starting_$counter"};
    }

    $self->{'num_processed_bytes'} = 0;
}
140
# Zero the document and section counters, but keep the byte count at its
# starting value.
sub zero_reset {
    my ($self) = @_;

    @{$self}{qw(num_docs num_sections)} = (0, 0);

    # reconstructed docs have no text, just metadata, so we need to
    # remember how many bytes we had initially
    $self->{'num_bytes'} = $self->{'starting_num_bytes'};

    $self->{'num_processed_bytes'} = 0;
}
152
# Reports whether this build processor supports incremental building.
# The base class answers no (0); it is safer to assume non-incremental and
# let inheriting classes that are capable override this.
sub is_incremental_capable
{
    my $incremental_capable = 0;
    return $incremental_capable;
}
161
# Returns the current value of the 'num_docs' counter.
sub get_num_docs {
    my ($self) = @_;
    return $self->{'num_docs'};
}
167
# Returns the current value of the 'num_sections' counter.
sub get_num_sections {
    my ($self) = @_;
    return $self->{'num_sections'};
}
173
# Returns 'num_bytes', the actual number of bytes in the collection.
# This is normally the same as what is processed during text compression.
sub get_num_bytes {
    my ($self) = @_;
    return $self->{'num_bytes'};
}
181
# Returns 'num_processed_bytes', the number of bytes actually passed
# to mg for the current index.
sub get_num_processed_bytes {
    my ($self) = @_;
    return $self->{'num_processed_bytes'};
}
189
# Sets 'output_handle', the handle that infodb() passes to
# dbutil::write_infodb_entry.
sub set_output_handle {
    my ($self, $handle) = @_;

    $self->{'output_handle'} = $handle;
}
196
197
# Sets 'mode', the name of the method that process() dispatches to.
sub set_mode {
    my ($self, $mode) = @_;

    $self->{'mode'} = $mode;
}
204
# Returns the current 'mode' (the method name process() dispatches to).
sub get_mode {
    my ($self) = @_;
    return $self->{'mode'};
}
210
# Sets 'assocdir', the base directory under which assoc_files() places a
# document's associated files.
sub set_assocdir {
    my ($self, $assocdir) = @_;

    $self->{'assocdir'} = $assocdir;
}
217
# Sets 'dontdb', a hash reference keyed by metadata field name; infodb()
# leaves any field with a defined entry here out of the section record.
sub set_dontdb {
    my ($self, $dontdb) = @_;

    $self->{'dontdb'} = $dontdb;
}
224
# Sets 'infodbtype', which infodb() passes as the first argument to
# dbutil::write_infodb_entry.
sub set_infodbtype
{
    my ($self, $infodbtype) = @_;

    $self->{'infodbtype'} = $infodbtype;
}
231
# Sets the current index specification and, optionally, the list of
# subcollection expressions used by is_subcollection_doc().
#   $index       - always stored under 'index'
#   $indexexparr - array reference; stored only when defined, otherwise the
#                  existing 'indexexparr' is left untouched
sub set_index {
    my ($self, $index, $indexexparr) = @_;

    $self->{'index'} = $index;
    defined $indexexparr and $self->{'indexexparr'} = $indexexparr;
}
239
# Sets the language restriction used by is_subcollection_doc():
#   $lang_meta - name of the metadata field that holds a document's language
#   $langarr   - array reference of language patterns; a leading '!' negates
sub set_index_languages {
    my ($self, $lang_meta, $langarr) = @_;

    $self->{'lang_meta'} = $lang_meta;
    $self->{'langarr'} = $langarr;
}
246
# Returns the current index specification stored under 'index'.
sub get_index {
    my ($self) = @_;
    return $self->{'index'};
}
252
# Sets 'classifiers', the value infodb() hands to classify::classify_doc
# for every document.
sub set_classifiers {
    my ($self, $classifiers) = @_;

    $self->{'classifiers'} = $classifiers;
}
259
# Sets the 'indexing_text' flag.
sub set_indexing_text {
    my ($self, $indexing_text) = @_;

    $self->{'indexing_text'} = $indexing_text;
}
266
# Returns the 'indexing_text' flag.
sub get_indexing_text {
    my ($self) = @_;
    return $self->{'indexing_text'};
}
272
# Sets the 'store_text' flag.
sub set_store_text {
    my ($self, $store_text) = @_;

    $self->{'store_text'} = $store_text;
}
279
# Sets 'store_metadata_coverage'. infodb() gathers metadata statistics only
# when this value matches /^true$/i.
#   $store_metadata_coverage - any false value (undef, 0, "") is stored as
#                              the empty string so the later regex test
#                              always sees a defined value
sub set_store_metadata_coverage {
    my $self = shift (@_);
    my ($store_metadata_coverage) = @_;

    $self->{'store_metadata_coverage'} = $store_metadata_coverage || "";
}
286
# Returns the OIDs that infodb() has collected in 'doclist', as a list
# (in scalar context, the number of entries).
sub get_doc_list {
    my ($self) = @_;

    my $doclist = $self->{'doclist'};
    return @$doclist;
}
292
# Sets 'db_level'. The standard database level is "section"; set it to
# "document" to make infodb() write only the top-level record per document.
sub set_db_level {
    my ($self, $db_level) = @_;

    $self->{'db_level'} = $db_level;
}
300
# Stores $index_type under 'sections_index_document_metadata'.
sub set_sections_index_document_metadata {
    my ($self, $index_type) = @_;

    $self->{'sections_index_document_metadata'} = $index_type;
}
307
# Sets the 'separate_cjk' flag; when true, filter_text() runs the text
# through cnseg::segment.
sub set_separate_cjk {
    my ($self, $sep_cjk) = @_;

    $self->{'separate_cjk'} = $sep_cjk;
}
314
# Dispatches to the method named by the current 'mode' (see set_mode),
# forwarding every remaining argument unchanged, and returns that
# method's result.
sub process {
    my $self = shift;

    my $handler = $self->{'mode'};
    return $self->$handler(@_);
}
321
# Post-process text depending on field. Currently the only processing done
# is CJK segmentation, applied when 'separate_cjk' is set.
# This should only happen for indexed text (if $self->{'indexing_text'}),
# but search term highlighting currently doesn't work if you do that.
# Once that is fixed, restrict this to indexed text.
#
#   $field - name of the field the text belongs to (currently unused)
#   $text  - the text to filter
# Returns the filtered text; the input unchanged when no filtering applies.
sub filter_text {
    my $self = shift (@_);
    my ($field, $text) = @_;

    return $text unless $self->{'separate_cjk'};

    # This file does not load cnseg at the top, so make sure the segmenter
    # exists before calling it. Nothing is loaded if it is already defined.
    require cnseg unless defined &cnseg::segment;

    return &cnseg::segment($text);
}
338
339
# Keeps statistics on which metadata sets are used and how often each field
# occurs within a set. Two tallies are updated together:
# 'doc_mdprefix_fields' (reset by infodb() for each document) and
# 'mdprefix_fields' (never reset here, so it accumulates).
#
#   $field - metadata field name. "prefix.name" is split at the LAST dot
#            into set prefix and core field name; a name with no dot that
#            starts with an upper-case letter is counted under the implicit
#            'ex' set; anything else is ignored.
sub infodb_metadata_stats
{
    my ($self, $field) = @_;

    if (my ($prefix, $core_field) = $field =~ m/^(.+)\.(.*)$/) {
        $self->{'doc_mdprefix_fields'}{$prefix}{$core_field}++;
        $self->{'mdprefix_fields'}{$prefix}{$core_field}++;
    }
    elsif ($field =~ m/^[[:upper:]]/) {
        # implicit 'ex' metadata set
        $self->{'doc_mdprefix_fields'}{'ex'}{$field}++;
        $self->{'mdprefix_fields'}{'ex'}{$field}++;
    }
}
366
367
# Writes the info-database records for one document.
#
#   $doc_obj  - the document object; only the accessor methods called below
#               are relied upon
#   $filename - file the document was derived from, or undef when the
#               document was reconstructed from the database (metadata and
#               structure but no text)
#
# Only documents whose doc type is "indexed_doc" or "info_doc" are output;
# for any other type the method returns immediately without side effects.
#
# For each section (only the top section when 'db_level' is "document") one
# record keyed by the section OID is written with
# dbutil::write_infodb_entry. Unless the object is a lucenebuildproc, a
# reverse record keyed by the document/section number is written as well.
#
# Side effects: appends the OID to 'doclist', runs the classifiers over the
# document, and updates 'num_docs', 'num_sections', 'num_bytes' and the
# metadata coverage tallies.
sub infodb {
    my $self = shift (@_);
    my ($doc_obj, $filename) = @_;

    # only output this document if it is a "indexed_doc" or "info_doc" (database only) document
    my $doctype = $doc_obj->get_doc_type();
    return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");

    # From here on $doctype can never be "classification", so the counters
    # below are updated unconditionally.

    my $archivedir = "";
    if (defined $filename)
    {
        # doc_obj derived directly from file: the archive directory is
        # everything before the last path separator (either / or \)
        my ($dir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
        $dir = "" unless defined $dir;
        $dir =~ s/\\/\//g;   # normalise to forward slashes
        $dir =~ s/^\/+//;    # drop leading slashes
        $dir =~ s/\/+$//;    # drop trailing slashes

        $archivedir = $dir;

        # resolve the final filenames of the files associated with this document
        $self->assoc_files ($doc_obj, $archivedir);
    }
    else
    {
        # doc_obj reconstructed from database (has metadata, doc structure but no text)
        my $top_section = $doc_obj->get_top_section();
        $archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
    }

    # add this document to the browse structure
    push(@{$self->{'doclist'}}, $doc_obj->get_OID());

    # classify this document
    &classify::classify_doc ($self->{'classifiers'}, $doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    my $section = $doc_obj->get_top_section ();
    my $doc_OID = $doc_obj->get_OID();
    my $first = 1;
    my $infodb_handle = $self->{'output_handle'};

    # per-document metadata statistics start afresh for every document
    $self->{'doc_mdprefix_fields'} = {};

    while (defined $section)
    {
        my $section_OID = $doc_OID;
        if ($section ne "")
        {
            $section_OID = $doc_OID . "." . $section;
        }
        my %section_infodb = ();

        # update a few statistics
        my $text_length = $doc_obj->get_text_length ($section);
        $self->{'num_bytes'} += $text_length;
        if (($self->{'verbosity'} || 0) > 3) {
            # diagnostic only: goes to the debugging handle, and only at high verbosity
            my $outhandle = $self->{'outhandle'} || "STDERR";
            print $outhandle "num bytes added = $text_length\n";
        }
        $self->{'num_sections'} += 1;

        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/) {
            $section_infodb{"doctype"} = [ "doc" ];
        }

        # Output whether this node contains text
        #
        # If doc_obj reconstructed from database file then no need to
        # explicitly add <hastxt> as this is preserved as metadata when
        # the database file is loaded in
        if (defined $filename)
        {
            # doc_obj derived directly from file
            $section_infodb{"hastxt"} = [ ($text_length > 0) ? "1" : "0" ];
        }

        # output all the section metadata
        my $metadata = $doc_obj->get_all_metadata ($section);
        foreach my $pair (@$metadata) {
            my ($field, $value) = (@$pair);

            if ($field ne "Identifier" && $field !~ /^gsdl/ &&
                defined $value && $value ne "") {

                # escape problematic stuff
                $value =~ s/\\/\\\\/g;
                $value =~ s/\n/\\n/g;
                $value =~ s/\r/\\r/g;

                # special case for URL metadata: also write a record keyed by
                # the URL that points back at this section
                if ($field =~ /^URL$/i) {
                    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value, { 'section' => [ $section_OID ] });
                }

                if (!defined $self->{'dontdb'}->{$field}) {
                    push(@{$section_infodb{$field}}, $value);

                    if ($section eq "" && $self->{'store_metadata_coverage'} =~ /^true$/i)
                    {
                        $self->infodb_metadata_stats($field);
                    }
                }
            }
        }

        if ($section eq "")
        {
            # record which metadata sets/fields this document used and how often
            my $doc_mdprefix_fields = $self->{'doc_mdprefix_fields'};

            foreach my $prefix (keys %$doc_mdprefix_fields)
            {
                push(@{$section_infodb{"metadataset"}}, $prefix);

                foreach my $field (keys %{$doc_mdprefix_fields->{$prefix}})
                {
                    push(@{$section_infodb{"metadatalist-$prefix"}}, $field);

                    my $val = $doc_mdprefix_fields->{$prefix}->{$field};
                    push(@{$section_infodb{"metadatafreq-$prefix-$field"}}, $val);
                }
            }
        }

        # If doc_obj reconstructed from database file then no need to
        # explicitly add <archivedir> as this is preserved as metadata when
        # the database file is loaded in
        if (defined $filename)
        {
            # output archivedir if at top level
            if ($section eq $doc_obj->get_top_section()) {
                $section_infodb{"archivedir"} = [ $archivedir ];
            }
        }

        # output document display type
        if ($first) {
            $section_infodb{"thistype"} = [ $thistype ];
        }

        if ($self->{'db_level'} eq "document") {
            # doc num is num_docs not num_sections
            # output the matching document number
            $section_infodb{"docnum"} = [ $self->{'num_docs'} ];
        }
        else {
            # output a list of children
            my $children = $doc_obj->get_children ($section);
            if (scalar(@$children) > 0) {
                $section_infodb{"childtype"} = [ $childtype ];
                my $contains = "";
                foreach my $child (@$children)
                {
                    $contains .= ";" unless ($contains eq "");
                    # each child is written as a literal double quote, a dot
                    # and the last numeric component of the child id (or the
                    # whole id when it has no such component)
                    if ($child =~ /^.*?\.(\d+)$/)
                    {
                        $contains .= "\".$1";
                    }
                    else
                    {
                        $contains .= "\".$child";
                    }
                }
                $section_infodb{"contains"} = [ $contains ];
            }
            # output the matching doc number
            $section_infodb{"docnum"} = [ $self->{'num_sections'} ];
        }

        &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID, \%section_infodb);

        # output a database entry for the document number, except for Lucene (which no longer needs this information)
        unless (ref($self) eq "lucenebuildproc")
        {
            if ($self->{'db_level'} eq "document") {
                &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'}, { 'section' => [ $doc_OID ] });
            }
            else {
                &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'}, { 'section' => [ $section_OID ] });
            }
        }

        $first = 0;
        $section = $doc_obj->get_next_section($section);
        last if ($self->{'db_level'} eq "document"); # if no sections wanted, only add the docs
    }
}
564
565
# Placeholder for the text-processing mode: the base class cannot produce
# indexer output itself. Prints a message to the debugging handle
# ('outhandle') and then dies with "\n".
#
#   $doc_obj - the document object (unused here)
sub text {
    my ($self, $doc_obj) = @_;

    print { $self->{'outhandle'} } "basebuildproc::text function must be implemented in sub classes\n";
    die "\n";
}
574
# Should the document be indexed - according to the subcollection and
# language specification?
#
#   $doc_obj - the document object to test
# Returns 1 when the document belongs in the current index, 0 otherwise.
#
# Subcollection expressions ('indexexparr') have the form
# "field/regex/options". A leading '!' on the field negates the test,
# "filename" (any case) tests the source filename instead of metadata, and
# an options value of "i" makes the match case-insensitive. The document is
# accepted as soon as one value satisfies one expression. With no
# expressions at all every document is accepted.
#
# If the document passes and 'lang_meta' is set, the value of that metadata
# field on the top section must also satisfy at least one pattern in
# 'langarr' (a leading '!' negates the pattern).
sub is_subcollection_doc {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $indexed_doc = 1;
    foreach my $indexexp (@{$self->{'indexexparr'}}) {
        $indexed_doc = 0;
        my ($field, $exp, $options) = split /\//, $indexexp;
        next unless (defined ($field) && defined ($exp));

        # $field is a private copy from split, so stripping the '!' here
        # does not touch the stored expression
        my $negate = ($field =~ s/^!//) ? 1 : 0;
        my $ignore_case = (defined $options && $options =~ /^i$/i);
        my $pattern = $ignore_case ? qr/$exp/i : qr/$exp/;

        my @metadata_values;
        if ($field =~ /^filename$/i) {
            push(@metadata_values, $doc_obj->get_source_filename());
        }
        else {
            @metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
        }
        next unless @metadata_values;

        foreach my $metadata_value (@metadata_values) {
            my $matched = ($metadata_value =~ $pattern) ? 1 : 0;
            # plain test: accept on a match; negated test: accept on a non-match
            if ($matched != $negate) {
                $indexed_doc = 1;
                last;
            }
        }

        last if ($indexed_doc == 1);
    }

    # if this doc is so far in the sub collection, and we have lang info,
    # now we check the languages to see if it matches
    if ($indexed_doc && defined $self->{'lang_meta'}) {
        $indexed_doc = 0;
        my $doc_lang = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
        if (defined $doc_lang) {
            foreach my $lang_spec (@{$self->{'langarr'}}) {
                # Work on a copy: the loop variable aliases the stored array
                # element, so stripping the '!' in place would turn an
                # exclusion into an inclusion for every later document.
                my $lang = $lang_spec;
                my $negate = ($lang =~ s/^!//) ? 1 : 0;
                my $matched = ($doc_lang =~ /$lang/) ? 1 : 0;
                if ($matched != $negate) {
                    $indexed_doc = 1;
                    last;
                }
            }
        }
    }

    return $indexed_doc;
}
640
# Decides how a document and its children are displayed.
#
#   $doc_obj - the document object to inspect
# Returns the list ($thistype, $childtype).
#
# Rules, in order:
#   * gsdlthistype metadata "Paged" on the top section  -> paged
#   * gsdlthistype metadata "Hierarchy"                 -> ("VList", "VList")
#   * otherwise paged only if the document has no more than 2 levels and
#     every section after the first has a purely numeric Title;
#     anything else is ("VList", "VList")
# A paged document has child type "Paged"; its own type is "Paged" when the
# top section has text and "Invisible" when it has none.
sub get_document_type {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my @hierarchy_types = ("VList", "VList");

    # shared by the explicit (metadata) and the inferred paged case
    my $paged_types = sub {
        my $top_type = $doc_obj->get_text_length ($doc_obj->get_top_section())
            ? "Paged" : "Invisible";
        return ($top_type, "Paged");
    };

    my $section = $doc_obj->get_top_section ();

    my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
    if (defined $gsdlthistype) {
        return $paged_types->() if ($gsdlthistype eq "Paged");
        return @hierarchy_types if ($gsdlthistype eq "Hierarchy");
    }

    my $first = 1;
    while (defined $section) {
        # a section id with more than one dot-separated part is nested too
        # deeply for paging
        my @levels = split /\./, $section;
        return @hierarchy_types if (scalar(@levels) > 1);

        if (!$first) {
            my $title = $doc_obj->get_metadata_element ($section, "Title");
            return @hierarchy_types if (!defined $title || $title !~ /^\d+$/);
        }
        $first = 0;
        $section = $doc_obj->get_next_section($section);
    }

    return $paged_types->();
}
694
# Hard-links every file associated with a document into the assoc directory.
#
#   $doc_obj    - document object; get_assoc_files() must return an array
#                 reference of entries whose element 0 is the existing file
#                 and element 1 the name it should get
#   $archivedir - the document's directory relative to 'assocdir'
#
# Declared without a prototype: prototypes are ignored on method calls, so
# the old "()" only misdescribed the argument list.
sub assoc_files {
    my $self = shift (@_);
    my ($doc_obj, $archivedir) = @_;

    foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
        my ($source_file, $target_name) = @{$assoc_file}[0, 1];

        # if assoc file starts with a slash, we put it relative to the assoc
        # dir, otherwise it is relative to the HASH... directory
        my $afile = ($target_name =~ m@^[/\\]@)
            ? &util::filename_cat($self->{'assocdir'}, $target_name)
            : &util::filename_cat($self->{'assocdir'}, $archivedir, $target_name);

        &util::hard_link ($source_file, $afile);
    }
}
712

Note: See TracBrowser for help on using the repository browser.

Download in other formats: