Context Navigation

source: trunk/gsdl/perllib/basebuildproc.pm@ 12844

Last change on this file since 12844 was 12844, checked in by mdewsnip, 18 years ago
Incremental building and dynamic GDBM updating code, many thanks to John Rowe and John Thompson at DL Consulting Ltd.
Property svn:keywords set to `Author Date Id Revision`
File size: 16.9 KB

Line
1	###########################################################################
2	#
3	# basebuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document for indexing (should be
27	# implemented by subclass) and storing in gdbm database
28
29	package basebuildproc;
30
31	eval {require bytes};
32
33	use classify;
34	use doc;
35	use docproc;
36	use util;
37
38	BEGIN {
39	@basebuildproc::ISA = ('docproc');
40	}
41
42	sub new()
43	{
44	my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;
45	my $self = new docproc ();
46
47	# outhandle is where all the debugging info goes
48	# output_handle is where the output of the plugins is piped
49	# to (i.e. mg, gdbm etc.)
50	$outhandle = STDERR unless defined $outhandle;
51
52	$self->{'collection'} = $collection;
53	$self->{'source_dir'} = $source_dir;
54	$self->{'build_dir'} = $build_dir;
55	$self->{'keepold'} = $keepold;
56	$self->{'verbosity'} = $verbosity;
57	$self->{'outhandle'} = $outhandle;
58
59	$self->{'classifiers'} = [];
60	$self->{'mode'} = "text";
61	$self->{'assocdir'} = $build_dir;
62	$self->{'dontgdbm'} = {};
63
64	$self->{'index'} = "section:text";
65	$self->{'indexexparr'} = [];
66
67	my $found_num_data = 0;
68	my $buildconfigfile = undef;
69
70	if ($keepold) {
71	# For incremental building need to seed num_docs etc from values
72	# stored in build.cfg (if present)
73	print STDERR "Keepold!\n";
74	$buildconfigfile = &util::filename_cat($build_dir, "build.cfg");
75	print STDERR "Build cfg: $buildconfigfile\n";
76	if (-e $buildconfigfile) {
77	$found_num_data = 1;
78	}
79	else {
80	# try the index dir
81	$buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
82	"index", "build.cfg");
83	print STDERR "Index cfg: $buildconfigfile\n";
84	if (-e $buildconfigfile) {
85	$found_num_data = 1;
86	}
87	}
88
89	}
90	#else
91	# {
92	# print STDERR "Removeold!\n";
93	# }
94
95	if ($found_num_data)
96	{
97	#print STDERR "Found_Num_Data!\n";
98	my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
99	$self->{'starting_num_docs'} = $buildcfg->{'numdocs'};
100	#print STDERR "- num_docs: $self->{'starting_num_docs'}\n";
101	$self->{'starting_num_sections'} = $buildcfg->{'numsections'};
102	#print STDERR "- num_sections: $self->{'starting_num_sections'}\n";
103	$self->{'starting_num_bytes'} = $buildcfg->{'numbytes'};
104	#print STDERR "- num_bytes: $self->{'starting_num_bytes'}\n";
105	}
106	else
107	{
108	#print STDERR "NOT Found_Num_Data!\n";
109	$self->{'starting_num_docs'} = 0;
110	$self->{'starting_num_sections'} = 0;
111	$self->{'starting_num_bytes'} = 0;
112	}
113
114	$self->{'output_handle'} = "STDOUT";
115	$self->{'num_docs'} = $self->{'starting_num_docs'};
116	$self->{'num_sections'} = $self->{'starting_num_sections'};
117	$self->{'num_bytes'} = $self->{'starting_num_bytes'};
118
119	$self->{'num_processed_bytes'} = 0;
120	$self->{'store_text'} = 1;
121
122	# what level (section/document) the gdbm database - indexer intersection is
123	$self->{'gdbm_level'} = "section";
124	#used by browse interface
125	$self->{'doclist'} = [];
126
127	$self->{'indexing_text'} = 0;
128
129	return bless $self, $class;
130
131	}
132
133	sub reset {
134	my $self = shift (@_);
135
136	$self->{'num_docs'} = $self->{'starting_num_docs'};
137	$self->{'num_sections'} = $self->{'starting_num_sections'};
138	$self->{'num_bytes'} = $self->{'starting_num_bytes'};
139
140	$self->{'num_processed_bytes'} = 0;
141	}
142
143	sub zero_reset {
144	my $self = shift (@_);
145
146	$self->{'num_docs'} = 0;
147	$self->{'num_sections'} = 0;
148	$self->{'num_bytes'} = 0;
149
150	$self->{'num_processed_bytes'} = 0;
151	}
152
153	sub is_incremental_capable
154	{
155	# By default we return 'no' as the answer
156	# Safer to assume non-incremental to start with, and then override in
157	# inherited classes that are.
158
159	return 0;
160	}
161
162	sub get_num_docs {
163	my $self = shift (@_);
164
165	return $self->{'num_docs'};
166	}
167
168	sub get_num_sections {
169	my $self = shift (@_);
170
171	return $self->{'num_sections'};
172	}
173
174	# num_bytes is the actual number of bytes in the collection
175	# this is normally the same as what's processed during text compression
176	sub get_num_bytes {
177	my $self = shift (@_);
178
179	return $self->{'num_bytes'};
180	}
181
182	# num_processed_bytes is the number of bytes actually passed
183	# to mg for the current index
184	sub get_num_processed_bytes {
185	my $self = shift (@_);
186
187	return $self->{'num_processed_bytes'};
188	}
189
190	sub set_output_handle {
191	my $self = shift (@_);
192	my ($handle) = @_;
193
194	$self->{'output_handle'} = $handle;
195	}
196
197
198	sub set_mode {
199	my $self = shift (@_);
200	my ($mode) = @_;
201
202	$self->{'mode'} = $mode;
203	}
204
205	sub get_mode {
206	my $self = shift (@_);
207
208	return $self->{'mode'};
209	}
210
211	sub set_assocdir {
212	my $self = shift (@_);
213	my ($assocdir) = @_;
214
215	$self->{'assocdir'} = $assocdir;
216	}
217
218	sub set_dontgdbm {
219	my $self = shift (@_);
220	my ($dontgdbm) = @_;
221
222	$self->{'dontgdbm'} = $dontgdbm;
223	}
224
225	sub set_index {
226	my $self = shift (@_);
227	my ($index, $indexexparr) = @_;
228
229	$self->{'index'} = $index;
230	$self->{'indexexparr'} = $indexexparr if defined $indexexparr;
231	}
232
233	sub set_index_languages {
234	my $self = shift (@_);
235	my ($lang_meta, $langarr) = @_;
236	$self->{'lang_meta'} = $lang_meta;
237	$self->{'langarr'} = $langarr;
238	}
239
240	sub get_index {
241	my $self = shift (@_);
242
243	return $self->{'index'};
244	}
245
246	sub set_classifiers {
247	my $self = shift (@_);
248	my ($classifiers) = @_;
249
250	$self->{'classifiers'} = $classifiers;
251	}
252
253	sub set_indexing_text {
254	my $self = shift (@_);
255	my ($indexing_text) = @_;
256
257	$self->{'indexing_text'} = $indexing_text;
258	}
259
260	sub get_indexing_text {
261	my $self = shift (@_);
262
263	return $self->{'indexing_text'};
264	}
265
266	sub set_store_text {
267	my $self = shift (@_);
268	my ($store_text) = @_;
269
270	$self->{'store_text'} = $store_text;
271	}
272	sub get_doc_list {
273	my $self = shift(@_);
274
275	return @{$self->{'doclist'}};
276	}
277
278	# the standard gdbm level is section, but you may want to change it to document
279	sub set_gdbm_level {
280	my $self= shift (@_);
281	my ($gdbm_level) = @_;
282
283	$self->{'gdbm_level'} = $gdbm_level;
284	}
285
286	sub set_sections_index_document_metadata {
287	my $self= shift (@_);
288	my ($index_type) = @_;
289
290	$self->{'sections_index_document_metadata'} = $index_type;
291	}
292	sub process {
293	my $self = shift (@_);
294	my $method = $self->{'mode'};
295
296	$self->$method(@_);
297	}
298
299	sub infodb {
300	my $self = shift (@_);
301	my ($doc_obj, $filename) = @_;
302	my $handle = $self->{'output_handle'};
303
304	my $doctype = $doc_obj->get_doc_type();
305
306	# only output this document if it is a "indexed_doc" or "info_doc" (GDBM database only) document
307	return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");
308
309	my $archivedir = "";
310
311	if (defined $filename)
312	{
313	# doc_obj derived directly from file
314
315	my ($dir) = $filename =~ /^(.?)(?:\/\|\\)[^\/\\]$/;
316	$dir = "" unless defined $dir;
317	$dir =~ s/\\/\//g;
318	$dir =~ s/^\/+//;
319	$dir =~ s/\/+$//;
320
321	$archivedir = $dir;
322
323	# resolve the final filenames of the files associated with this document
324	$self->assoc_files ($doc_obj, $archivedir);
325	}
326	else
327	{
328	# doc_obj reconstructed from GDBM (has metadata, doc structure but no text)
329	my $top_section = $doc_obj->get_top_section();
330	$archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
331	}
332
333
334	#GRB: moved 1/06/2004 from GRB01062004
335	#add this document to the browse structure
336	push(@{$self->{'doclist'}},$doc_obj->get_OID())
337	unless ($doctype eq "classification");
338
339	# classify this document
340	&classify::classify_doc ($self->{'classifiers'}, $doc_obj);
341	#GRB: end of moved block
342
343	# this is another document
344	$self->{'num_docs'} += 1 unless ($doctype eq "classification");
345
346	# is this a paged or a hierarchical document
347	my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
348
349	my $section = $doc_obj->get_top_section ();
350	my $doc_OID = $doc_obj->get_OID();
351	my $first = 1;
352	my $url = "";
353	while (defined $section) {
354	# update a few statistics
355	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
356	$self->{'num_sections'} += 1 unless ($doctype eq "classification");
357
358	# output the section name
359	if ($section eq "") { print $handle "[$doc_OID]\n"; }
360	else { print $handle "[$doc_OID.$section]\n"; }
361
362	# output the fact that this document is a document (unless doctype
363	# has been set to something else from within a plugin
364	my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
365	if (!defined $dtype \|\| $dtype !~ /\w/) {
366	print $handle "<doctype>doc\n";
367	}
368
369	# Output whether this node contains text
370	#
371	# If doc_obj reconstructed from GDBM file then no need to
372	# explicitly add <hastxt> as this is preserved as metadata when
373	# the GDBM file is loaded in
374
375	if (defined $filename)
376	{
377	# doc_obj derived directly from file
378	if ($doc_obj->get_text_length($section) > 0) {
379	print $handle "<hastxt>1\n";
380	} else {
381	print $handle "<hastxt>0\n";
382	}
383	}
384
385	# output all the section metadata
386	my $metadata = $doc_obj->get_all_metadata ($section);
387	foreach my $pair (@$metadata) {
388	my ($field, $value) = (@$pair);
389
390	if ($field ne "Identifier" && $field !~ /^gsdl/ &&
391	defined $value && $value ne "") {
392
393	# escape problematic stuff
394	$value =~ s/\\/\\\\/g;
395	$value =~ s/\n/\\n/g;
396	$value =~ s/\r/\\r/g;
397	if ($value =~ /-{70,}/) {
398	# if value contains 70 or more hyphens in a row we need
399	# to escape them to prevent txt2db from treating them
400	# as a separator
401	$value =~ s/-/&\#045;/gi;
402	}
403
404	# special case for URL metadata
405	if ($field =~ /^URL$/i) {
406	$url .= "[$value]\n";
407	if ($section eq "") {$url .= "<section>$doc_OID\n";}
408	else {$url .= "<section>$doc_OID.$section\n";}
409	$url .= '-' x 70 . "\n";
410	}
411
412	if (!defined $self->{'dontgdbm'}->{$field}) {
413	print $handle "<$field>$value\n";
414	}
415	}
416	}
417
418
419	# If doc_obj reconstructed from GDBM file then no need to
420	# explicitly add <archivedir> as this is preserved as metadata when
421	# the GDBM file is loaded in
422
423	if (defined $filename)
424	{
425	# output archivedir if at top level
426	if ($section eq $doc_obj->get_top_section()) {
427	print $handle "<archivedir>$archivedir\n";
428	}
429	}
430
431	# output document display type
432	if ($first) {
433	print $handle "<thistype>$thistype\n";
434	}
435
436
437	if ($self->{'gdbm_level'} eq "document") {
438	# doc num is num_docs not num_sections
439	# output the matching document number
440	print $handle "<docnum>$self->{'num_docs'}\n";
441
442	} else {
443	# output a list of children
444	my $children = $doc_obj->get_children ($section);
445	if (scalar(@$children) > 0) {
446	print $handle "<childtype>$childtype\n";
447	print $handle "<contains>";
448	my $firstchild = 1;
449	foreach my $child (@$children) {
450	print $handle ";" unless $firstchild;
451	$firstchild = 0;
452	if ($child =~ /^.*?\.(\d+)$/) {
453	print $handle "\".$1";
454	} else {
455	print $handle "\".$child";
456	}
457	# if ($child eq "") { print $handle "$doc_OID"; }
458	# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
459	# else { print $handle "$doc_OID.$section.$child"; }
460	}
461	print $handle "\n";
462	}
463	#output the matching doc number
464	print $handle "<docnum>$self->{'num_sections'}\n";
465
466	}
467
468	print $handle '-' x 70, "\n";
469
470
471	# output a database entry for the document number
472	if ($self->{'gdbm_level'} eq "document") {
473	print $handle "[$self->{'num_docs'}]\n";
474	print $handle "<section>$doc_OID\n";
475	}
476	else {
477	print $handle "[$self->{'num_sections'}]\n";
478	if ($section eq "") { print $handle "<section>$doc_OID\n"; }
479	else { print $handle "<section>$doc_OID.$section\n"; }
480	}
481	print $handle '-' x 70, "\n";
482
483	# output entry for url
484	if ($url ne "") {
485	print $handle $url;
486	}
487
488	$first = 0;
489	$section = $doc_obj->get_next_section($section);
490	last if ($self->{'gdbm_level'} eq "document"); # if no sections wanted, only gdbm the docs
491	}
492
493	#GRB01062004: see code above moved from here
494	}
495
496
497	sub text {
498	my $self = shift (@_);
499	my ($doc_obj) = @_;
500
501	my $handle = $self->{'outhandle'};
502	print $handle "basebuildproc::text function must be implemented in sub classes\n";
503	die "\n";
504	}
505
506	# should the document be indexed - according to the subcollection and language
507	# specification.
508	sub is_subcollection_doc {
509	my $self = shift (@_);
510	my ($doc_obj) = @_;
511
512	my $indexed_doc = 1;
513	foreach my $indexexp (@{$self->{'indexexparr'}}) {
514	$indexed_doc = 0;
515	my ($field, $exp, $options) = split /\//, $indexexp;
516	if (defined ($field) && defined ($exp)) {
517	my ($bool) = $field =~ /^(.)/;
518	$field =~ s/^.// if $bool eq '!';
519	my @metadata_values;
520	if ($field =~ /^filename$/i) {
521	push(@metadata_values, $doc_obj->get_source_filename());
522	}
523	else {
524	@metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
525	}
526	next unless @metadata_values;
527	foreach my $metadata_value (@metadata_values) {
528	if ($bool eq '!') {
529	if ($options =~ /^i$/i) {
530	if ($metadata_value !~ /$exp/i) {$indexed_doc = 1; last;}
531	} else {
532	if ($metadata_value !~ /$exp/) {$indexed_doc = 1; last;}
533	}
534	} else {
535	if ($options =~ /^i$/i) {
536	if ($metadata_value =~ /$exp/i) {$indexed_doc = 1; last;}
537	} else {
538	if ($metadata_value =~ /$exp/) {$indexed_doc = 1; last;}
539	}
540	}
541	}
542
543	last if ($indexed_doc == 1);
544	}
545	}
546
547	# if this doc is so far in the sub collection, and we have lang info,
548	# now we check the languages to see if it matches
549	if($indexed_doc && defined $self->{'lang_meta'}) {
550	$indexed_doc = 0;
551	my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
552	if (defined $field) {
553	foreach my $lang (@{$self->{'langarr'}}) {
554	my ($bool) = $lang =~ /^(.)/;
555	if ($bool eq '!') {
556	$lang =~ s/^.//;
557	if ($field !~ /$lang/) {
558	$indexed_doc = 1; last;
559	}
560	} else {
561	if ($field =~ /$lang/) {
562	$indexed_doc = 1; last;
563	}
564	}
565	}
566	}
567	}
568	return $indexed_doc;
569
570	}
571
572	# use 'Paged' if document has no more than 2 levels
573	# and each section at second level has a number for
574	# Title metadata
575	# also use Paged if gsdlthistype metadata is set to Paged
576	sub get_document_type {
577	my $self = shift (@_);
578	my ($doc_obj) = @_;
579
580	my $thistype = "VList";
581	my $childtype = "VList";
582	my $title;
583	my @tmp = ();
584
585	my $section = $doc_obj->get_top_section ();
586
587	my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
588	if (defined $gsdlthistype) {
589	if ($gsdlthistype eq "Paged") {
590	$childtype = "Paged";
591	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
592	$thistype = "Paged";
593	} else {
594	$thistype = "Invisible";
595	}
596
597	return ($thistype, $childtype);
598	} elsif ($gsdlthistype eq "Hierarchy") {
599	return ($thistype, $childtype); # use VList, VList
600	}
601	}
602	my $first = 1;
603	while (defined $section) {
604	@tmp = split /\./, $section;
605	if (scalar(@tmp) > 1) {
606	return ($thistype, $childtype);
607	}
608	if (!$first) {
609	$title = $doc_obj->get_metadata_element ($section, "Title");
610	if (!defined $title \|\| $title !~ /^\d+$/) {
611	return ($thistype, $childtype);
612	}
613	}
614	$first = 0;
615	$section = $doc_obj->get_next_section($section);
616	}
617	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
618	$thistype = "Paged";
619	} else {
620	$thistype = "Invisible";
621	}
622	$childtype = "Paged";
623	return ($thistype, $childtype);
624	}
625
626	sub assoc_files() {
627	my $self = shift (@_);
628	my ($doc_obj, $archivedir) = @_;
629	my ($afile);
630
631	foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
632	#rint STDERR "Processing associated file - copy " . $assoc_file->[0] . " to " . $assoc_file->[1] . "\n";
633	# if assoc file starts with a slash, we put it relative to the assoc
634	# dir, otherwise it is relative to the HASH... directory
635	if ($assoc_file->[1] =~ m@^[/\\]@) {
636	$afile = &util::filename_cat($self->{'assocdir'}, $assoc_file->[1]);
637	} else {
638	$afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
639	}
640	&util::hard_link ($assoc_file->[0], $afile);
641	}
642	}
643

Note: See TracBrowser for help on using the repository browser.

Download in other formats: