Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: trunk/gsdl/perllib/basebuildproc.pm@ 9953

Last change on this file since 9953 was 9919, checked in by kjdon, 19 years ago
made a base buildproc class, and shifted most of the buildproc code into it. mainly the subclasses just need to implement the text method
Property svn:keywords set to `Author Date Id Revision`
File size: 13.5 KB

Line
1	###########################################################################
2	#
3	# basebuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document for indexing (should be
27	# implemented by subclass) and storing in gdbm database
28
29	package basebuildproc;
30
31	eval {require bytes};
32
33	use classify;
34	use doc;
35	use docproc;
36	use util;
37
# Set up inheritance at compile time: basebuildproc derives from docproc.
BEGIN {
    @basebuildproc::ISA = qw(docproc);
}
41
# Constructor.
#   $collection, $source_dir, $build_dir, $verbosity - stored as-is on the object
#   $outhandle - where debugging info goes; defaults to the handle name STDERR
# The object starts life as a docproc and is reblessed into $class.
sub new {
    my $class = shift (@_);
    my ($collection, $source_dir, $build_dir, $verbosity, $outhandle) = @_;

    my $self = docproc->new ();

    # outhandle is for debugging output; output_handle (below) is where the
    # output of the plugins is piped to (i.e. mg, gdbm etc.)
    $outhandle = 'STDERR' unless defined $outhandle;

    my %defaults = (
        # constructor arguments
        'collection'          => $collection,
        'source_dir'          => $source_dir,
        'build_dir'           => $build_dir,
        'verbosity'           => $verbosity,
        'outhandle'           => $outhandle,

        # build settings, adjustable through the set_* methods
        'classifiers'         => [],
        'mode'                => "text",       # method name that process() dispatches to
        'assocdir'            => $build_dir,
        'dontgdbm'            => {},
        'index'               => "section:text",
        'indexexparr'         => [],
        'output_handle'       => "STDOUT",
        'store_text'          => 1,
        'indexing_text'       => 0,

        # statistics
        'num_docs'            => 0,
        'num_sections'        => 0,
        'num_bytes'           => 0,
        'num_processed_bytes' => 0,

        # what level (section/document) the gdbm database - indexer intersection is
        'gdbm_level'          => "section",

        # used by browse interface
        'doclist'             => [],
    );
    @{$self}{keys %defaults} = values %defaults;

    return bless $self, $class;
}
83
# Zero the four statistics counters (documents, sections, bytes and
# processed bytes). Nothing else on the object is touched.
sub reset {
    my ($self) = @_;

    $self->{'num_docs'}
        = $self->{'num_sections'}
        = $self->{'num_processed_bytes'}
        = $self->{'num_bytes'}
        = 0;
}
92
# Returns the number of documents counted so far (incremented by infodb).
sub get_num_docs {
    my ($self) = @_;
    return $self->{'num_docs'};
}
98
# Returns the number of sections counted so far (incremented by infodb).
sub get_num_sections {
    my ($self) = @_;
    return $self->{'num_sections'};
}
104
# num_bytes is the actual number of bytes in the collection;
# this is normally the same as what's processed during text compression.
# Returns that running total.
sub get_num_bytes {
    my ($self) = @_;
    return $self->{'num_bytes'};
}
112
# num_processed_bytes is the number of bytes actually passed
# to mg for the current index. Returns that running total.
sub get_num_processed_bytes {
    my ($self) = @_;
    return $self->{'num_processed_bytes'};
}
120
# Sets the handle that infodb prints its database records to.
sub set_output_handle {
    my ($self, $handle) = @_;
    $self->{'output_handle'} = $handle;
}
127
128
# Sets the processing mode, i.e. the name of the method that process()
# will invoke for each document (for example "text" or "infodb").
sub set_mode {
    my ($self, $mode) = @_;
    $self->{'mode'} = $mode;
}
135
# Sets the directory under which assoc_files links a document's
# associated files.
sub set_assocdir {
    my ($self, $assocdir) = @_;
    $self->{'assocdir'} = $assocdir;
}
142
# Sets the hash (reference) of metadata field names that infodb must not
# write to the database; a field is skipped when it has a defined entry.
sub set_dontgdbm {
    my ($self, $dontgdbm) = @_;
    $self->{'dontgdbm'} = $dontgdbm;
}
149
# Sets the current index specification and, optionally, the list of
# subcollection filter expressions consulted by is_subcollection_doc.
# The existing expression list is kept when $indexexparr is undefined.
sub set_index {
    my ($self, $index, $indexexparr) = @_;

    $self->{'index'} = $index;
    $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
}
157
# Sets the language filter used by is_subcollection_doc:
#   $lang_meta - name of the metadata element holding a document's language
#   $langarr   - array reference of language patterns (a leading '!' negates)
sub set_index_languages {
    my ($self, $lang_meta, $langarr) = @_;

    $self->{'lang_meta'} = $lang_meta;
    $self->{'langarr'} = $langarr;
}
164
# Returns the current index specification (as given to set_index).
sub get_index {
    my ($self) = @_;
    return $self->{'index'};
}
170
# Sets the classifier list that infodb hands to classify::classify_doc
# for every indexed document.
sub set_classifiers {
    my ($self, $classifiers) = @_;
    $self->{'classifiers'} = $classifiers;
}
177
# Sets the indexing_text flag. Nothing in this base class reads it;
# presumably it is consulted by the text() implementation of subclasses.
sub set_indexing_text {
    my ($self, $indexing_text) = @_;
    $self->{'indexing_text'} = $indexing_text;
}
184
# Returns the indexing_text flag (see set_indexing_text).
sub get_indexing_text {
    my ($self) = @_;
    return $self->{'indexing_text'};
}
190
# Sets the store_text flag (1 by default). Nothing in this base class
# reads it; presumably it is consulted by subclasses.
sub set_store_text {
    my ($self, $store_text) = @_;
    $self->{'store_text'} = $store_text;
}
# Returns the OIDs of the documents recorded by infodb, as a list
# (the element count when called in scalar context).
sub get_doc_list {
    my ($self) = @_;

    my $doclist = $self->{'doclist'};
    return @$doclist;
}
202
# the standard gdbm level is section, but you may want to change it to document.
# With "document", infodb writes only the top-level record of each document
# and numbers records by document rather than by section.
sub set_gdbm_level {
    my ($self, $gdbm_level) = @_;
    $self->{'gdbm_level'} = $gdbm_level;
}
210
# Entry point for each document: forwards all arguments to the method
# whose name is stored in the object's 'mode' (see set_mode).
sub process {
    my $self = shift;

    my $handler = $self->{'mode'};
    return $self->$handler(@_);
}
217
# Writes the info database (gdbm) records for one document to the
# object's output handle and updates the collection statistics.
#   $doc_obj  - the document; only documents of type "indexed_doc" are output
#   $filename - path of the document's archive file; its directory part
#               becomes the <archivedir> value
# For each section this emits a "[OID]" record holding its metadata, a
# "[number]" record mapping the document/section number back to the OID,
# and a "[url]" record for any URL metadata. Records end with a line of
# 70 hyphens. When gdbm_level is "document" only the top section is written.
sub infodb {
    my $self = shift (@_);
    my ($doc_obj, $filename) = @_;
    my $handle = $self->{'output_handle'};

    my $doctype = $doc_obj->get_doc_type();

    # only output this document if it is one to be indexed
    return if ($doctype ne "indexed_doc");

    # the archive directory is everything before the last path separator
    # (either / or \); normalise to forward slashes and trim them at the ends
    my ($archivedir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
    $archivedir = "" unless defined $archivedir;
    $archivedir =~ s/\\/\//g;
    $archivedir =~ s/^\/+//;
    $archivedir =~ s/\/+$//;

    # resolve the final filenames of the files associated with this document
    $self->assoc_files ($doc_obj, $archivedir);

    #GRB: moved 1/06/2004 from GRB01062004
    #add this document to the browse structure
    push(@{$self->{'doclist'}},$doc_obj->get_OID())
        unless ($doctype eq "classification");

    # classify this document
    &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
    #GRB: end of moved block

    # this is another document
    $self->{'num_docs'} += 1 unless ($doctype eq "classification");

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    my $section = $doc_obj->get_top_section ();
    my $doc_OID = $doc_obj->get_OID();
    my $first = 1;
    # NOTE(review): $url is never cleared inside the loop, so URL records of
    # earlier sections are printed again after every later section - confirm
    # whether that repetition is intended before changing it.
    my $url = "";
    while (defined $section) {
        # update a few statistics
        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        $self->{'num_sections'} += 1 unless ($doctype eq "classification");

        # output the section name
        if ($section eq "") { print $handle "[$doc_OID]\n"; }
        else { print $handle "[$doc_OID.$section]\n"; }

        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin)
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/) {
            print $handle "<doctype>doc\n";
        }

        # output whether this node contains text
        if ($doc_obj->get_text_length($section) > 0) {
            print $handle "<hastxt>1\n";
        } else {
            print $handle "<hastxt>0\n";
        }

        # output all the section metadata
        my $metadata = $doc_obj->get_all_metadata ($section);
        foreach my $pair (@$metadata) {
            my ($field, $value) = (@$pair);

            if ($field ne "Identifier" && $field !~ /^gsdl/ &&
                defined $value && $value ne "") {

                # escape problematic stuff
                $value =~ s/\\/\\\\/g;
                $value =~ s/\n/\\n/g;
                $value =~ s/\r/\\r/g;
                if ($value =~ /-{70,}/) {
                    # if value contains 70 or more hyphens in a row we need
                    # to escape them to prevent txt2db from treating them
                    # as a separator
                    $value =~ s/-/&\#045;/gi;
                }

                # special case for URL metadata
                if ($field =~ /^URL$/i) {
                    $url .= "[$value]\n";
                    if ($section eq "") {$url .= "<section>$doc_OID\n";}
                    else {$url .= "<section>$doc_OID.$section\n";}
                    $url .= '-' x 70 . "\n";
                }

                if (!defined $self->{'dontgdbm'}->{$field}) {
                    print $handle "<$field>$value\n";
                }
            }
        }

        # output archivedir if at top level
        if ($section eq $doc_obj->get_top_section()) {
            print $handle "<archivedir>$archivedir\n";
        }

        # output document display type
        if ($first) {
            print $handle "<thistype>$thistype\n";
        }

        if ($self->{'gdbm_level'} eq "document") {
            # doc num is num_docs not num_sections
            # output the matching document number
            print $handle "<docnum>$self->{'num_docs'}\n";
        } else {
            # output a list of children; each is written as a double quote
            # followed by ".N" (presumably the quote stands for this
            # section's own OID - confirm against the database reader)
            my $children = $doc_obj->get_children ($section);
            if (scalar(@$children) > 0) {
                print $handle "<childtype>$childtype\n";
                print $handle "<contains>";
                my $firstchild = 1;
                foreach my $child (@$children) {
                    print $handle ";" unless $firstchild;
                    $firstchild = 0;
                    if ($child =~ /^.*?\.(\d+)$/) {
                        print $handle "\".$1";
                    } else {
                        print $handle "\".$child";
                    }
                    # if ($child eq "") { print $handle "$doc_OID"; }
                    # elsif ($section eq "") { print $handle "$doc_OID.$child"; }
                    # else { print $handle "$doc_OID.$section.$child"; }
                }
                print $handle "\n";
            }
            #output the matching doc number
            print $handle "<docnum>$self->{'num_sections'}\n";

        }

        print $handle '-' x 70, "\n";


        # output a database entry for the document number
        if ($self->{'gdbm_level'} eq "document") {
            print $handle "[$self->{'num_docs'}]\n";
            print $handle "<section>$doc_OID\n";
        }
        else {
            print $handle "[$self->{'num_sections'}]\n";
            if ($section eq "") { print $handle "<section>$doc_OID\n"; }
            else { print $handle "<section>$doc_OID.$section\n"; }
        }
        print $handle '-' x 70, "\n";

        # output entry for url
        if ($url ne "") {
            print $handle $url;
        }

        $first = 0;
        $section = $doc_obj->get_next_section($section);
        last if ($self->{'gdbm_level'} eq "document"); # if no sections wanted, only gdbm the docs
    }

    #GRB01062004: see code above moved from here
}
379
380
# Placeholder for the indexing-text output of a document. The base class
# cannot produce it: this reports the problem on the debugging handle and
# dies, so every subclass has to override it.
sub text {
    my ($self, $doc_obj) = @_;

    print { $self->{'outhandle'} } "basebuildproc::text function must be implemented in sub classes\n";
    die "\n";
}
389
# should the document be indexed - according to the subcollection and language
# specification.
#   $doc_obj - the document to test
# Returns 1 when the document should be indexed, 0 otherwise.
#
# Each entry of indexexparr has the form "field/regex/options": a leading
# '!' on the field negates the test, the field name "filename" tests the
# source filename instead of metadata, and the option "i" makes the match
# case-insensitive. One matching entry is enough. If language filtering is
# configured (lang_meta/langarr) the document must also match one of the
# language patterns, where a leading '!' again negates the pattern.
sub is_subcollection_doc {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $indexed_doc = 1;
    foreach my $indexexp (@{$self->{'indexexparr'}}) {
        # as soon as there is at least one filter, a match is required
        $indexed_doc = 0;
        my ($field, $exp, $options) = split /\//, $indexexp;
        next unless (defined ($field) && defined ($exp));

        my $negate = ($field =~ s/^!//) ? 1 : 0;
        my $value;
        if ($field =~ /^filename$/i) {
            $value = $doc_obj->get_source_filename();
        } else {
            $value = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
        }
        next unless defined $value;

        # the options part is optional, so it may be undefined
        my $ignore_case = (defined $options && $options =~ /^i$/i);
        my $matched = $ignore_case ? ($value =~ /$exp/i) : ($value =~ /$exp/);
        if ($negate ? !$matched : $matched) {
            $indexed_doc = 1;
            last;
        }
    }

    # if this doc is so far in the sub collection, and we have lang info,
    # now we check the languages to see if it matches
    if ($indexed_doc && defined $self->{'lang_meta'}) {
        $indexed_doc = 0;
        my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
        if (defined $field) {
            foreach my $lang (@{$self->{'langarr'}}) {
                # $lang aliases the element of the shared langarr, so strip
                # the '!' from a copy; editing $lang itself would turn the
                # negated pattern into a positive one for all later documents
                my $pattern = $lang;
                my $negate = ($pattern =~ s/^!//) ? 1 : 0;
                my $matched = ($field =~ /$pattern/);
                if ($negate ? !$matched : $matched) {
                    $indexed_doc = 1;
                    last;
                }
            }
        }
    }
    return $indexed_doc;
}
449
# use 'Paged' if document has no more than 2 levels
# and each section at second level has a number for
# Title metadata
# also use Paged if gsdlthistype metadata is set to Paged
#   $doc_obj - the document to inspect
# Returns the list ($thistype, $childtype): ("VList", "VList") for a
# hierarchical document, otherwise child type "Paged" with this-type
# "Paged" when the top section has text and "Invisible" when it has none.
sub get_document_type {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $thistype = "VList";
    my $childtype = "VList";

    my $section = $doc_obj->get_top_section ();

    # an explicit gsdlthistype of Paged or Hierarchy overrides the detection
    my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
    if (defined $gsdlthistype) {
        if ($gsdlthistype eq "Paged") {
            $childtype = "Paged";
            if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
                $thistype = "Paged";
            } else {
                $thistype = "Invisible";
            }

            return ($thistype, $childtype);
        } elsif ($gsdlthistype eq "Hierarchy") {
            return ($thistype, $childtype); # use VList, VList
        }
    }

    my $first = 1;
    while (defined $section) {
        # a dotted section id means a third level: hierarchical
        my @levels = split /\./, $section;
        if (scalar(@levels) > 1) {
            return ($thistype, $childtype);
        }
        # every section below the top one needs a purely numeric Title
        if (!$first) {
            my $title = $doc_obj->get_metadata_element ($section, "Title");
            if (!defined $title || $title !~ /^\d+$/) {
                return ($thistype, $childtype);
            }
        }
        $first = 0;
        $section = $doc_obj->get_next_section($section);
    }

    if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
        $thistype = "Paged";
    } else {
        $thistype = "Invisible";
    }
    $childtype = "Paged";
    return ($thistype, $childtype);
}
503
# Hard-links every file associated with the document into the assoc
# directory.
#   $doc_obj    - document whose get_assoc_files() yields [source, name] pairs
#   $archivedir - the document's own (HASH...) directory below the assoc dir
sub assoc_files {
    my ($self, $doc_obj, $archivedir) = @_;

    foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
        my ($source, $name) = ($assoc_file->[0], $assoc_file->[1]);

        # if assoc file starts with a slash, we put it relative to the assoc
        # dir, otherwise it is relative to the HASH... directory
        my @parts = ($self->{'assocdir'});
        push (@parts, $archivedir) unless $name =~ m@^[/\\]@;

        my $target = &util::filename_cat(@parts, $name);
        &util::hard_link ($source, $target);
    }
}
520

Note: See TracBrowser for help on using the repository browser.

Download in other formats: