Context Navigation

source: gsdl/trunk/perllib/basebuildproc.pm@ 14934

Last change on this file since 14934 was 14934, checked in by davidb, 16 years ago
Changes to allow statistic calculations for metadata coverage, i.e. for this docment which metadata set prefixes are used, which fields within those prefixes are used, and how many times. This is then agregated over the all documents and the summary stored as collection level metadata.
Property svn:keywords set to `Author Date Id Revision`
File size: 18.1 KB

Line
1	###########################################################################
2	#
3	# basebuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document for indexing (should be
27	# implemented by subclass) and storing in gdbm database
28
29	package basebuildproc;
30
31	eval {require bytes};
32
33	use classify;
34	use doc;
35	use docproc;
36	use util;
37
38	BEGIN {
39	@basebuildproc::ISA = ('docproc');
40	}
41
42	sub new()
43	{
44	my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;
45	my $self = new docproc ();
46
47	# outhandle is where all the debugging info goes
48	# output_handle is where the output of the plugins is piped
49	# to (i.e. mg, gdbm etc.)
50	$outhandle = STDERR unless defined $outhandle;
51
52	$self->{'collection'} = $collection;
53	$self->{'source_dir'} = $source_dir;
54	$self->{'build_dir'} = $build_dir;
55	$self->{'keepold'} = $keepold;
56	$self->{'verbosity'} = $verbosity;
57	$self->{'outhandle'} = $outhandle;
58
59	$self->{'classifiers'} = [];
60	$self->{'mode'} = "text";
61	$self->{'assocdir'} = $build_dir;
62	$self->{'dontgdbm'} = {};
63
64	$self->{'index'} = "section:text";
65	$self->{'indexexparr'} = [];
66
67	my $found_num_data = 0;
68	my $buildconfigfile = undef;
69
70	if ($keepold) {
71	# For incremental building need to seed num_docs etc from values
72	# stored in build.cfg (if present)
73	print STDERR "Keepold!\n";
74	$buildconfigfile = &util::filename_cat($build_dir, "build.cfg");
75	print STDERR "Build cfg: $buildconfigfile\n";
76	if (-e $buildconfigfile) {
77	$found_num_data = 1;
78	}
79	else {
80	# try the index dir
81	$buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
82	"index", "build.cfg");
83	print STDERR "Index cfg: $buildconfigfile\n";
84	if (-e $buildconfigfile) {
85	$found_num_data = 1;
86	}
87	}
88
89	}
90	#else
91	# {
92	# print STDERR "Removeold!\n";
93	# }
94
95	if ($found_num_data)
96	{
97	#print STDERR "Found_Num_Data!\n";
98	my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
99	$self->{'starting_num_docs'} = $buildcfg->{'numdocs'};
100	#print STDERR "- num_docs: $self->{'starting_num_docs'}\n";
101	$self->{'starting_num_sections'} = $buildcfg->{'numsections'};
102	#print STDERR "- num_sections: $self->{'starting_num_sections'}\n";
103	$self->{'starting_num_bytes'} = $buildcfg->{'numbytes'};
104	#print STDERR "- num_bytes: $self->{'starting_num_bytes'}\n";
105	}
106	else
107	{
108	#print STDERR "NOT Found_Num_Data!\n";
109	$self->{'starting_num_docs'} = 0;
110	$self->{'starting_num_sections'} = 0;
111	$self->{'starting_num_bytes'} = 0;
112	}
113
114	$self->{'output_handle'} = "STDOUT";
115	$self->{'num_docs'} = $self->{'starting_num_docs'};
116	$self->{'num_sections'} = $self->{'starting_num_sections'};
117	$self->{'num_bytes'} = $self->{'starting_num_bytes'};
118
119	$self->{'num_processed_bytes'} = 0;
120	$self->{'store_text'} = 1;
121
122	# what level (section/document) the gdbm database - indexer intersection is
123	$self->{'gdbm_level'} = "section";
124	#used by browse interface
125	$self->{'doclist'} = [];
126
127	$self->{'indexing_text'} = 0;
128
129	return bless $self, $class;
130
131	}
132
133	sub reset {
134	my $self = shift (@_);
135
136	$self->{'num_docs'} = $self->{'starting_num_docs'};
137	$self->{'num_sections'} = $self->{'starting_num_sections'};
138	$self->{'num_bytes'} = $self->{'starting_num_bytes'};
139
140	$self->{'num_processed_bytes'} = 0;
141	}
142
143	sub zero_reset {
144	my $self = shift (@_);
145
146	$self->{'num_docs'} = 0;
147	$self->{'num_sections'} = 0;
148	$self->{'num_bytes'} = 0;
149
150	$self->{'num_processed_bytes'} = 0;
151	}
152
153	sub is_incremental_capable
154	{
155	# By default we return 'no' as the answer
156	# Safer to assume non-incremental to start with, and then override in
157	# inherited classes that are.
158
159	return 0;
160	}
161
162	sub get_num_docs {
163	my $self = shift (@_);
164
165	return $self->{'num_docs'};
166	}
167
168	sub get_num_sections {
169	my $self = shift (@_);
170
171	return $self->{'num_sections'};
172	}
173
174	# num_bytes is the actual number of bytes in the collection
175	# this is normally the same as what's processed during text compression
176	sub get_num_bytes {
177	my $self = shift (@_);
178
179	return $self->{'num_bytes'};
180	}
181
182	# num_processed_bytes is the number of bytes actually passed
183	# to mg for the current index
184	sub get_num_processed_bytes {
185	my $self = shift (@_);
186
187	return $self->{'num_processed_bytes'};
188	}
189
190	sub set_output_handle {
191	my $self = shift (@_);
192	my ($handle) = @_;
193
194	$self->{'output_handle'} = $handle;
195	}
196
197
198	sub set_mode {
199	my $self = shift (@_);
200	my ($mode) = @_;
201
202	$self->{'mode'} = $mode;
203	}
204
205	sub get_mode {
206	my $self = shift (@_);
207
208	return $self->{'mode'};
209	}
210
211	sub set_assocdir {
212	my $self = shift (@_);
213	my ($assocdir) = @_;
214
215	$self->{'assocdir'} = $assocdir;
216	}
217
218	sub set_dontgdbm {
219	my $self = shift (@_);
220	my ($dontgdbm) = @_;
221
222	$self->{'dontgdbm'} = $dontgdbm;
223	}
224
225	sub set_index {
226	my $self = shift (@_);
227	my ($index, $indexexparr) = @_;
228
229	$self->{'index'} = $index;
230	$self->{'indexexparr'} = $indexexparr if defined $indexexparr;
231	}
232
233	sub set_index_languages {
234	my $self = shift (@_);
235	my ($lang_meta, $langarr) = @_;
236	$self->{'lang_meta'} = $lang_meta;
237	$self->{'langarr'} = $langarr;
238	}
239
240	sub get_index {
241	my $self = shift (@_);
242
243	return $self->{'index'};
244	}
245
246	sub set_classifiers {
247	my $self = shift (@_);
248	my ($classifiers) = @_;
249
250	$self->{'classifiers'} = $classifiers;
251	}
252
253	sub set_indexing_text {
254	my $self = shift (@_);
255	my ($indexing_text) = @_;
256
257	$self->{'indexing_text'} = $indexing_text;
258	}
259
260	sub get_indexing_text {
261	my $self = shift (@_);
262
263	return $self->{'indexing_text'};
264	}
265
266	sub set_store_text {
267	my $self = shift (@_);
268	my ($store_text) = @_;
269
270	$self->{'store_text'} = $store_text;
271	}
272	sub get_doc_list {
273	my $self = shift(@_);
274
275	return @{$self->{'doclist'}};
276	}
277
278	# the standard gdbm level is section, but you may want to change it to document
279	sub set_gdbm_level {
280	my $self= shift (@_);
281	my ($gdbm_level) = @_;
282
283	$self->{'gdbm_level'} = $gdbm_level;
284	}
285
286	sub set_sections_index_document_metadata {
287	my $self= shift (@_);
288	my ($index_type) = @_;
289
290	$self->{'sections_index_document_metadata'} = $index_type;
291	}
292	sub process {
293	my $self = shift (@_);
294	my $method = $self->{'mode'};
295
296	$self->$method(@_);
297	}
298
299
300
301	sub infodb_metadata_stats
302	{
303	my $self = shift (@_);
304	my ($field) = @_;
305
306	# Keep some statistics relating to metadata sets used and
307	# frequency of particular metadata fields within each set
308
309	# Union of metadata prefixes and frequency of fields
310	# (both scoped for this document alone, and across whole collection)
311
312	if ($field =~ m/^(.+)\.(.*)$/) {
313	my $prefix = $1;
314	my $core_field = $2;
315
316	$self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field}++;
317	$self->{'mdprefix_fields'}->{$prefix}->{$core_field}++;
318	}
319	elsif ($field =~ m/^[[:upper:]]/) {
320	# implicit 'ex' metadata set
321
322	$self->{'doc_mdprefix_fields'}->{'ex'}->{$field}++;
323	$self->{'mdprefix_fields'}->{'ex'}->{$field}++;
324	}
325
326	}
327
328
329	sub infodb {
330	my $self = shift (@_);
331	my ($doc_obj, $filename) = @_;
332	my $handle = $self->{'output_handle'};
333
334	my $doctype = $doc_obj->get_doc_type();
335
336	# only output this document if it is a "indexed_doc" or "info_doc" (GDBM database only) document
337	return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");
338
339	my $archivedir = "";
340
341	if (defined $filename)
342	{
343	# doc_obj derived directly from file
344
345	my ($dir) = $filename =~ /^(.?)(?:\/\|\\)[^\/\\]$/;
346	$dir = "" unless defined $dir;
347	$dir =~ s/\\/\//g;
348	$dir =~ s/^\/+//;
349	$dir =~ s/\/+$//;
350
351	$archivedir = $dir;
352
353	# resolve the final filenames of the files associated with this document
354	$self->assoc_files ($doc_obj, $archivedir);
355	}
356	else
357	{
358	# doc_obj reconstructed from GDBM (has metadata, doc structure but no text)
359	my $top_section = $doc_obj->get_top_section();
360	$archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
361	}
362
363
364	#GRB: moved 1/06/2004 from GRB01062004
365	#add this document to the browse structure
366	push(@{$self->{'doclist'}},$doc_obj->get_OID())
367	unless ($doctype eq "classification");
368
369	# classify this document
370	&classify::classify_doc ($self->{'classifiers'}, $doc_obj);
371	#GRB: end of moved block
372
373	# this is another document
374	$self->{'num_docs'} += 1 unless ($doctype eq "classification");
375
376	# is this a paged or a hierarchical document
377	my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
378
379	my $section = $doc_obj->get_top_section ();
380	my $doc_OID = $doc_obj->get_OID();
381	my $first = 1;
382	my $url = "";
383
384	$self->{'doc_mdprefix_fields'} = {};
385
386	while (defined $section) {
387	# update a few statistics
388	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
389	$self->{'num_sections'} += 1 unless ($doctype eq "classification");
390
391	# output the section name
392	if ($section eq "") { print $handle "[$doc_OID]\n"; }
393	else { print $handle "[$doc_OID.$section]\n"; }
394
395	# output the fact that this document is a document (unless doctype
396	# has been set to something else from within a plugin
397	my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
398	if (!defined $dtype \|\| $dtype !~ /\w/) {
399	print $handle "<doctype>doc\n";
400	}
401
402	# Output whether this node contains text
403	#
404	# If doc_obj reconstructed from GDBM file then no need to
405	# explicitly add <hastxt> as this is preserved as metadata when
406	# the GDBM file is loaded in
407
408	if (defined $filename)
409	{
410	# doc_obj derived directly from file
411	if ($doc_obj->get_text_length($section) > 0) {
412	print $handle "<hastxt>1\n";
413	} else {
414	print $handle "<hastxt>0\n";
415	}
416	}
417
418	# output all the section metadata
419	my $metadata = $doc_obj->get_all_metadata ($section);
420	foreach my $pair (@$metadata) {
421	my ($field, $value) = (@$pair);
422
423	if ($field ne "Identifier" && $field !~ /^gsdl/ &&
424	defined $value && $value ne "") {
425
426	# escape problematic stuff
427	$value =~ s/\\/\\\\/g;
428	$value =~ s/\n/\\n/g;
429	$value =~ s/\r/\\r/g;
430	if ($value =~ /-{70,}/) {
431	# if value contains 70 or more hyphens in a row we need
432	# to escape them to prevent txt2db from treating them
433	# as a separator
434	$value =~ s/-/&\#045;/gi;
435	}
436
437	# special case for URL metadata
438	if ($field =~ /^URL$/i) {
439	$url .= "[$value]\n";
440	if ($section eq "") {$url .= "<section>$doc_OID\n";}
441	else {$url .= "<section>$doc_OID.$section\n";}
442	$url .= '-' x 70 . "\n";
443	}
444
445	if (!defined $self->{'dontgdbm'}->{$field}) {
446	print $handle "<$field>$value\n";
447
448	if ($section eq "")
449	{
450	$self->infodb_metadata_stats($field);
451	}
452	}
453	}
454	}
455
456	if ($section eq "")
457	{
458	my $doc_mdprefix_fields = $self->{'doc_mdprefix_fields'};
459
460	foreach my $prefix (keys %$doc_mdprefix_fields)
461	{
462	print $handle "<metadataset>$prefix\n";
463
464	foreach my $field (keys %{$doc_mdprefix_fields->{$prefix}})
465	{
466	my $val = $doc_mdprefix_fields->{$prefix}->{$field};
467
468	print $handle "<metadatalist-$prefix>$field\n";
469	print $handle "<metadatafreq-$prefix-$field>$val\n";
470	}
471
472	}
473	}
474
475	# If doc_obj reconstructed from GDBM file then no need to
476	# explicitly add <archivedir> as this is preserved as metadata when
477	# the GDBM file is loaded in
478
479	if (defined $filename)
480	{
481	# output archivedir if at top level
482	if ($section eq $doc_obj->get_top_section()) {
483	print $handle "<archivedir>$archivedir\n";
484	}
485	}
486
487	# output document display type
488	if ($first) {
489	print $handle "<thistype>$thistype\n";
490	}
491
492
493	if ($self->{'gdbm_level'} eq "document") {
494	# doc num is num_docs not num_sections
495	# output the matching document number
496	print $handle "<docnum>$self->{'num_docs'}\n";
497
498	} else {
499	# output a list of children
500	my $children = $doc_obj->get_children ($section);
501	if (scalar(@$children) > 0) {
502	print $handle "<childtype>$childtype\n";
503	print $handle "<contains>";
504	my $firstchild = 1;
505	foreach my $child (@$children) {
506	print $handle ";" unless $firstchild;
507	$firstchild = 0;
508	if ($child =~ /^.*?\.(\d+)$/) {
509	print $handle "\".$1";
510	} else {
511	print $handle "\".$child";
512	}
513	# if ($child eq "") { print $handle "$doc_OID"; }
514	# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
515	# else { print $handle "$doc_OID.$section.$child"; }
516	}
517	print $handle "\n";
518	}
519	#output the matching doc number
520	print $handle "<docnum>$self->{'num_sections'}\n";
521
522	}
523
524	print $handle '-' x 70, "\n";
525
526
527	# output a database entry for the document number
528	if ($self->{'gdbm_level'} eq "document") {
529	print $handle "[$self->{'num_docs'}]\n";
530	print $handle "<section>$doc_OID\n";
531	}
532	else {
533	print $handle "[$self->{'num_sections'}]\n";
534	if ($section eq "") { print $handle "<section>$doc_OID\n"; }
535	else { print $handle "<section>$doc_OID.$section\n"; }
536	}
537	print $handle '-' x 70, "\n";
538
539	# output entry for url
540	if ($url ne "") {
541	print $handle $url;
542	}
543
544	$first = 0;
545	$section = $doc_obj->get_next_section($section);
546	last if ($self->{'gdbm_level'} eq "document"); # if no sections wanted, only gdbm the docs
547	}
548
549	#GRB01062004: see code above moved from here
550	}
551
552
553	sub text {
554	my $self = shift (@_);
555	my ($doc_obj) = @_;
556
557	my $handle = $self->{'outhandle'};
558	print $handle "basebuildproc::text function must be implemented in sub classes\n";
559	die "\n";
560	}
561
562	# should the document be indexed - according to the subcollection and language
563	# specification.
564	sub is_subcollection_doc {
565	my $self = shift (@_);
566	my ($doc_obj) = @_;
567
568	my $indexed_doc = 1;
569	foreach my $indexexp (@{$self->{'indexexparr'}}) {
570	$indexed_doc = 0;
571	my ($field, $exp, $options) = split /\//, $indexexp;
572	if (defined ($field) && defined ($exp)) {
573	my ($bool) = $field =~ /^(.)/;
574	$field =~ s/^.// if $bool eq '!';
575	my @metadata_values;
576	if ($field =~ /^filename$/i) {
577	push(@metadata_values, $doc_obj->get_source_filename());
578	}
579	else {
580	@metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
581	}
582	next unless @metadata_values;
583	foreach my $metadata_value (@metadata_values) {
584	if ($bool eq '!') {
585	if ($options =~ /^i$/i) {
586	if ($metadata_value !~ /$exp/i) {$indexed_doc = 1; last;}
587	} else {
588	if ($metadata_value !~ /$exp/) {$indexed_doc = 1; last;}
589	}
590	} else {
591	if ($options =~ /^i$/i) {
592	if ($metadata_value =~ /$exp/i) {$indexed_doc = 1; last;}
593	} else {
594	if ($metadata_value =~ /$exp/) {$indexed_doc = 1; last;}
595	}
596	}
597	}
598
599	last if ($indexed_doc == 1);
600	}
601	}
602
603	# if this doc is so far in the sub collection, and we have lang info,
604	# now we check the languages to see if it matches
605	if($indexed_doc && defined $self->{'lang_meta'}) {
606	$indexed_doc = 0;
607	my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
608	if (defined $field) {
609	foreach my $lang (@{$self->{'langarr'}}) {
610	my ($bool) = $lang =~ /^(.)/;
611	if ($bool eq '!') {
612	$lang =~ s/^.//;
613	if ($field !~ /$lang/) {
614	$indexed_doc = 1; last;
615	}
616	} else {
617	if ($field =~ /$lang/) {
618	$indexed_doc = 1; last;
619	}
620	}
621	}
622	}
623	}
624	return $indexed_doc;
625
626	}
627
628	# use 'Paged' if document has no more than 2 levels
629	# and each section at second level has a number for
630	# Title metadata
631	# also use Paged if gsdlthistype metadata is set to Paged
632	sub get_document_type {
633	my $self = shift (@_);
634	my ($doc_obj) = @_;
635
636	my $thistype = "VList";
637	my $childtype = "VList";
638	my $title;
639	my @tmp = ();
640
641	my $section = $doc_obj->get_top_section ();
642
643	my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
644	if (defined $gsdlthistype) {
645	if ($gsdlthistype eq "Paged") {
646	$childtype = "Paged";
647	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
648	$thistype = "Paged";
649	} else {
650	$thistype = "Invisible";
651	}
652
653	return ($thistype, $childtype);
654	} elsif ($gsdlthistype eq "Hierarchy") {
655	return ($thistype, $childtype); # use VList, VList
656	}
657	}
658	my $first = 1;
659	while (defined $section) {
660	@tmp = split /\./, $section;
661	if (scalar(@tmp) > 1) {
662	return ($thistype, $childtype);
663	}
664	if (!$first) {
665	$title = $doc_obj->get_metadata_element ($section, "Title");
666	if (!defined $title \|\| $title !~ /^\d+$/) {
667	return ($thistype, $childtype);
668	}
669	}
670	$first = 0;
671	$section = $doc_obj->get_next_section($section);
672	}
673	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
674	$thistype = "Paged";
675	} else {
676	$thistype = "Invisible";
677	}
678	$childtype = "Paged";
679	return ($thistype, $childtype);
680	}
681
682	sub assoc_files() {
683	my $self = shift (@_);
684	my ($doc_obj, $archivedir) = @_;
685	my ($afile);
686
687	foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
688	#rint STDERR "Processing associated file - copy " . $assoc_file->[0] . " to " . $assoc_file->[1] . "\n";
689	# if assoc file starts with a slash, we put it relative to the assoc
690	# dir, otherwise it is relative to the HASH... directory
691	if ($assoc_file->[1] =~ m@^[/\\]@) {
692	$afile = &util::filename_cat($self->{'assocdir'}, $assoc_file->[1]);
693	} else {
694	$afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
695	}
696	&util::hard_link ($assoc_file->[0], $afile);
697	}
698	}
699

Note: See TracBrowser for help on using the repository browser.

Download in other formats: