Context Navigation

source: trunk/gsdl/perllib/basebuildproc.pm@ 14079

Last change on this file since 14079 was 12844, checked in by mdewsnip, 18 years ago
Incremental building and dynamic GDBM updating code, many thanks to John Rowe and John Thompson at DL Consulting Ltd.
Property svn:keywords set to `Author Date Id Revision`
File size: 16.9 KB

Rev	Line
[9919]	1	###########################################################################
	2	#
	3	# basebuildproc.pm --
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
	26	# This document processor outputs a document for indexing (should be
	27	# implemented by subclass) and storing in gdbm database
	28
	29	package basebuildproc;
	30
	31	eval {require bytes};
	32
	33	use classify;
	34	use doc;
	35	use docproc;
	36	use util;
	37
	38	BEGIN {
	39	@basebuildproc::ISA = ('docproc');
	40	}
	41
# Constructor.
#
# Arguments (positional):
#   $class      - class name the new object is blessed into
#   $collection - stored as 'collection'
#   $source_dir - stored as 'source_dir'
#   $build_dir  - stored as 'build_dir'; also the initial 'assocdir'
#   $keepold    - when true (incremental building) the document, section
#                 and byte counters are seeded from an existing build.cfg
#   $verbosity  - stored as 'verbosity'
#   $outhandle  - handle for debugging output; defaults to STDERR
#
# Returns the object created by docproc->new(), blessed into $class.
sub new
{
    my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;
    my $self = docproc->new();

    # outhandle is where all the debugging info goes
    # output_handle is where the output of the plugins is piped
    # to (i.e. mg, gdbm etc.)
    $outhandle = 'STDERR' unless defined $outhandle;

    $self->{'collection'} = $collection;
    $self->{'source_dir'} = $source_dir;
    $self->{'build_dir'} = $build_dir;
    $self->{'keepold'} = $keepold;
    $self->{'verbosity'} = $verbosity;
    $self->{'outhandle'} = $outhandle;

    $self->{'classifiers'} = [];
    $self->{'mode'} = "text";
    $self->{'assocdir'} = $build_dir;
    $self->{'dontgdbm'} = {};

    $self->{'index'} = "section:text";
    $self->{'indexexparr'} = [];

    my $found_num_data = 0;
    my $buildconfigfile = undef;

    if ($keepold) {
        # For incremental building need to seed num_docs etc from values
        # stored in build.cfg (if present)
        print $outhandle "Keepold!\n";
        $buildconfigfile = &util::filename_cat($build_dir, "build.cfg");
        print $outhandle "Build cfg: $buildconfigfile\n";
        if (-e $buildconfigfile) {
            $found_num_data = 1;
        }
        else {
            # try the index dir
            $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
                                                   "index", "build.cfg");
            print $outhandle "Index cfg: $buildconfigfile\n";
            if (-e $buildconfigfile) {
                $found_num_data = 1;
            }
        }
    }

    # Counters start at zero unless a previous build.cfg supplies them.
    my ($start_docs, $start_sections, $start_bytes) = (0, 0, 0);

    if ($found_num_data) {
        # colcfg is not loaded at the top of this file, so load it here
        # rather than relying on the caller having done so already.
        require colcfg;
        my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);

        # A build.cfg that lacks one of these entries must not leave the
        # corresponding counter undefined.
        $start_docs     = $buildcfg->{'numdocs'}     if defined $buildcfg->{'numdocs'};
        $start_sections = $buildcfg->{'numsections'} if defined $buildcfg->{'numsections'};
        $start_bytes    = $buildcfg->{'numbytes'}    if defined $buildcfg->{'numbytes'};
    }

    $self->{'starting_num_docs'} = $start_docs;
    $self->{'starting_num_sections'} = $start_sections;
    $self->{'starting_num_bytes'} = $start_bytes;

    $self->{'output_handle'} = "STDOUT";
    $self->{'num_docs'} = $self->{'starting_num_docs'};
    $self->{'num_sections'} = $self->{'starting_num_sections'};
    $self->{'num_bytes'} = $self->{'starting_num_bytes'};

    $self->{'num_processed_bytes'} = 0;
    $self->{'store_text'} = 1;

    # what level (section/document) the gdbm database - indexer intersection is
    $self->{'gdbm_level'} = "section";
    # used by browse interface
    $self->{'doclist'} = [];

    $self->{'indexing_text'} = 0;

    return bless $self, $class;
}
	132
	133	sub reset {
	134	my $self = shift (@_);
[10159]	135
	136	$self->{'num_docs'} = $self->{'starting_num_docs'};
	137	$self->{'num_sections'} = $self->{'starting_num_sections'};
	138	$self->{'num_bytes'} = $self->{'starting_num_bytes'};
[9919]	139
	140	$self->{'num_processed_bytes'} = 0;
	141	}
	142
# Set the document, section and byte counters to zero, ignoring the
# starting values, and clear the processed-bytes counter.
sub zero_reset {
    my ($self) = @_;

    foreach my $counter ('num_docs', 'num_sections', 'num_bytes') {
        $self->{$counter} = 0;
    }

    $self->{'num_processed_bytes'} = 0;
}
	152
# Report whether this build processor supports incremental building.
# The base class answers 'no' (0): it is safer to assume non-incremental
# and let the inherited classes that are capable override this.
sub is_incremental_capable
{
    my $capable = 0;
    return $capable;
}
	161
# Return the current value of the 'num_docs' counter.
sub get_num_docs {
    my ($self) = @_;

    return $self->{'num_docs'};
}
	167
	168	sub get_num_sections {
	169	my $self = shift (@_);
	170
	171	return $self->{'num_sections'};
	172	}
	173
	174	# num_bytes is the actual number of bytes in the collection
	175	# this is normally the same as what's processed during text compression
	176	sub get_num_bytes {
	177	my $self = shift (@_);
	178
	179	return $self->{'num_bytes'};
	180	}
	181
	182	# num_processed_bytes is the number of bytes actually passed
	183	# to mg for the current index
	184	sub get_num_processed_bytes {
	185	my $self = shift (@_);
	186
	187	return $self->{'num_processed_bytes'};
	188	}
	189
	190	sub set_output_handle {
	191	my $self = shift (@_);
	192	my ($handle) = @_;
	193
	194	$self->{'output_handle'} = $handle;
	195	}
	196
	197
	198	sub set_mode {
	199	my $self = shift (@_);
	200	my ($mode) = @_;
	201
	202	$self->{'mode'} = $mode;
	203	}
	204
# Return 'mode': the name of the method that process() dispatches to.
sub get_mode {
    my ($self) = @_;

    return $self->{'mode'};
}
	210
# Set 'assocdir': the base directory assoc_files() links associated
# files into.
sub set_assocdir {
    my ($self, $assocdir) = @_;

    $self->{'assocdir'} = $assocdir;
}
	217
	218	sub set_dontgdbm {
	219	my $self = shift (@_);
	220	my ($dontgdbm) = @_;
	221
	222	$self->{'dontgdbm'} = $dontgdbm;
	223	}
	224
	225	sub set_index {
	226	my $self = shift (@_);
	227	my ($index, $indexexparr) = @_;
	228
	229	$self->{'index'} = $index;
	230	$self->{'indexexparr'} = $indexexparr if defined $indexexparr;
	231	}
	232
	233	sub set_index_languages {
	234	my $self = shift (@_);
	235	my ($lang_meta, $langarr) = @_;
	236	$self->{'lang_meta'} = $lang_meta;
	237	$self->{'langarr'} = $langarr;
	238	}
	239
	240	sub get_index {
	241	my $self = shift (@_);
	242
	243	return $self->{'index'};
	244	}
	245
	246	sub set_classifiers {
	247	my $self = shift (@_);
	248	my ($classifiers) = @_;
	249
	250	$self->{'classifiers'} = $classifiers;
	251	}
	252
	253	sub set_indexing_text {
	254	my $self = shift (@_);
	255	my ($indexing_text) = @_;
	256
	257	$self->{'indexing_text'} = $indexing_text;
	258	}
	259
	260	sub get_indexing_text {
	261	my $self = shift (@_);
	262
	263	return $self->{'indexing_text'};
	264	}
	265
	266	sub set_store_text {
	267	my $self = shift (@_);
	268	my ($store_text) = @_;
	269
	270	$self->{'store_text'} = $store_text;
	271	}
	272	sub get_doc_list {
	273	my $self = shift(@_);
	274
	275	return @{$self->{'doclist'}};
	276	}
	277
	278	# the standard gdbm level is section, but you may want to change it to document
	279	sub set_gdbm_level {
	280	my $self= shift (@_);
	281	my ($gdbm_level) = @_;
	282
	283	$self->{'gdbm_level'} = $gdbm_level;
	284	}
	285
# Store $index_type under 'sections_index_document_metadata'.
sub set_sections_index_document_metadata {
    my ($self, $index_type) = @_;

    $self->{'sections_index_document_metadata'} = $index_type;
}
# Dispatch to the method named by 'mode' (for example "text" or
# "infodb"), forwarding every remaining argument unchanged.
sub process {
    my $self = shift;

    my $handler = $self->{'mode'};
    return $self->$handler(@_);
}
	298
	299	sub infodb {
	300	my $self = shift (@_);
	301	my ($doc_obj, $filename) = @_;
	302	my $handle = $self->{'output_handle'};
	303
	304	my $doctype = $doc_obj->get_doc_type();
	305
[11793]	306	# only output this document if it is a "indexed_doc" or "info_doc" (GDBM database only) document
	307	return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");
[9919]	308
[11994]	309	my $archivedir = "";
[9919]	310
[11994]	311	if (defined $filename)
	312	{
	313	# doc_obj derived directly from file
[9919]	314
[11994]	315	my ($dir) = $filename =~ /^(.?)(?:\/\|\\)[^\/\\]$/;
	316	$dir = "" unless defined $dir;
	317	$dir =~ s/\\/\//g;
	318	$dir =~ s/^\/+//;
	319	$dir =~ s/\/+$//;
	320
	321	$archivedir = $dir;
	322
	323	# resolve the final filenames of the files associated with this document
	324	$self->assoc_files ($doc_obj, $archivedir);
	325	}
	326	else
	327	{
	328	# doc_obj reconstructed from GDBM (has metadata, doc structure but no text)
	329	my $top_section = $doc_obj->get_top_section();
	330	$archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
	331	}
	332
	333
[9919]	334	#GRB: moved 1/06/2004 from GRB01062004
	335	#add this document to the browse structure
	336	push(@{$self->{'doclist'}},$doc_obj->get_OID())
	337	unless ($doctype eq "classification");
	338
	339	# classify this document
	340	&classify::classify_doc ($self->{'classifiers'}, $doc_obj);
	341	#GRB: end of moved block
	342
	343	# this is another document
	344	$self->{'num_docs'} += 1 unless ($doctype eq "classification");
	345
	346	# is this a paged or a hierarchical document
	347	my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
	348
	349	my $section = $doc_obj->get_top_section ();
	350	my $doc_OID = $doc_obj->get_OID();
	351	my $first = 1;
	352	my $url = "";
	353	while (defined $section) {
	354	# update a few statistics
	355	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
	356	$self->{'num_sections'} += 1 unless ($doctype eq "classification");
	357
	358	# output the section name
	359	if ($section eq "") { print $handle "[$doc_OID]\n"; }
	360	else { print $handle "[$doc_OID.$section]\n"; }
	361
	362	# output the fact that this document is a document (unless doctype
	363	# has been set to something else from within a plugin
	364	my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
	365	if (!defined $dtype \|\| $dtype !~ /\w/) {
	366	print $handle "<doctype>doc\n";
	367	}
	368
[11994]	369	# Output whether this node contains text
	370	#
	371	# If doc_obj reconstructed from GDBM file then no need to
	372	# explicitly add <hastxt> as this is preserved as metadata when
	373	# the GDBM file is loaded in
	374
	375	if (defined $filename)
	376	{
	377	# doc_obj derived directly from file
	378	if ($doc_obj->get_text_length($section) > 0) {
	379	print $handle "<hastxt>1\n";
	380	} else {
	381	print $handle "<hastxt>0\n";
	382	}
[9919]	383	}
	384
	385	# output all the section metadata
	386	my $metadata = $doc_obj->get_all_metadata ($section);
	387	foreach my $pair (@$metadata) {
	388	my ($field, $value) = (@$pair);
	389
	390	if ($field ne "Identifier" && $field !~ /^gsdl/ &&
	391	defined $value && $value ne "") {
	392
	393	# escape problematic stuff
	394	$value =~ s/\\/\\\\/g;
	395	$value =~ s/\n/\\n/g;
	396	$value =~ s/\r/\\r/g;
	397	if ($value =~ /-{70,}/) {
	398	# if value contains 70 or more hyphens in a row we need
	399	# to escape them to prevent txt2db from treating them
	400	# as a separator
	401	$value =~ s/-/&\#045;/gi;
	402	}
	403
	404	# special case for URL metadata
	405	if ($field =~ /^URL$/i) {
	406	$url .= "[$value]\n";
	407	if ($section eq "") {$url .= "<section>$doc_OID\n";}
	408	else {$url .= "<section>$doc_OID.$section\n";}
	409	$url .= '-' x 70 . "\n";
	410	}
	411
	412	if (!defined $self->{'dontgdbm'}->{$field}) {
	413	print $handle "<$field>$value\n";
	414	}
	415	}
	416	}
	417
[11994]	418
	419	# If doc_obj reconstructed from GDBM file then no need to
	420	# explicitly add <archivedir> as this is preserved as metadata when
	421	# the GDBM file is loaded in
	422
	423	if (defined $filename)
	424	{
	425	# output archivedir if at top level
	426	if ($section eq $doc_obj->get_top_section()) {
	427	print $handle "<archivedir>$archivedir\n";
	428	}
[9919]	429	}
	430
	431	# output document display type
	432	if ($first) {
	433	print $handle "<thistype>$thistype\n";
	434	}
	435
[11994]	436
[9919]	437	if ($self->{'gdbm_level'} eq "document") {
	438	# doc num is num_docs not num_sections
	439	# output the matching document number
	440	print $handle "<docnum>$self->{'num_docs'}\n";
[12844]	441
[9919]	442	} else {
	443	# output a list of children
	444	my $children = $doc_obj->get_children ($section);
	445	if (scalar(@$children) > 0) {
	446	print $handle "<childtype>$childtype\n";
	447	print $handle "<contains>";
	448	my $firstchild = 1;
	449	foreach my $child (@$children) {
	450	print $handle ";" unless $firstchild;
	451	$firstchild = 0;
	452	if ($child =~ /^.*?\.(\d+)$/) {
	453	print $handle "\".$1";
	454	} else {
	455	print $handle "\".$child";
	456	}
	457	# if ($child eq "") { print $handle "$doc_OID"; }
	458	# elsif ($section eq "") { print $handle "$doc_OID.$child"; }
	459	# else { print $handle "$doc_OID.$section.$child"; }
	460	}
	461	print $handle "\n";
	462	}
	463	#output the matching doc number
	464	print $handle "<docnum>$self->{'num_sections'}\n";
	465
	466	}
	467
	468	print $handle '-' x 70, "\n";
	469
	470
	471	# output a database entry for the document number
	472	if ($self->{'gdbm_level'} eq "document") {
	473	print $handle "[$self->{'num_docs'}]\n";
	474	print $handle "<section>$doc_OID\n";
	475	}
	476	else {
	477	print $handle "[$self->{'num_sections'}]\n";
	478	if ($section eq "") { print $handle "<section>$doc_OID\n"; }
	479	else { print $handle "<section>$doc_OID.$section\n"; }
	480	}
	481	print $handle '-' x 70, "\n";
	482
	483	# output entry for url
	484	if ($url ne "") {
	485	print $handle $url;
	486	}
	487
	488	$first = 0;
	489	$section = $doc_obj->get_next_section($section);
	490	last if ($self->{'gdbm_level'} eq "document"); # if no sections wanted, only gdbm the docs
	491	}
	492
	493	#GRB01062004: see code above moved from here
	494	}
	495
	496
	497	sub text {
	498	my $self = shift (@_);
	499	my ($doc_obj) = @_;
	500
	501	my $handle = $self->{'outhandle'};
	502	print $handle "basebuildproc::text function must be implemented in sub classes\n";
	503	die "\n";
	504	}
	505
	506	# should the document be indexed - according to the subcollection and language
	507	# specification.
	508	sub is_subcollection_doc {
	509	my $self = shift (@_);
	510	my ($doc_obj) = @_;
	511
	512	my $indexed_doc = 1;
	513	foreach my $indexexp (@{$self->{'indexexparr'}}) {
	514	$indexed_doc = 0;
	515	my ($field, $exp, $options) = split /\//, $indexexp;
	516	if (defined ($field) && defined ($exp)) {
	517	my ($bool) = $field =~ /^(.)/;
	518	$field =~ s/^.// if $bool eq '!';
[10028]	519	my @metadata_values;
[9919]	520	if ($field =~ /^filename$/i) {
[10028]	521	push(@metadata_values, $doc_obj->get_source_filename());
[9919]	522	}
[10028]	523	else {
	524	@metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
	525	}
	526	next unless @metadata_values;
	527	foreach my $metadata_value (@metadata_values) {
	528	if ($bool eq '!') {
	529	if ($options =~ /^i$/i) {
	530	if ($metadata_value !~ /$exp/i) {$indexed_doc = 1; last;}
	531	} else {
	532	if ($metadata_value !~ /$exp/) {$indexed_doc = 1; last;}
	533	}
[9919]	534	} else {
[10028]	535	if ($options =~ /^i$/i) {
	536	if ($metadata_value =~ /$exp/i) {$indexed_doc = 1; last;}
	537	} else {
	538	if ($metadata_value =~ /$exp/) {$indexed_doc = 1; last;}
	539	}
[9919]	540	}
	541	}
[10028]	542
	543	last if ($indexed_doc == 1);
[9919]	544	}
	545	}
	546
	547	# if this doc is so far in the sub collection, and we have lang info,
	548	# now we check the languages to see if it matches
	549	if($indexed_doc && defined $self->{'lang_meta'}) {
	550	$indexed_doc = 0;
	551	my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
	552	if (defined $field) {
	553	foreach my $lang (@{$self->{'langarr'}}) {
	554	my ($bool) = $lang =~ /^(.)/;
	555	if ($bool eq '!') {
	556	$lang =~ s/^.//;
	557	if ($field !~ /$lang/) {
	558	$indexed_doc = 1; last;
	559	}
	560	} else {
	561	if ($field =~ /$lang/) {
	562	$indexed_doc = 1; last;
	563	}
	564	}
	565	}
	566	}
	567	}
	568	return $indexed_doc;
	569
	570	}
	571
	572	# use 'Paged' if document has no more than 2 levels
	573	# and each section at second level has a number for
	574	# Title metadata
	575	# also use Paged if gsdlthistype metadata is set to Paged
	576	sub get_document_type {
	577	my $self = shift (@_);
	578	my ($doc_obj) = @_;
	579
	580	my $thistype = "VList";
	581	my $childtype = "VList";
	582	my $title;
	583	my @tmp = ();
	584
	585	my $section = $doc_obj->get_top_section ();
	586
	587	my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
	588	if (defined $gsdlthistype) {
	589	if ($gsdlthistype eq "Paged") {
	590	$childtype = "Paged";
	591	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
	592	$thistype = "Paged";
	593	} else {
	594	$thistype = "Invisible";
	595	}
	596
	597	return ($thistype, $childtype);
	598	} elsif ($gsdlthistype eq "Hierarchy") {
	599	return ($thistype, $childtype); # use VList, VList
	600	}
	601	}
	602	my $first = 1;
	603	while (defined $section) {
	604	@tmp = split /\./, $section;
	605	if (scalar(@tmp) > 1) {
	606	return ($thistype, $childtype);
	607	}
	608	if (!$first) {
	609	$title = $doc_obj->get_metadata_element ($section, "Title");
	610	if (!defined $title \|\| $title !~ /^\d+$/) {
	611	return ($thistype, $childtype);
	612	}
	613	}
	614	$first = 0;
	615	$section = $doc_obj->get_next_section($section);
	616	}
	617	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
	618	$thistype = "Paged";
	619	} else {
	620	$thistype = "Invisible";
	621	}
	622	$childtype = "Paged";
	623	return ($thistype, $childtype);
	624	}
	625
# Link every file associated with $doc_obj to its final location below
# 'assocdir'.
#
# Arguments:
#   $doc_obj    - document whose get_assoc_files() list is processed; each
#                 entry is an array reference of [source file, target name]
#   $archivedir - the document's directory, relative to 'assocdir'
#
# A target name starting with "/" or "\" is placed relative to
# 'assocdir'; any other name is placed relative to 'assocdir' joined
# with $archivedir (the HASH... directory).
sub assoc_files {
    my $self = shift (@_);
    my ($doc_obj, $archivedir) = @_;

    foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
        my ($source, $target) = ($assoc_file->[0], $assoc_file->[1]);
        #print STDERR "Processing associated file - copy " . $source . " to " . $target . "\n";

        # if assoc file starts with a slash, we put it relative to the assoc
        # dir, otherwise it is relative to the HASH... directory
        my $afile;
        if ($target =~ m@^[/\\]@) {
            $afile = &util::filename_cat($self->{'assocdir'}, $target);
        } else {
            $afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $target);
        }
        &util::hard_link ($source, $afile);
    }
}
	643

Note: See TracBrowser for help on using the repository browser.

Download in other formats: