Context Navigation

source: trunk/gsdl/perllib/mgbuildproc.pm@ 8647

Last change on this file since 8647 was 8402, checked in by kjdon, 20 years ago
Fixed up the header page handling with pagedimgplug: docs always have a top-level section, with all the images underneath. If there is text in the top section, then this becomes a header page; otherwise, this is an invisible page.
Property svn:executable set to `*`; Property svn:keywords set to `Author Date Id Revision`
File size: 14.3 KB

Rev	Line
[537]	1	###########################################################################
	2	#
	3	# mgbuildproc.pm --
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
[17]	26	# This document processor outputs a document
	27	# for mg to process
[4]	28
	29
	30	package mgbuildproc;
	31
[3834]	32	eval {require bytes};
[3767]	33
[215]	34	use classify;
	35	use doc;
[4]	36	use docproc;
	37	use util;
	38
	39	BEGIN {
	40	@ISA = ('docproc');
	41	}
	42
	43
	44	sub new {
[1424]	45	my ($class, $collection, $source_dir, $build_dir,
	46	$verbosity, $outhandle) = @_;
[4]	47	my $self = new docproc ();
	48
[1424]	49	# outhandle is where all the debugging info goes
	50	# output_handle is where the output of the plugins is piped
	51	# to (i.e. mg, gdbm etc.)
	52	$outhandle = STDERR unless defined $outhandle;
	53
[4]	54	$self->{'collection'} = $collection;
	55	$self->{'source_dir'} = $source_dir;
	56	$self->{'build_dir'} = $build_dir;
	57	$self->{'verbosity'} = $verbosity;
[215]	58	$self->{'classifiers'} = [];
[4]	59	$self->{'mode'} = "text";
[810]	60	$self->{'assocdir'} = $build_dir;
[780]	61	$self->{'dontgdbm'} = {};
[4]	62	$self->{'index'} = "section:text";
[69]	63	$self->{'indexexparr'} = [];
[4]	64	$self->{'output_handle'} = "STDOUT";
	65	$self->{'num_docs'} = 0;
	66	$self->{'num_sections'} = 0;
	67	$self->{'num_bytes'} = 0;
[1251]	68	$self->{'num_processed_bytes'} = 0;
[2336]	69	$self->{'store_text'} = 1;
[1424]	70	$self->{'outhandle'} = $outhandle;
[2505]	71
	72	#used by browse interface
	73	$self->{'doclist'} = [];
	74
[292]	75	$self->{'indexing_text'} = 0;
	76
[4]	77	return bless $self, $class;
	78	}
	79
	80	sub reset {
	81	my $self = shift (@_);
	82
	83	$self->{'num_docs'} = 0;
	84	$self->{'num_sections'} = 0;
[1251]	85	$self->{'num_processed_bytes'} = 0;
[4]	86	$self->{'num_bytes'} = 0;
	87	}
	88
	89	sub get_num_docs {
	90	my $self = shift (@_);
	91
	92	return $self->{'num_docs'};
	93	}
	94
	95	sub get_num_sections {
	96	my $self = shift (@_);
	97
	98	return $self->{'num_sections'};
	99	}
	100
# Returns num_bytes: the actual number of bytes in the collection.
# This is normally the same as what is processed during text compression.
sub get_num_bytes {
    my ($self) = @_;
    return $self->{'num_bytes'};
}
	108
# Returns num_processed_bytes: the number of bytes actually passed
# to mg for the current index.
sub get_num_processed_bytes {
    my ($self) = @_;
    return $self->{'num_processed_bytes'};
}
	116
# Sets the handle that infodb() and text() print their output to.
sub set_output_handle {
    my ($self, $handle) = @_;
    return $self->{'output_handle'} = $handle;
}
	123
	124	sub set_mode {
	125	my $self = shift (@_);
	126	my ($mode) = @_;
	127
	128	$self->{'mode'} = $mode;
	129	}
	130
# Sets the base directory under which assoc_files() links a
# document's associated files.
sub set_assocdir {
    my ($self, $assocdir) = @_;
    return $self->{'assocdir'} = $assocdir;
}
	137
# Sets the hash (by reference) of metadata field names that infodb()
# must not write out: a field is skipped when it has a defined entry.
sub set_dontgdbm {
    my ($self, $dontgdbm) = @_;
    return $self->{'dontgdbm'} = $dontgdbm;
}
	144
# Sets the current index specification ("level:fields", as parsed by
# text()). The optional second argument replaces the list of
# subcollection expressions; when it is omitted or undefined the
# existing list is kept.
sub set_index {
    my ($self, $index, $indexexparr) = @_;

    $self->{'index'} = $index;
    $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
}
	152
# Returns the current index specification string.
sub get_index {
    my ($self) = @_;
    return $self->{'index'};
}
	158
# Sets the classifiers that infodb() hands to classify::classify_doc
# for each document.
sub set_classifiers {
    my ($self, $classifiers) = @_;
    return $self->{'classifiers'} = $classifiers;
}
	165
# Records the 'indexing_text' flag. This module only stores and returns
# the flag; filter_text() overrides may consult it.
sub set_indexing_text {
    my ($self, $indexing_text) = @_;
    return $self->{'indexing_text'} = $indexing_text;
}
	172
# Returns the 'indexing_text' flag.
sub get_indexing_text {
    my ($self) = @_;
    return $self->{'indexing_text'};
}
	178
# Sets the 'store_text' flag. When it is false, text() emits empty
# strings in place of section text and metadata values.
sub set_store_text {
    my ($self, $store_text) = @_;
    return $self->{'store_text'} = $store_text;
}
	185
# Returns the OIDs collected by infodb() as a list (used by the browse
# interface). In scalar context this yields the number of OIDs.
sub get_doc_list {
    my ($self) = @_;

    my $oids = $self->{'doclist'};
    return @$oids;
}
[2336]	191
[2505]	192
# Entry point for a document: forwards all remaining arguments to the
# method named by the current mode (see set_mode) and returns whatever
# that method returns.
sub process {
    my $self = shift;

    my $handler = $self->{'mode'};
    return $self->$handler(@_);
}
	199
# Decide how a document and its children are displayed.
#
# Returns the list ($thistype, $childtype):
#   ("VList", "VList")            - hierarchical document
#   ("Paged", "Paged")            - paged document whose top section has text
#                                   (the top section acts as a header page)
#   ("Invisible", "Paged")        - paged document with an empty top section
#
# A document is paged when either
#   * its top section has gsdlthistype metadata equal to "Paged", or
#   * it has no more than 2 levels and every section after the first has
#     an all-digit Title.
# gsdlthistype equal to "Hierarchy" forces ("VList", "VList"); any other
# gsdlthistype value falls through to the structural test.
sub get_document_type {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    # the paged answer depends only on whether the top section has text
    my $paged_types = sub {
        my $toptype = $doc_obj->get_text_length ($doc_obj->get_top_section())
            ? "Paged" : "Invisible";
        return ($toptype, "Paged");
    };

    my $section = $doc_obj->get_top_section ();

    # explicit request from the plugin takes priority
    my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
    if (defined $gsdlthistype) {
        return $paged_types->() if $gsdlthistype eq "Paged";
        return ("VList", "VList") if $gsdlthistype eq "Hierarchy";
    }

    my $first = 1;
    while (defined $section) {
        # a dotted section name means the document is more than 2 levels deep
        my @levels = split /\./, $section;
        return ("VList", "VList") if scalar(@levels) > 1;

        # every section after the first must be titled with a page number
        if (!$first) {
            my $title = $doc_obj->get_metadata_element ($section, "Title");
            if (!defined $title || $title !~ /^\d+$/) {
                return ("VList", "VList");
            }
        }

        $first = 0;
        $section = $doc_obj->get_next_section($section);
    }

    return $paged_types->();
}
	253
# Hard-link every file associated with $doc_obj into the assoc directory.
#
# Each entry returned by get_assoc_files() is an array reference whose
# element 0 is the existing file and element 1 is the name it should
# have in the collection. If that name starts with a slash (either
# kind) it is placed relative to the assoc dir; otherwise it is placed
# relative to $archivedir (the HASH... directory) inside the assoc dir.
sub assoc_files {
    my ($self, $doc_obj, $archivedir) = @_;

    foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
        my @path_parts = ($self->{'assocdir'});
        push @path_parts, $archivedir unless $assoc_file->[1] =~ m@^[/\\]@;
        push @path_parts, $assoc_file->[1];

        my $afile = &util::filename_cat(@path_parts);
        &util::hard_link ($assoc_file->[0], $afile);
    }
}
	270
# Write the info-database records for one document to the current
# output handle, in the text format read by txt2db (records are
# terminated by a line of 70 hyphens).
#
# Arguments:
#   $doc_obj  - the document object to output
#   $filename - path of the document's archive file; everything before
#               the last path separator becomes the <archivedir> value
#
# Only documents whose doc type is "indexed_doc" are output; anything
# else returns immediately without side effects.
#
# Side effects: links the associated files (assoc_files), appends the
# OID to 'doclist', classifies the document, and updates the num_docs,
# num_sections and num_bytes counters.
#
# For each section two records are written: one keyed by the section's
# OID carrying its metadata, and one keyed by the running section number
# mapping back to the OID. URL metadata additionally produces records
# keyed by the URL value.
sub infodb {
    my $self = shift (@_);
    my ($doc_obj, $filename) = @_;
    my $handle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    my $doctype = $doc_obj->get_doc_type();
    return if ($doctype ne "indexed_doc");
    # from here on $doctype is always "indexed_doc", so no further
    # per-doctype checks are needed when counting documents/sections

    # the archive directory is the filename minus its last component,
    # normalised to forward slashes with no leading/trailing slash
    my $archivedir = "";
    if (defined $filename && $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/) {
        $archivedir = $1;
    }
    $archivedir =~ s/\\/\//g;
    $archivedir =~ s/^\/+//;
    $archivedir =~ s/\/+$//;

    # resolve the final filenames of the files associated with this document
    $self->assoc_files ($doc_obj, $archivedir);

    # add this document to the browse structure
    push(@{$self->{'doclist'}}, $doc_obj->get_OID());

    # classify this document
    &classify::classify_doc ($self->{'classifiers'}, $doc_obj);

    # this is another document
    $self->{'num_docs'} += 1;

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    my $separator = '-' x 70 . "\n";
    my $doc_OID = $doc_obj->get_OID();
    my $first = 1;
    # NOTE(review): $url accumulates across sections and is printed after
    # every section once non-empty, so earlier URL records are repeated.
    # This matches the previous behaviour and is left unchanged.
    my $url = "";

    my $section = $doc_obj->get_top_section ();
    while (defined $section) {
        my $section_OID = ($section eq "") ? $doc_OID : "$doc_OID.$section";
        my $text_length = $doc_obj->get_text_length ($section);

        # update a few statistics
        $self->{'num_bytes'} += $text_length;
        $self->{'num_sections'} += 1;

        # output the section name
        print $handle "[$section_OID]\n";

        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin)
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/) {
            print $handle "<doctype>doc\n";
        }

        # output whether this node contains text
        print $handle ($text_length > 0 ? "<hastxt>1\n" : "<hastxt>0\n");

        # output all the section metadata
        my $metadata = $doc_obj->get_all_metadata ($section);
        foreach my $pair (@$metadata) {
            my ($field, $value) = (@$pair);

            next if $field eq "Identifier" || $field =~ /^gsdl/;
            next unless defined $value && $value ne "";

            # escape problematic stuff
            $value =~ s/\\/\\\\/g;
            $value =~ s/\n/\\n/g;
            $value =~ s/\r/\\r/g;
            if ($value =~ /-{70,}/) {
                # if value contains 70 or more hyphens in a row we need
                # to escape them to prevent txt2db from treating them
                # as a separator
                $value =~ s/-/&\#045;/gi;
            }

            # special case for URL metadata: queue a record keyed by the URL
            if ($field =~ /^URL$/i) {
                $url .= "[$value]\n";
                $url .= "<section>$section_OID\n";
                $url .= $separator;
            }

            if (!defined $self->{'dontgdbm'}->{$field}) {
                print $handle "<$field>$value\n";
            }
        }

        # output archivedir if at top level
        if ($section eq $doc_obj->get_top_section()) {
            print $handle "<archivedir>$archivedir\n";
        }

        # output document display type (first section only)
        if ($first) {
            print $handle "<thistype>$thistype\n";
        }

        # output a list of children, each as ".N" relative to this section
        my $children = $doc_obj->get_children ($section);
        if (scalar(@$children) > 0) {
            print $handle "<childtype>$childtype\n";
            print $handle "<contains>";
            my $firstchild = 1;
            foreach my $child (@$children) {
                print $handle ";" unless $firstchild;
                $firstchild = 0;
                if ($child =~ /^.*?\.(\d+)$/) {
                    print $handle "\".$1";
                } else {
                    print $handle "\".$child";
                }
            }
            print $handle "\n";
        }

        # output the matching document number
        print $handle "<docnum>$self->{'num_sections'}\n";
        print $handle $separator;

        # output a database entry for the document number
        print $handle "[$self->{'num_sections'}]\n";
        print $handle "<section>$section_OID\n";
        print $handle $separator;

        # output entry for url
        if ($url ne "") {
            print $handle $url;
        }

        $first = 0;
        $section = $doc_obj->get_next_section($section);
    }
}
	420
# Mark paragraph starts in the text passed as the second argument by
# inserting a control-C immediately before every "<p" tag opener
# (case-insensitive). The caller's string is modified in place through
# @_; the return value is the substitution's result.
sub find_paragraphs {
    $_[1] =~ s/(?=<p\b)/\cC/gi;
}
	424
# Hook for post-processing a field's text before it is output:
#
#     $self->filter_text ($field, $new_text);
#
# This version deliberately does nothing. A particular collection can
# override the method to post-process certain fields, depending on the
# field or on whether the text is being output for indexing.
sub filter_text {
    return;
}
	432
# Output the text of one document for mg to process.
#
# Arguments:
#   $doc_obj - the document object to output
#
# Only documents whose doc type is "indexed_doc" are handled. The current
# index specification ('index', "level:fields") selects what is output;
# in the field list "all" expands to Title,Creator,text and "topall" to
# topTitle,topCreator,toptext. A field prefixed with "top" is only output
# for the first section of the document.
#
# Output format: each field's text is followed by control-C; multiple
# metadata values within a field are separated by control-C; each section
# is terminated by control-B (for the "document" level all the control-Bs
# are emitted together at the end of the document).
#
# If subcollection expressions ('indexexparr', each "field/regex/options",
# with a leading "!" on the field negating the test) are set and none of
# them accepts the document, the document's text is left out but its
# section terminators are still output, and it is still counted in
# num_docs and num_sections.
sub text {
    my $self = shift (@_);
    my ($doc_obj) = @_;
    my $handle = $self->{'output_handle'};

    # only output this document if it is one to be indexed
    return if ($doc_obj->get_doc_type() ne "indexed_doc");

    # see if this document belongs to this subcollection
    my $indexed_doc = 1;
    foreach my $indexexp (@{$self->{'indexexparr'}}) {
        $indexed_doc = 0;
        my ($field, $exp, $options) = split /\//, $indexexp;
        next unless defined ($field) && defined ($exp);

        # a leading '!' on the field name negates the test
        my $negate = ($field =~ s/^!//) ? 1 : 0;

        my $value;
        if ($field =~ /^filename$/i) {
            $value = $doc_obj->get_source_filename();
        } else {
            $value = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $field);
        }
        next unless defined $value;

        # compile the expression once, choosing case sensitivity from the
        # (possibly absent) options; an empty expression compiles to an
        # always-matching pattern rather than falling under Perl's
        # "empty pattern reuses the last successful match" rule
        my $ignore_case = (defined $options && $options =~ /^i$/i);
        my $pattern = $ignore_case ? qr/$exp/i : qr/$exp/;

        my $matched = ($value =~ $pattern) ? 1 : 0;
        if ($matched != $negate) {
            $indexed_doc = 1;
            last;
        }
    }

    # this is another document
    $self->{'num_docs'} += 1;

    # get the parameters for the output
    my ($level, $fields) = split (/:/, $self->{'index'});
    $level = "" unless defined $level;
    $fields = "" unless defined $fields;
    $fields =~ s/\ball\b/Title,Creator,text/;
    $fields =~ s/\btopall\b/topTitle,topCreator,toptext/;

    my $doc_section = 0; # just for this document
    my $text = "";
    my $text_extra = "";

    # get the text for this document
    my $section = $doc_obj->get_top_section();
    while (defined $section) {
        # update a few statistics
        $doc_section++;
        $self->{'num_sections'} += 1;

        if ($indexed_doc) {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
            foreach my $field (split (/,/, $fields)) {
                # only deal with this field if it doesn't start with top or
                # this is the first section
                my $real_field = $field;
                if (!($real_field =~ s/^top//) || ($doc_section == 1)) {
                    my $new_text = "";
                    if ($level eq "dummy") {
                        # a dummy index is a special case used when no
                        # indexes are specified (since there must always be
                        # at least one index or we can't retrieve the
                        # compressed text) - we add a small amount of text
                        # to these dummy indexes which will never be seen
                        # but will overcome mg's problems with building
                        # empty indexes
                        $new_text = "this is dummy text to stop mg barfing";
                        $self->{'num_processed_bytes'} += length ($new_text);

                    } elsif ($real_field eq "text") {
                        $new_text = $doc_obj->get_text ($section) if $self->{'store_text'};
                        $self->{'num_processed_bytes'} += length ($new_text);
                        # strip our own separator characters from the content
                        $new_text =~ s/[\cB\cC]//g;
                        $self->find_paragraphs($new_text);

                    } else {
                        my $first = 1;
                        foreach my $meta (@{$doc_obj->get_metadata ($section, $real_field)}) {
                            $meta =~ s/[\cB\cC]//g;
                            $self->{'num_processed_bytes'} += length ($meta);
                            $new_text .= "\cC" unless $first;
                            $new_text .= $meta if $self->{'store_text'};
                            $first = 0;
                        }
                    }

                    # filter the text
                    $self->filter_text ($field, $new_text);

                    $text .= "$new_text\cC";
                }
            }
        }

        if ($level eq "document") { $text_extra .= "\cB"; }
        else { $text .= "\cB"; }

        $section = $doc_obj->get_next_section($section);
    }

    print $handle "$text$text_extra";
}
	542
	543	1;
	544

Note: See TracBrowser for help on using the repository browser.

Download in other formats: