Context Navigation

source: gsdl/trunk/perllib/basebuildproc.pm@ 20008

Last change on this file since 20008 was 18508, checked in by davidb, 15 years ago
Had to move location of where deletion of archive files was done
Property svn:keywords set to `Author Date Id Revision`
File size: 21.7 KB

Rev	Line
[18508]	1	##########################################################################
[9919]	2	#
	3	# basebuildproc.pm --
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
	26	# This document processor outputs a document for indexing (should be
[15688]	27	# implemented by subclass) and storing in the database
[9919]	28
	29	package basebuildproc;
	30
	31	eval {require bytes};
	32
	33	use classify;
[15699]	34	use dbutil;
[9919]	35	use doc;
	36	use docproc;
[18456]	37	use strict;
	38	no strict 'subs';
	39	no strict 'refs';
[9919]	40	use util;
	41
# Set up inheritance at compile time: basebuildproc derives from docproc.
BEGIN {
    @basebuildproc::ISA = ('docproc');
}
	45
# Construct a build processor.
#
# Parameters:
#   $class      - class to bless the new object into
#   $collection - stored as 'collection'
#   $source_dir - stored as 'source_dir'
#   $build_dir  - stored as 'build_dir'; also the initial 'assocdir'
#   $keepold    - when true, the document/section/byte counters are seeded
#                 from an existing build.cfg (incremental building)
#   $verbosity  - stored as 'verbosity'
#   $outhandle  - where debugging info goes; defaults to STDERR
#
# Returns the new object blessed into $class.
#
# Note: no prototype is declared.  Prototypes are ignored for method calls,
# and an empty one would make a direct function call with arguments fail.
sub new
{
    my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;
    my $self = docproc->new();

    # outhandle is where all the debugging info goes
    # output_handle is where the output of the plugins is piped
    # to (i.e. mg, database etc.)
    $outhandle = STDERR unless defined $outhandle;

    $self->{'collection'} = $collection;
    $self->{'source_dir'} = $source_dir;
    $self->{'build_dir'}  = $build_dir;
    $self->{'keepold'}    = $keepold;
    $self->{'verbosity'}  = $verbosity;
    $self->{'outhandle'}  = $outhandle;

    $self->{'classifiers'} = [];
    $self->{'mode'} = "text";
    $self->{'assocdir'} = $build_dir;
    $self->{'dontdb'} = {};
    $self->{'store_metadata_coverage'} = "false";

    $self->{'index'} = "section:text";
    $self->{'indexexparr'} = [];

    $self->{'separate_cjk'} = 0;

    my $found_num_data = 0;
    my $buildconfigfile = undef;

    if ($keepold) {
        # For incremental building need to seed num_docs etc from values
        # stored in build.cfg (if present)
        $buildconfigfile = &util::filename_cat($build_dir, "build.cfg");
        if (-e $buildconfigfile) {
            $found_num_data = 1;
        }
        else {
            # try the index dir
            $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
                                                   "index", "build.cfg");
            if (-e $buildconfigfile) {
                $found_num_data = 1;
            }
        }
    }

    if ($found_num_data)
    {
        my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
        $self->{'starting_num_docs'}     = $buildcfg->{'numdocs'};
        $self->{'starting_num_sections'} = $buildcfg->{'numsections'};
        $self->{'starting_num_bytes'}    = $buildcfg->{'numbytes'};
    }
    else
    {
        $self->{'starting_num_docs'}     = 0;
        $self->{'starting_num_sections'} = 0;
        $self->{'starting_num_bytes'}    = 0;
    }

    $self->{'output_handle'} = "STDOUT";
    $self->{'num_docs'}     = $self->{'starting_num_docs'};
    $self->{'num_sections'} = $self->{'starting_num_sections'};
    $self->{'num_bytes'}    = $self->{'starting_num_bytes'};

    $self->{'num_processed_bytes'} = 0;
    $self->{'store_text'} = 1;

    # what level (section/document) the database - indexer intersection is
    $self->{'db_level'} = "section";
    # used by browse interface
    $self->{'doclist'} = [];

    $self->{'indexing_text'} = 0;

    return bless $self, $class;
}
	132
	133	sub reset {
	134	my $self = shift (@_);
[10159]	135
	136	$self->{'num_docs'} = $self->{'starting_num_docs'};
	137	$self->{'num_sections'} = $self->{'starting_num_sections'};
	138	$self->{'num_bytes'} = $self->{'starting_num_bytes'};
[9919]	139
	140	$self->{'num_processed_bytes'} = 0;
	141	}
	142
# Zero the document and section counters and the processed-bytes tally.
# The byte count is restored to its starting value instead of zeroed:
# reconstructed docs have no text, just metadata, so we need to remember
# how many bytes we had initially.
sub zero_reset {
    my ($self) = @_;

    $self->{$_} = 0 for ('num_docs', 'num_sections');
    $self->{'num_bytes'} = $self->{'starting_num_bytes'};

    $self->{'num_processed_bytes'} = 0;
}
	154
# Reports whether this build processor supports incremental building.
# The base class answers 'no' (0): it is safer to assume non-incremental
# and let inheriting classes that are capable override this.
sub is_incremental_capable
{
    return 0;
}
	163
# Returns the current document count ('num_docs').
sub get_num_docs {
    my ($self) = @_;
    return $self->{'num_docs'};
}
	169
	170	sub get_num_sections {
	171	my $self = shift (@_);
	172
	173	return $self->{'num_sections'};
	174	}
	175
	176	# num_bytes is the actual number of bytes in the collection
	177	# this is normally the same as what's processed during text compression
	178	sub get_num_bytes {
	179	my $self = shift (@_);
	180
	181	return $self->{'num_bytes'};
	182	}
	183
	184	# num_processed_bytes is the number of bytes actually passed
	185	# to mg for the current index
	186	sub get_num_processed_bytes {
	187	my $self = shift (@_);
	188
	189	return $self->{'num_processed_bytes'};
	190	}
	191
	192	sub set_output_handle {
	193	my $self = shift (@_);
	194	my ($handle) = @_;
	195
	196	$self->{'output_handle'} = $handle;
	197	}
	198
	199
	200	sub set_mode {
	201	my $self = shift (@_);
	202	my ($mode) = @_;
	203
	204	$self->{'mode'} = $mode;
	205	}
	206
# Returns the current 'mode'.
sub get_mode {
    my ($self) = @_;
    return $self->{'mode'};
}
	212
# Sets 'assocdir', the base directory used by assoc_files().
sub set_assocdir {
    my ($self, $assocdir) = @_;
    $self->{'assocdir'} = $assocdir;
}
	219
# Sets 'dontdb': a hash ref keyed by metadata field name; fields with a
# defined value in it are left out of the database entries.
sub set_dontdb {
    my ($self, $dontdb) = @_;
    $self->{'dontdb'} = $dontdb;
}
	226
# Sets 'infodbtype', which is passed to the dbutil write/delete calls.
sub set_infodbtype
{
    my ($self, $infodbtype) = @_;
    $self->{'infodbtype'} = $infodbtype;
}
	233
# Sets 'index' and, only when one is supplied, the subcollection
# expression array 'indexexparr' (an omitted array leaves it as it was).
sub set_index {
    my ($self, $index, $indexexparr) = @_;

    $self->{'index'} = $index;
    $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
}
	241
	242	sub set_index_languages {
	243	my $self = shift (@_);
	244	my ($lang_meta, $langarr) = @_;
	245	$self->{'lang_meta'} = $lang_meta;
	246	$self->{'langarr'} = $langarr;
	247	}
	248
	249	sub get_index {
	250	my $self = shift (@_);
	251
	252	return $self->{'index'};
	253	}
	254
	255	sub set_classifiers {
	256	my $self = shift (@_);
	257	my ($classifiers) = @_;
	258
	259	$self->{'classifiers'} = $classifiers;
	260	}
	261
	262	sub set_indexing_text {
	263	my $self = shift (@_);
	264	my ($indexing_text) = @_;
	265
	266	$self->{'indexing_text'} = $indexing_text;
	267	}
	268
	269	sub get_indexing_text {
	270	my $self = shift (@_);
	271
	272	return $self->{'indexing_text'};
	273	}
	274
	275	sub set_store_text {
	276	my $self = shift (@_);
	277	my ($store_text) = @_;
	278
	279	$self->{'store_text'} = $store_text;
	280	}
[16222]	281
	282	sub set_store_metadata_coverage {
	283	my $self = shift (@_);
	284	my ($store_metadata_coverage) = @_;
	285
	286	$self->{'store_metadata_coverage'} = $store_metadata_coverage \|\| "";
	287	}
	288
# Returns the OIDs collected in 'doclist' as a list (not a reference).
sub get_doc_list {
    my ($self) = @_;

    my $doclist = $self->{'doclist'};
    return @$doclist;
}
	294
# Sets 'db_level'.  The standard database level is section, but you may
# want to change it to document.
sub set_db_level {
    my ($self, $db_level) = @_;
    $self->{'db_level'} = $db_level;
}
	302
# Sets 'sections_index_document_metadata' to the given index type.
sub set_sections_index_document_metadata {
    my ($self, $index_type) = @_;
    $self->{'sections_index_document_metadata'} = $index_type;
}
[17110]	309
	310	sub set_separate_cjk {
	311	my $self = shift (@_);
	312	my ($sep_cjk) = @_;
	313
	314	$self->{'separate_cjk'} = $sep_cjk;
	315	}
	316
# Dispatches to the method named by the current 'mode', forwarding the
# remaining arguments unchanged.
sub process {
    my $self = shift;

    my $handler = $self->{'mode'};
    return $self->$handler(@_);
}
	323
# Post-process text depending on field.  The only processing currently
# done is CJK segmentation, applied when 'separate_cjk' is set; $field is
# accepted but not consulted.
#
# Ideally this would only be done for indexed text (i.e. when
# $self->{'indexing_text'} is set), but search term highlighting doesn't
# currently work if you do that.  Once that is fixed, fix this.
sub filter_text {
    my ($self, $field, $text) = @_;

    return $text unless ($self->{'separate_cjk'});

    # do the cjk segmentation here
    my $segmented = &cnseg::segment($text);
    return $segmented;
}
[14934]	340
[17110]	341
# Keep statistics on the metadata sets in use and how often each field
# occurs within a set.  Two tallies are maintained: 'doc_mdprefix_fields'
# (scoped to the current document) and 'mdprefix_fields' (across the
# whole collection).
#
# A field written "prefix.name" is counted under that prefix; a field
# starting with an upper-case letter is counted under the implicit 'ex'
# set; any other field is ignored.  An $edit_mode of "add" or "update"
# increments the counts, anything else (delete) decrements them.
# The return value is not meaningful.
sub infodb_metadata_stats
{
    my ($self, $field, $edit_mode) = @_;

    my ($set_name, $field_name);
    if ($field =~ m/^(.+)\.(.*)$/) {
        ($set_name, $field_name) = ($1, $2);
    }
    elsif ($field =~ m/^[[:upper:]]/) {
        # implicit 'ex' metadata set
        ($set_name, $field_name) = ('ex', $field);
    }
    else {
        return;
    }

    my $counting_up = (($edit_mode eq "add") || ($edit_mode eq "update"));

    foreach my $tally ('doc_mdprefix_fields', 'mdprefix_fields') {
        if ($counting_up) {
            $self->{$tally}->{$set_name}->{$field_name}++;
        }
        else {
            # delete
            $self->{$tally}->{$set_name}->{$field_name}--;
        }
    }
}
	384
	385
# Add, update or delete the database entries for one document.
#
# Parameters:
#   $doc_obj   - the document object
#   $filename  - the file the document was derived from, or undef when the
#                doc_obj was reconstructed from the database (has metadata
#                and doc structure but no text)
#   $edit_mode - "add" or "update" write entries; any other value deletes
#
# Only "indexed_doc" and "info_doc" (database only) documents are handled;
# any other doc type returns immediately.  For the accepted documents the
# browse list ('doclist'), the classifiers and the num_docs / num_sections /
# num_bytes counters are updated, and one database entry per section is
# written or deleted through dbutil (one entry only when 'db_level' is
# "document").
sub infodbedit {
    my ($self, $doc_obj, $filename, $edit_mode) = @_;

    # only output this document if it is a "indexed_doc" or "info_doc"
    # (database only) document
    my $doctype = $doc_obj->get_doc_type();
    return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");

    # "add" and "update" are handled identically; everything else deletes
    my $writing = (($edit_mode eq "add") || ($edit_mode eq "update"));

    my $archivedir = "";
    if (defined $filename)
    {
        # doc_obj derived directly from file: the archive directory is
        # everything before the last path separator ('/' or '\')
        my ($dir) = $filename =~ m{^(.*?)(?:/|\\)[^/\\]*$};
        $dir = "" unless defined $dir;
        $dir =~ s/\\/\//g;   # normalise to forward slashes
        $dir =~ s/^\/+//;    # strip leading separators
        $dir =~ s/\/+$//;    # strip trailing separators

        $archivedir = $dir;

        # resolve the final filenames of the files associated with this document
        $self->assoc_files ($doc_obj, $archivedir);
    }
    else
    {
        # doc_obj reconstructed from database (has metadata, doc structure but no text)
        my $top_section = $doc_obj->get_top_section();
        $archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
    }

    if ($writing) {
        # add this document to the browse structure
        push(@{$self->{'doclist'}}, $doc_obj->get_OID());
    }
    else {
        # delete => remove this doc from browse structure
        my $del_doc_oid = $doc_obj->get_OID();
        my @filtered_doc_list = grep { $_ ne $del_doc_oid } @{$self->{'doclist'}};
        $self->{'doclist'} = \@filtered_doc_list;
    }

    # classify this document
    &classify::classify_doc ($self->{'classifiers'}, $doc_obj, $edit_mode);

    # this is another document (or one fewer, when deleting)
    $self->{'num_docs'} += ($writing ? 1 : -1);

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    my $section = $doc_obj->get_top_section ();
    my $doc_OID = $doc_obj->get_OID();
    my $first = 1;
    my $infodb_handle = $self->{'output_handle'};

    # per-document metadata statistics start afresh for every document
    $self->{'doc_mdprefix_fields'} = {};

    while (defined $section)
    {
        my $section_OID = $doc_OID;
        if ($section ne "")
        {
            $section_OID = $doc_OID . "." . $section;
        }
        my %section_infodb = ();

        # update a few statistics
        if ($writing) {
            $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
            $self->{'num_sections'} += 1;
        }
        else {
            # delete
            $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
            $self->{'num_sections'} -= 1;
        }

        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/) {
            $section_infodb{"doctype"} = [ "doc" ];
        }

        # Output whether this node contains text
        #
        # If doc_obj reconstructed from database file then no need to
        # explicitly add <hastxt> as this is preserved as metadata when
        # the database file is loaded in
        if (defined $filename)
        {
            # doc_obj derived directly from file
            if ($doc_obj->get_text_length($section) > 0) {
                $section_infodb{"hastxt"} = [ "1" ];
            } else {
                $section_infodb{"hastxt"} = [ "0" ];
            }
        }

        # output all the section metadata
        my $metadata = $doc_obj->get_all_metadata ($section);
        foreach my $pair (@$metadata) {
            my ($field, $value) = (@$pair);

            if ($field ne "Identifier" && $field !~ /^gsdl/ &&
                defined $value && $value ne "") {

                # escape problematic stuff
                $value =~ s/\\/\\\\/g;
                $value =~ s/\n/\\n/g;
                $value =~ s/\r/\\r/g;

                # special case for URL metadata: keep an entry keyed by the
                # URL itself that points back at this section
                if ($field =~ /^URL$/i) {
                    if ($writing) {
                        &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value, { 'section' => [ $section_OID ] });
                    }
                    else {
                        # delete
                        &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value);
                    }
                }

                if (!defined $self->{'dontdb'}->{$field}) {
                    push(@{$section_infodb{$field}}, $value);

                    if ($section eq "" && $self->{'store_metadata_coverage'} =~ /^true$/i)
                    {
                        $self->infodb_metadata_stats($field,$edit_mode);
                    }
                }
            }
        }

        if ($section eq "")
        {
            my $doc_mdprefix_fields = $self->{'doc_mdprefix_fields'};

            foreach my $prefix (keys %$doc_mdprefix_fields)
            {
                push(@{$section_infodb{"metadataset"}}, $prefix);

                foreach my $field (keys %{$doc_mdprefix_fields->{$prefix}})
                {
                    push(@{$section_infodb{"metadatalist-$prefix"}}, $field);

                    my $val = $doc_mdprefix_fields->{$prefix}->{$field};
                    push(@{$section_infodb{"metadatafreq-$prefix-$field"}}, $val);
                }
            }
        }

        # If doc_obj reconstructed from database file then no need to
        # explicitly add <archivedir> as this is preserved as metadata when
        # the database file is loaded in
        if (defined $filename)
        {
            # output archivedir if at top level
            if ($section eq $doc_obj->get_top_section()) {
                $section_infodb{"archivedir"} = [ $archivedir ];
            }
        }

        # output document display type
        if ($first) {
            $section_infodb{"thistype"} = [ $thistype ];
        }

        if ($self->{'db_level'} eq "document") {
            # doc num is num_docs not num_sections
            # output the matching document number
            $section_infodb{"docnum"} = [ $self->{'num_docs'} ];
        }
        else {
            # output a list of children
            my $children = $doc_obj->get_children ($section);
            if (scalar(@$children) > 0) {
                $section_infodb{"childtype"} = [ $childtype ];
                my $contains = "";
                foreach my $child (@$children)
                {
                    $contains .= ";" unless ($contains eq "");
                    if ($child =~ /^.*?\.(\d+)$/)
                    {
                        $contains .= "\".$1";
                    }
                    else
                    {
                        $contains .= "\".$child";
                    }
                }
                $section_infodb{"contains"} = [ $contains ];
            }
            # output the matching doc number
            $section_infodb{"docnum"} = [ $self->{'num_sections'} ];
        }

        if ($writing) {
            &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID, \%section_infodb);
        }
        else {
            # delete
            &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID);
        }

        # output a database entry for the document number, except for Lucene
        # (which no longer needs this information)
        unless (ref($self) eq "lucenebuildproc")
        {
            if ($writing) {
                if ($self->{'db_level'} eq "document") {
                    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'}, { 'section' => [ $doc_OID ] });
                }
                else {
                    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'}, { 'section' => [ $section_OID ] });
                }
            }
            else {
                if ($self->{'db_level'} eq "document") {
                    &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'});
                }
                else {
                    &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'});
                }
            }
        }

        $first = 0;
        $section = $doc_obj->get_next_section($section);
        last if ($self->{'db_level'} eq "document"); # if no sections wanted, only add the docs
    }
}
[9919]	638
[15696]	639
[18456]	640
	641
	642	sub infodb {
	643	my $self = shift (@_);
	644	my ($doc_obj, $filename) = @_;
	645
	646	$self->infodbedit($doc_obj,$filename,"add");
	647	}
	648
	649	sub infodbreindex {
	650	my $self = shift (@_);
	651	my ($doc_obj, $filename) = @_;
	652
[18471]	653	$self->infodbedit($doc_obj,$filename,"update");
[18456]	654	}
	655
	656	sub infodbdelete {
	657	my $self = shift (@_);
	658	my ($doc_obj, $filename) = @_;
	659
	660	$self->infodbedit($doc_obj,$filename,"delete");
	661	}
	662
	663
# Placeholder for text processing: sub classes must override this.
# Reports the problem on 'outhandle' and then dies.
sub text {
    my ($self, $doc_obj) = @_;

    my $out = $self->{'outhandle'};
    print {$out} "basebuildproc::text function must be implemented in sub classes\n";
    die "\n";
}
	672
# Placeholder for re-indexing text: sub classes must override this.
# Reports the problem on 'outhandle' (with an extra hint when the
# processor is not incremental capable) and then dies.
sub textreindex
{
    my ($self) = @_;

    my $out = $self->{'outhandle'};
    print {$out} "basebuildproc::textreindex function must be implemented in sub classes\n";
    unless ($self->is_incremental_capable()) {
        print {$out} " This operation is only possible with indexing tools with that support\n";
        print {$out} " incremental building\n";
    }
    die "\n";
}
	686
	687	sub textdelete
	688	{
	689	my $self = shift @_;
	690
	691	my $outhandle = $self->{'outhandle'};
	692	print $outhandle "basebuildproc::textdelete function must be implemented in sub classes\n";
	693	if (!$self->is_incremental_capable()) {
	694
	695	print $outhandle " This operation is only possible with indexing tools with that support\n";
	696	print $outhandle " incremental building\n";
	697	}
	698	die "\n";
	699	}
	700
	701
# Should the document be indexed - according to the subcollection and
# language specification.
#
# Each entry of 'indexexparr' has the form "field/expression/options".
# A leading '!' on the field negates the test, the field name "filename"
# (any case) tests the document's source filename instead of metadata,
# and an options value of "i" makes the match case-insensitive.  The
# document passes the subcollection test as soon as one entry is
# satisfied; with no entries at all it passes by default.
#
# If the document passes and 'lang_meta' is defined, the value of that
# metadata field on the top section must additionally satisfy at least
# one pattern in 'langarr' (a leading '!' negates the pattern).
#
# Returns 1 if the document should be indexed, 0 otherwise.  Neither
# 'indexexparr' nor 'langarr' is modified.
sub is_subcollection_doc {
    my ($self, $doc_obj) = @_;

    my $indexed_doc = 1;
    foreach my $indexexp (@{$self->{'indexexparr'}}) {
        $indexed_doc = 0;
        my ($field, $exp, $options) = split /\//, $indexexp;
        next unless (defined ($field) && defined ($exp));

        # $field is a private copy produced by split, so stripping the
        # negation marker here leaves the stored expression intact
        my $negate = ($field =~ s/^!//) ? 1 : 0;
        # the options part is optional and may be absent
        my $ignore_case = (defined $options && $options =~ /^i$/i) ? 1 : 0;

        my @metadata_values;
        if ($field =~ /^filename$/i) {
            push(@metadata_values, $doc_obj->get_source_filename());
        }
        else {
            @metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
        }
        next unless @metadata_values;

        foreach my $metadata_value (@metadata_values) {
            my $matched = $ignore_case ? ($metadata_value =~ /$exp/i)
                                       : ($metadata_value =~ /$exp/);
            if ($negate ? !$matched : $matched) {
                $indexed_doc = 1;
                last;
            }
        }

        last if ($indexed_doc == 1);
    }

    # if this doc is so far in the sub collection, and we have lang info,
    # now we check the languages to see if it matches
    if ($indexed_doc && defined $self->{'lang_meta'}) {
        $indexed_doc = 0;
        my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
        if (defined $field) {
            foreach my $lang_spec (@{$self->{'langarr'}}) {
                # Work on a copy: the loop variable aliases the stored array
                # element, and stripping '!' in place would silently turn a
                # negated language into a positive one for later documents.
                my $lang = $lang_spec;
                if ($lang =~ s/^!//) {
                    if ($field !~ /$lang/) {
                        $indexed_doc = 1; last;
                    }
                } else {
                    if ($field =~ /$lang/) {
                        $indexed_doc = 1; last;
                    }
                }
            }
        }
    }
    return $indexed_doc;
}
	767
	768	# use 'Paged' if document has no more than 2 levels
	769	# and each section at second level has a number for
	770	# Title metadata
	771	# also use Paged if gsdlthistype metadata is set to Paged
	772	sub get_document_type {
	773	my $self = shift (@_);
	774	my ($doc_obj) = @_;
	775
	776	my $thistype = "VList";
	777	my $childtype = "VList";
	778	my $title;
	779	my @tmp = ();
	780
	781	my $section = $doc_obj->get_top_section ();
	782
	783	my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
	784	if (defined $gsdlthistype) {
	785	if ($gsdlthistype eq "Paged") {
	786	$childtype = "Paged";
	787	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
	788	$thistype = "Paged";
	789	} else {
	790	$thistype = "Invisible";
	791	}
	792
	793	return ($thistype, $childtype);
	794	} elsif ($gsdlthistype eq "Hierarchy") {
	795	return ($thistype, $childtype); # use VList, VList
	796	}
	797	}
	798	my $first = 1;
	799	while (defined $section) {
	800	@tmp = split /\./, $section;
	801	if (scalar(@tmp) > 1) {
	802	return ($thistype, $childtype);
	803	}
	804	if (!$first) {
	805	$title = $doc_obj->get_metadata_element ($section, "Title");
	806	if (!defined $title \|\| $title !~ /^\d+$/) {
	807	return ($thistype, $childtype);
	808	}
	809	}
	810	$first = 0;
	811	$section = $doc_obj->get_next_section($section);
	812	}
	813	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
	814	$thistype = "Paged";
	815	} else {
	816	$thistype = "Invisible";
	817	}
	818	$childtype = "Paged";
	819	return ($thistype, $childtype);
	820	}
	821
# Hard-link each file associated with the document into the assoc
# directory.
#
# Each entry from get_assoc_files() is used as [source file, destination
# name].  If the destination name starts with a slash (either kind) it is
# placed relative to 'assocdir'; otherwise it is placed relative to
# $archivedir (the HASH... directory) inside 'assocdir'.
sub assoc_files
{
    my ($self, $doc_obj, $archivedir) = @_;

    foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
        my ($source_path, $assoc_name) = @{$assoc_file}[0, 1];

        my @dest_parts = ($self->{'assocdir'});
        push(@dest_parts, $archivedir) unless ($assoc_name =~ m@^[/\\]@);

        my $afile = &util::filename_cat(@dest_parts, $assoc_name);
        &util::hard_link ($source_path, $afile, $self->{'verbosity'});
    }
}
	840

Note: See TracBrowser for help on using the repository browser.

Download in other formats: