Context Navigation

source: main/trunk/greenstone2/perllib/basebuildproc.pm@ 23452

Last change on this file since 23452 was 23387, checked in by davidb, 14 years ago
Further changes to deal with documents that use different filename encodings on the file-system. Now sets UTF8URL metadata to perform the cross-document look up. Files stored in doc.pm as associated files are now always raw filenames (rather than potentially UTF-8 encoded). Filenames seen by HTMLPlug when scanning for files to block on are now stored as Unicode-aware strings rather than as UTF-8 encoded but Unicode-unaware strings.
Property svn:keywords set to `Author Date Id Revision`
File size: 22.3 KB

Rev	Line
[18508]	1	##########################################################################
[9919]	2	#
	3	# basebuildproc.pm --
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
	26	# This document processor outputs a document for indexing (should be
[15688]	27	# implemented by subclass) and storing in the database
[9919]	28
	29	package basebuildproc;
	30
	31	eval {require bytes};
	32
	33	use classify;
[15699]	34	use dbutil;
[9919]	35	use doc;
	36	use docproc;
[18456]	37	use strict;
	38	no strict 'subs';
	39	no strict 'refs';
[9919]	40	use util;
	41
	42	BEGIN {
	43	@basebuildproc::ISA = ('docproc');
	44	}
	45
# Constructor.
#
# Positional arguments: $collection, $source_dir, $build_dir, $keepold,
# $verbosity, $outhandle (defaults to STDERR when undefined).
#
# Builds on a docproc object, fills in the default settings and, when
# $keepold is true, seeds the document/section/byte counters from an
# existing build.cfg (looked for in $build_dir first, then in
# $GSDLCOLLECTDIR/index).  Returns the object blessed into $class.
sub new
{
    my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;

    # explicit class-method call rather than indirect object syntax, whose
    # parsing depends on what names happen to be known at compile time
    my $self = docproc->new();

    # outhandle is where all the debugging info goes
    # output_handle is where the output of the plugins is piped
    # to (i.e. mg, database etc.)
    $outhandle = "STDERR" unless defined $outhandle;

    $self->{'collection'} = $collection;
    $self->{'source_dir'} = $source_dir;
    $self->{'build_dir'} = $build_dir;
    $self->{'keepold'} = $keepold;
    $self->{'verbosity'} = $verbosity;
    $self->{'outhandle'} = $outhandle;

    $self->{'classifiers'} = [];
    $self->{'mode'} = "text";
    $self->{'assocdir'} = $build_dir;
    $self->{'dontdb'} = {};
    $self->{'store_metadata_coverage'} = "false";

    $self->{'index'} = "section:text";
    $self->{'indexexparr'} = [];

    $self->{'separate_cjk'} = 0;

    my $found_num_data = 0;
    my $buildconfigfile = undef;

    if ($keepold) {
        # For incremental building need to seed num_docs etc from values
        # stored in build.cfg (if present)
        $buildconfigfile = &util::filename_cat($build_dir, "build.cfg");
        if (-e $buildconfigfile) {
            $found_num_data = 1;
        }
        else {
            # try the index dir
            $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
                                                   "index", "build.cfg");
            if (-e $buildconfigfile) {
                $found_num_data = 1;
            }
        }
    }

    # start from zero; overwritten below by whatever build.cfg supplies
    $self->{'starting_num_docs'} = 0;
    $self->{'starting_num_sections'} = 0;
    $self->{'starting_num_bytes'} = 0;

    if ($found_num_data)
    {
        # colcfg is not among this file's "use" lines, so load it on demand
        # unless the caller has already done so
        require colcfg unless defined &colcfg::read_build_cfg;

        my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
        if (defined $buildcfg) {
            foreach my $stat ('docs', 'sections', 'bytes') {
                my $stored = $buildcfg->{"num$stat"};
                # a build.cfg without this key keeps the zero default
                $self->{"starting_num_$stat"} = $stored if defined $stored;
            }
        }
    }

    $self->{'output_handle'} = "STDOUT";
    $self->{'num_docs'} = $self->{'starting_num_docs'};
    $self->{'num_sections'} = $self->{'starting_num_sections'};
    $self->{'num_bytes'} = $self->{'starting_num_bytes'};

    $self->{'num_processed_bytes'} = 0;
    $self->{'store_text'} = 1;

    # what level (section/document) the database - indexer intersection is
    $self->{'db_level'} = "section";
    # used by browse interface
    $self->{'doclist'} = [];

    $self->{'indexing_text'} = 0;

    return bless $self, $class;
}
	132
	133	sub reset {
	134	my $self = shift (@_);
[10159]	135
	136	$self->{'num_docs'} = $self->{'starting_num_docs'};
	137	$self->{'num_sections'} = $self->{'starting_num_sections'};
	138	$self->{'num_bytes'} = $self->{'starting_num_bytes'};
[9919]	139
	140	$self->{'num_processed_bytes'} = 0;
	141	}
	142
# Zero every running counter, ignoring the "starting_*" values.
sub zero_reset {
    my ($self) = @_;

    # reconstructed docs have no text, just metadata; num_bytes is stored in
    # the db for reconstructed docs, so it starts from 0 here rather than
    # from starting_num_bytes
    foreach my $counter ('num_docs', 'num_sections', 'num_bytes', 'num_processed_bytes') {
        $self->{$counter} = 0;
    }
}
	154
# Report whether this build processor supports incremental building.
# The base class answers 'no' (0): it is safer to assume non-incremental
# and let inherited classes that are capable override this.
sub is_incremental_capable
{
    my $incremental = 0;
    return $incremental;
}
	163
# Return the current 'num_docs' counter.
sub get_num_docs {
    my ($self) = @_;
    return $self->{'num_docs'};
}
	169
	170	sub get_num_sections {
	171	my $self = shift (@_);
	172
	173	return $self->{'num_sections'};
	174	}
	175
	176	# num_bytes is the actual number of bytes in the collection
	177	# this is normally the same as what's processed during text compression
	178	sub get_num_bytes {
	179	my $self = shift (@_);
	180
	181	return $self->{'num_bytes'};
	182	}
	183
	184	# num_processed_bytes is the number of bytes actually passed
	185	# to mg for the current index
	186	sub get_num_processed_bytes {
	187	my $self = shift (@_);
	188
	189	return $self->{'num_processed_bytes'};
	190	}
	191
	192	sub set_output_handle {
	193	my $self = shift (@_);
	194	my ($handle) = @_;
	195
	196	$self->{'output_handle'} = $handle;
[22843]	197	binmode($handle,":utf8");
[9919]	198	}
	199
	200
	201	sub set_mode {
	202	my $self = shift (@_);
	203	my ($mode) = @_;
	204
	205	$self->{'mode'} = $mode;
	206	}
	207
# Return the current processing mode.
sub get_mode {
    my ($self) = @_;
    return $self->{'mode'};
}
	213
# Set the directory under which associated files are placed.
sub set_assocdir {
    my ($self, $assocdir) = @_;
    $self->{'assocdir'} = $assocdir;
}
	220
# Set the hash of metadata field names that must not be written to the
# info database (infodbedit skips any field that is a defined key here).
sub set_dontdb {
    my ($self, $dontdb) = @_;
    $self->{'dontdb'} = $dontdb;
}
	227
# Set the info database type that is passed on to
# dbutil::write_infodb_entry.
sub set_infodbtype
{
    my ($self, $infodbtype) = @_;
    $self->{'infodbtype'} = $infodbtype;
}
	234
# Set the current index and, when $indexexparr is defined, the list of
# subcollection expressions; an undefined $indexexparr leaves the existing
# list untouched.
sub set_index {
    my ($self, $index, $indexexparr) = @_;

    $self->{'index'} = $index;
    if (defined $indexexparr) {
        $self->{'indexexparr'} = $indexexparr;
    }
}
	242
	243	sub set_index_languages {
	244	my $self = shift (@_);
	245	my ($lang_meta, $langarr) = @_;
[20420]	246	$lang_meta =~ s/^ex\.//; # strip ex. if there
[9919]	247	$self->{'lang_meta'} = $lang_meta;
	248	$self->{'langarr'} = $langarr;
	249	}
	250
	251	sub get_index {
	252	my $self = shift (@_);
	253
	254	return $self->{'index'};
	255	}
	256
	257	sub set_classifiers {
	258	my $self = shift (@_);
	259	my ($classifiers) = @_;
	260
	261	$self->{'classifiers'} = $classifiers;
	262	}
	263
	264	sub set_indexing_text {
	265	my $self = shift (@_);
	266	my ($indexing_text) = @_;
	267
	268	$self->{'indexing_text'} = $indexing_text;
	269	}
	270
	271	sub get_indexing_text {
	272	my $self = shift (@_);
	273
	274	return $self->{'indexing_text'};
	275	}
	276
	277	sub set_store_text {
	278	my $self = shift (@_);
	279	my ($store_text) = @_;
	280
	281	$self->{'store_text'} = $store_text;
	282	}
[16222]	283
	284	sub set_store_metadata_coverage {
	285	my $self = shift (@_);
	286	my ($store_metadata_coverage) = @_;
	287
	288	$self->{'store_metadata_coverage'} = $store_metadata_coverage \|\| "";
	289	}
	290
# Return the OIDs collected in 'doclist' as a list (in scalar context,
# the number of them).
sub get_doc_list {
    my ($self) = @_;
    my $doclist = $self->{'doclist'};
    return @$doclist;
}
	296
# the standard database level is section, but you may want to change it to document
sub set_db_level {
    my ($self, $db_level) = @_;
    $self->{'db_level'} = $db_level;
}
	304
# Store the 'sections_index_document_metadata' setting.
sub set_sections_index_document_metadata {
    my ($self, $index_type) = @_;
    $self->{'sections_index_document_metadata'} = $index_type;
}
[17110]	311
	312	sub set_separate_cjk {
	313	my $self = shift (@_);
	314	my ($sep_cjk) = @_;
	315
	316	$self->{'separate_cjk'} = $sep_cjk;
	317	}
	318
# Dispatch to the method named by the current mode, handing over every
# remaining argument unchanged.
sub process {
    my $self = shift;

    my $handler = $self->{'mode'};
    return $self->$handler(@_);
}
	325
# post process text depending on field. Currently doesn't do anything
# except cjk separation.
# This should only be done for indexed text (if $self->{'indexing_text'}),
# but currently search term highlighting doesn't work if you do that.
# Once that's fixed up, then fix this.
#
# Returns $text unchanged unless 'separate_cjk' is set, in which case the
# result of cnseg::segment($text) is returned.  $field is currently unused.
sub filter_text {
    my ($self, $field, $text) = @_;

    return $text unless $self->{'separate_cjk'};

    # cnseg is not among this file's "use" lines, so load it on demand
    # unless something else has already defined the segmenter
    require cnseg unless defined &cnseg::segment;

    return &cnseg::segment($text);
}
[14934]	342
[17110]	343
# Keep some statistics relating to metadata sets used and the frequency
# of particular metadata fields within each set, both for the current
# document ('doc_mdprefix_fields') and across the whole collection
# ('mdprefix_fields').
#
# $field      metadata name.  "prefix.name" is counted under its prefix;
#             a name starting with an upper-case letter is counted under
#             the implicit 'ex' set; anything else is ignored.
# $edit_mode  "add" or "update" increment the counts; any other value is
#             treated as a delete and decrements them.
sub infodb_metadata_stats
{
    my ($self, $field, $edit_mode) = @_;

    my ($prefix, $core_field);
    if ($field =~ m/^(.+)\.(.*)$/) {
        ($prefix, $core_field) = ($1, $2);
    }
    elsif ($field =~ m/^[[:upper:]]/) {
        # implicit 'ex' metadata set
        ($prefix, $core_field) = ('ex', $field);
    }
    else {
        return;
    }

    my $adding = defined $edit_mode
        && (($edit_mode eq "add") || ($edit_mode eq "update"));
    my $delta = $adding ? 1 : -1;

    $self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field} += $delta;
    $self->{'mdprefix_fields'}->{$prefix}->{$core_field} += $delta;
}
	386
	387
# Write the info-database entries for one document and update the
# browse list, classifiers and running counters.
#
# $doc_obj    the document object
# $filename   file the document was derived from, or undef when the
#             document was reconstructed from the database (metadata and
#             structure, no text)
# $edit_mode  "add", "update" or "delete"
#
# Documents whose type is neither "indexed_doc" nor "info_doc" are
# ignored.  For "delete" only the associated files are removed; for
# "update" the old associated files are removed before the new ones are
# stored.  In both cases the OID is recorded in
# 'dont_process_reconstructed'.
sub infodbedit {
    my $self = shift (@_);
    my ($doc_obj, $filename, $edit_mode) = @_;

    # only output this document if it is a "indexed_doc" or "info_doc" (database only) document
    my $doctype = $doc_obj->get_doc_type();
    return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");

    my $archivedir = "";
    if (defined $filename)
    {
        # doc_obj derived directly from file: the archive directory is
        # everything before the last path separator (either / or \)
        my ($dir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
        $dir = "" unless defined $dir;
        $dir =~ s/\\/\//g;
        $dir =~ s/^\/+//;
        $dir =~ s/\/+$//;

        $archivedir = $dir;

        if ($edit_mode eq "delete") {
            # record this doc so we don't process the reconstructed doc later
            $self->{'dont_process_reconstructed'}->{$doc_obj->get_OID()} = 1;
            # we don't need to do anything else for the info database for a
            # deleted document. The infodb starts from scratch each time, so
            # no deletion is necessary
            $self->delete_assoc_files ($archivedir, "delete");
            return;
        }
        if ($edit_mode eq "update") {
            # we don't want to process the reconstructed doc later, but we
            # will process this version now.
            $self->{'dont_process_reconstructed'}->{$doc_obj->get_OID()} = 1;
            # delete the old assoc files as they may have changed
            $self->delete_assoc_files ($archivedir, "update");
        }

        # resolve the final filenames of the files associated with this document
        # now save the new assoc files for an update/new doc.
        $self->assoc_files ($doc_obj, $archivedir);
    }
    else
    {
        # doc_obj reconstructed from database (has metadata, doc structure but no text)
        my $top_section = $doc_obj->get_top_section();
        $archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
    }

    # rest of code used for add and update. In both cases, we add to the
    # classifiers and to the info database.

    # add this document to the browse structure
    push(@{$self->{'doclist'}},$doc_obj->get_OID())
        unless ($doctype eq "classification");
    $self->{'num_docs'} += 1 unless ($doctype eq "classification");

    if (!defined $filename) {
        # a reconstructed doc
        my $num_reconstructed_bytes = $doc_obj->get_metadata_element ($doc_obj->get_top_section (), "total_numbytes");
        if (defined $num_reconstructed_bytes) {
            $self->{'num_bytes'} += $num_reconstructed_bytes;
        }
    }
    # classify the document
    &classify::classify_doc ($self->{'classifiers'}, $doc_obj);

    # now add all the sections to the infodb.

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    my $section = $doc_obj->get_top_section ();
    my $doc_OID = $doc_obj->get_OID();
    my $first = 1;
    my $infodb_handle = $self->{'output_handle'};

    $self->{'doc_mdprefix_fields'} = {};

    while (defined $section)
    {
        my $section_OID = $doc_OID;
        if ($section ne "")
        {
            $section_OID = $doc_OID . "." . $section;
        }
        my %section_infodb = ();

        # update a few statistics
        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        $self->{'num_sections'} += 1 unless ($doctype eq "classification");

        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/) {
            $section_infodb{"doctype"} = [ "doc" ];
        }

        if ($first && defined $filename) {
            # if we are at the top level of the document, and we are not a
            # reconstructed document, set the total_text_length - used to
            # count bytes when we reconstruct later
            my $length = $doc_obj->get_total_text_length();
            $section_infodb{"total_numbytes"} = [ $length ];
        }
        # Output whether this node contains text
        #
        # If doc_obj reconstructed from database file then no need to
        # explicitly add <hastxt> as this is preserved as metadata when
        # the database file is loaded in
        if (defined $filename)
        {
            # doc_obj derived directly from file
            if ($doc_obj->get_text_length($section) > 0) {
                $section_infodb{"hastxt"} = [ "1" ];
            } else {
                $section_infodb{"hastxt"} = [ "0" ];
            }
        }

        # output all the section metadata
        my $metadata = $doc_obj->get_all_metadata ($section);
        foreach my $pair (@$metadata) {
            my ($field, $value) = (@$pair);

            if ($field ne "Identifier" && $field !~ /^gsdl/ &&
                defined $value && $value ne "") {

                # escape problematic stuff
                $value =~ s/([^\\])\\([^\\])/$1\\\\$2/g;
                $value =~ s/\n/\\n/g;
                $value =~ s/\r/\\r/g;
                # remove ex. if there
                $field =~ s/^ex\.//;

                # special case for UTF8URL metadata: also write an entry
                # keyed by the value that points back at this section
                if ($field =~ m/^UTF8URL$/i) {
                    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle,
                                                $value, { 'section' => [ $section_OID ] });
                }

                if (!defined $self->{'dontdb'}->{$field}) {
                    push(@{$section_infodb{$field}}, $value);

                    if ($section eq "" && $self->{'store_metadata_coverage'} =~ /^true$/i)
                    {
                        $self->infodb_metadata_stats($field,$edit_mode);
                    }
                }
            }
        }

        if ($section eq "")
        {
            my $doc_mdprefix_fields = $self->{'doc_mdprefix_fields'};

            foreach my $prefix (keys %$doc_mdprefix_fields)
            {
                push(@{$section_infodb{"metadataset"}}, $prefix);

                foreach my $field (keys %{$doc_mdprefix_fields->{$prefix}})
                {
                    push(@{$section_infodb{"metadatalist-$prefix"}}, $field);

                    my $val = $doc_mdprefix_fields->{$prefix}->{$field};
                    push(@{$section_infodb{"metadatafreq-$prefix-$field"}}, $val);
                }
            }
        }

        # If doc_obj reconstructed from database file then no need to
        # explicitly add <archivedir> as this is preserved as metadata when
        # the database file is loaded in
        if (defined $filename)
        {
            # output archivedir if at top level
            if ($section eq $doc_obj->get_top_section()) {
                $section_infodb{"archivedir"} = [ $archivedir ];
            }
        }

        # output document display type
        if ($first) {
            $section_infodb{"thistype"} = [ $thistype ];
        }

        if ($self->{'db_level'} eq "document") {
            # doc num is num_docs not num_sections
            # output the matching document number
            $section_infodb{"docnum"} = [ $self->{'num_docs'} ];
        }
        else {
            # output a list of children
            my $children = $doc_obj->get_children ($section);
            if (scalar(@$children) > 0) {
                $section_infodb{"childtype"} = [ $childtype ];
                my $contains = "";
                foreach my $child (@$children)
                {
                    $contains .= ";" unless ($contains eq "");
                    if ($child =~ /^.*?\.(\d+)$/)
                    {
                        $contains .= "\".$1";
                    }
                    else
                    {
                        $contains .= "\".$child";
                    }
                }
                $section_infodb{"contains"} = [ $contains ];
            }
            # output the matching doc number
            $section_infodb{"docnum"} = [ $self->{'num_sections'} ];
        }

        &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID, \%section_infodb);

        # output a database entry for the document number, unless we are incremental
        unless ($self->is_incremental_capable())
        {
            if ($self->{'db_level'} eq "document") {
                &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'}, { 'section' => [ $doc_OID ] });
            }
            else {
                &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'}, { 'section' => [ $section_OID ] });
            }
        }

        $first = 0;
        $section = $doc_obj->get_next_section($section);
        last if ($self->{'db_level'} eq "document"); # if no sections wanted, only add the docs
    } # while defined section

}
[9919]	616
[15696]	617
[18456]	618
	619
	620	sub infodb {
	621	my $self = shift (@_);
	622	my ($doc_obj, $filename) = @_;
	623
	624	$self->infodbedit($doc_obj,$filename,"add");
	625	}
	626
	627	sub infodbreindex {
	628	my $self = shift (@_);
	629	my ($doc_obj, $filename) = @_;
	630
[18471]	631	$self->infodbedit($doc_obj,$filename,"update");
[18456]	632	}
	633
	634	sub infodbdelete {
	635	my $self = shift (@_);
	636	my ($doc_obj, $filename) = @_;
	637
	638	$self->infodbedit($doc_obj,$filename,"delete");
	639	}
	640
	641
# Placeholder for the text-indexing pass.  The base class reports on
# 'outhandle' that a subclass must implement it, then dies.
sub text {
    my ($self, $doc_obj) = @_;

    my $log = $self->{'outhandle'};
    print $log "basebuildproc::text function must be implemented in sub classes\n";
    die "\n";
}
	650
# Placeholder for re-indexing a changed document.  The base class reports
# on 'outhandle' that a subclass must implement it, adds a hint when the
# object is not incremental capable, then dies.
sub textreindex
{
    my ($self) = @_;

    my $log = $self->{'outhandle'};
    print $log "basebuildproc::textreindex function must be implemented in sub classes\n";
    unless ($self->is_incremental_capable()) {
        print $log " This operation is only possible with indexing tools with that support\n";
        print $log " incremental building\n";
    }
    die "\n";
}
	664
	665	sub textdelete
	666	{
	667	my $self = shift @_;
	668
	669	my $outhandle = $self->{'outhandle'};
	670	print $outhandle "basebuildproc::textdelete function must be implemented in sub classes\n";
	671	if (!$self->is_incremental_capable()) {
	672
	673	print $outhandle " This operation is only possible with indexing tools with that support\n";
	674	print $outhandle " incremental building\n";
	675	}
	676	die "\n";
	677	}
	678
	679
# should the document be indexed - according to the subcollection and language
# specification.
#
# Each entry of 'indexexparr' has the form "field/expression/options".
# A leading "!" on the field negates the test; the field "filename"
# (any case) tests the source filename, anything else tests top-section
# metadata (a leading "ex." is dropped); the option "i" makes the match
# case-insensitive.  The document passes as soon as one entry is
# satisfied by one value.  With no entries at all it passes.
#
# If the document passed and 'lang_meta' is defined, the language
# metadata value must additionally satisfy one entry of 'langarr'
# (again "!" negates).
#
# Returns 1 when the document should be indexed, 0 otherwise.
sub is_subcollection_doc {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $indexed_doc = 1;
    foreach my $indexexp (@{$self->{'indexexparr'}}) {
        $indexed_doc = 0;
        my ($field, $exp, $options) = split /\//, $indexexp;
        if (defined ($field) && defined ($exp)) {
            my $negate = ($field =~ s/^!//) ? 1 : 0;
            # the options part is optional, so it may be undefined
            my $ignore_case = (defined $options && $options =~ /^i$/i);

            my @metadata_values;
            if ($field =~ /^filename$/i) {
                push(@metadata_values, $doc_obj->get_source_filename());
            }
            else {
                $field =~ s/^ex\.//; #strip ex. if present
                @metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
            }
            next unless @metadata_values;

            # compile once per expression rather than once per value
            my $pattern = $ignore_case ? qr/$exp/i : qr/$exp/;
            foreach my $metadata_value (@metadata_values) {
                my $matched = ($metadata_value =~ $pattern) ? 1 : 0;
                if ($matched != $negate) {$indexed_doc = 1; last;}
            }

            last if ($indexed_doc == 1);
        }
    }

    # if this doc is so far in the sub collection, and we have lang info,
    # now we check the languages to see if it matches
    if ($indexed_doc && defined $self->{'lang_meta'}) {
        $indexed_doc = 0;
        my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
        if (defined $field) {
            foreach my $lang_spec (@{$self->{'langarr'}}) {
                # work on a copy: the loop variable aliases the stored
                # element, and stripping "!" from it in place would turn a
                # negated language into a positive one for every later doc
                my $lang = $lang_spec;
                my $negate = ($lang =~ s/^!//) ? 1 : 0;
                my $pattern = qr/$lang/;
                my $matched = ($field =~ $pattern) ? 1 : 0;
                if ($matched != $negate) {
                    $indexed_doc = 1; last;
                }
            }
        }
    }
    return $indexed_doc;
}
	746
	747	# use 'Paged' if document has no more than 2 levels
	748	# and each section at second level has a number for
	749	# Title metadata
	750	# also use Paged if gsdlthistype metadata is set to Paged
	751	sub get_document_type {
	752	my $self = shift (@_);
	753	my ($doc_obj) = @_;
	754
	755	my $thistype = "VList";
	756	my $childtype = "VList";
	757	my $title;
	758	my @tmp = ();
	759
	760	my $section = $doc_obj->get_top_section ();
	761
	762	my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
	763	if (defined $gsdlthistype) {
	764	if ($gsdlthistype eq "Paged") {
	765	$childtype = "Paged";
	766	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
	767	$thistype = "Paged";
	768	} else {
	769	$thistype = "Invisible";
	770	}
	771
	772	return ($thistype, $childtype);
	773	} elsif ($gsdlthistype eq "Hierarchy") {
	774	return ($thistype, $childtype); # use VList, VList
	775	}
	776	}
	777	my $first = 1;
	778	while (defined $section) {
	779	@tmp = split /\./, $section;
	780	if (scalar(@tmp) > 1) {
	781	return ($thistype, $childtype);
	782	}
	783	if (!$first) {
	784	$title = $doc_obj->get_metadata_element ($section, "Title");
	785	if (!defined $title \|\| $title !~ /^\d+$/) {
	786	return ($thistype, $childtype);
	787	}
	788	}
	789	$first = 0;
	790	$section = $doc_obj->get_next_section($section);
	791	}
	792	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
	793	$thistype = "Paged";
	794	} else {
	795	$thistype = "Invisible";
	796	}
	797	$childtype = "Paged";
	798	return ($thistype, $childtype);
	799	}
	800
# Hard-link each file associated with $doc_obj into the assoc directory.
# Each entry returned by get_assoc_files() is used as [source, target]:
# a target starting with a slash (or backslash) is placed relative to
# 'assocdir', any other target relative to 'assocdir'/$archivedir.
sub assoc_files
{
    my ($self, $doc_obj, $archivedir) = @_;

    foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
        my $source = $assoc_file->[0];
        my $target = $assoc_file->[1];

        # if assoc file starts with a slash, we put it relative to the assoc
        # dir, otherwise it is relative to the HASH... directory
        my @path_parts = ($self->{'assocdir'});
        push(@path_parts, $archivedir) unless $target =~ m@^[/\\]@;

        my $afile = &util::filename_cat(@path_parts, $target);
        &util::hard_link ($source, $afile, $self->{'verbosity'});
    }
}
	819
# Remove the directory holding one document's associated files, i.e.
# 'assocdir'/$archivedir, if it exists.  $edit_mode is accepted for the
# caller's benefit but not used.
#
# Does nothing when $archivedir is undefined or empty: the path would
# then presumably resolve to the shared assoc directory itself, and that
# must never be removed on behalf of a single document.
sub delete_assoc_files
{
    my ($self, $archivedir, $edit_mode) = @_;

    return unless defined $archivedir && $archivedir ne "";

    my $assoc_dir = &util::filename_cat($self->{'assocdir'}, $archivedir);
    if (-d $assoc_dir) {
        &util::rm_r($assoc_dir);
    }
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: