Context Navigation

source: main/trunk/greenstone2/perllib/basebuildproc.pm@ 34997

Last change on this file since 34997 was 33302, checked in by ak19, 5 years ago
1. Adding GPSMapOverlayLabel extracted from GPS.mapOverlay meta to text indexes for searching, as with Coordinate and CoordShort. 2. Added a shortname for this index, ML for MapLabel. 3. On testing the indexing of the GPSMapOverlayLabel text, the old problem of increasingly duplicated Coordinate/CoordShort and now also GPSMapOverlayLabel meta in the infodb reappeared. Dr Bainbridge explained why this was (documented as comments in this commit) and fixed the problem by not processing GPS.mapOverlay meta into Coordinate and Label meta during the infodb pass (and dummy pass, so specifically non-text passes) of buildcol. A natural consequence is that to check whether Coord and Label meta have been indexed, one can no longer check the index/text/col.jdb but needs to use Luke (if a lucene collection) to check the contents of index/sidx and index/didx. 4. An important change needed for the bugfix in 3 is reordering the call to &classify::reconstruct_doc_objs_metadata() in basebuilder.pm to take place AFTER build_proc->set_mode(infodb) has taken place. 5. Changed cross-file global variables declared in doc.pm from our to my variables and tested that this works.
Property svn:keywords set to `Author Date Id Revision`
File size: 23.6 KB

Line
1	##########################################################################
2	#
3	# basebuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document for indexing (should be
27	# implemented by subclass) and storing in the database
28
29	package basebuildproc;
30
31	eval {require bytes};
32
33	use classify;
34	use dbutil;
35	use doc;
36	use docproc;
37	use strict;
38	no strict 'subs';
39	no strict 'refs';
40	use util;
41	use FileUtils;
42
# Declare docproc as the parent class. This runs inside BEGIN, i.e. at
# compile time.
BEGIN {
    @basebuildproc::ISA = qw(docproc);
}
46
# Constructor.
#
# Arguments (after the class name):
#   $collection - stored under 'collection'
#   $source_dir - stored under 'source_dir'
#   $build_dir  - stored under 'build_dir'; also the initial 'assocdir'
#   $keepold    - when true, the starting document/section/byte counters
#                 are seeded from build.cfg (incremental building)
#   $verbosity  - stored under 'verbosity'
#   $outhandle  - where all the debugging info goes; defaults to STDERR
#
# Returns the new object blessed into $class.
sub new
{
    my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;

    # Explicit class-method call: the indirect-object form "new docproc ()"
    # is ambiguous to the parser inside a package that defines its own new().
    my $self = docproc->new();

    # outhandle is where all the debugging info goes
    # output_handle is where the output of the plugins is piped
    # to (i.e. mg, database etc.)
    $outhandle = 'STDERR' unless defined $outhandle;

    $self->{'collection'} = $collection;
    $self->{'source_dir'} = $source_dir;
    $self->{'build_dir'} = $build_dir;
    $self->{'keepold'} = $keepold;
    $self->{'verbosity'} = $verbosity;
    $self->{'outhandle'} = $outhandle;

    $self->{'classifiers'} = [];
    $self->{'mode'} = "text";
    $self->{'assocdir'} = $build_dir;
    $self->{'dontdb'} = {};
    $self->{'store_metadata_coverage'} = "false";

    $self->{'index'} = "section:text";
    $self->{'indexexparr'} = [];

    $self->{'separate_cjk'} = 0;

    my $found_num_data = 0;
    my $buildconfigfile = undef;

    if ($keepold) {
        # For incremental building need to seed num_docs etc from values
        # stored in build.cfg (if present)
        $buildconfigfile = &FileUtils::filenameConcatenate($build_dir, "build.cfg");
        if (-e $buildconfigfile) {
            $found_num_data = 1;
        }
        else {
            # try the index dir
            $buildconfigfile = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},
                                                               "index", "build.cfg");
            if (-e $buildconfigfile) {
                $found_num_data = 1;
            }
        }
    }

    # Counters start at zero unless a previous build.cfg supplies them.
    # A key missing from build.cfg also falls back to zero rather than
    # leaving the counter undefined.
    my %starting = ('numdocs' => 0, 'numsections' => 0, 'numbytes' => 0);
    if ($found_num_data) {
        my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
        foreach my $key (keys %starting) {
            $starting{$key} = $buildcfg->{$key} if defined $buildcfg->{$key};
        }
    }
    $self->{'starting_num_docs'} = $starting{'numdocs'};
    $self->{'starting_num_sections'} = $starting{'numsections'};
    $self->{'starting_num_bytes'} = $starting{'numbytes'};

    $self->{'output_handle'} = "STDOUT";
    $self->{'num_docs'} = $self->{'starting_num_docs'};
    $self->{'num_sections'} = $self->{'starting_num_sections'};
    $self->{'num_bytes'} = $self->{'starting_num_bytes'};

    $self->{'num_processed_bytes'} = 0;
    $self->{'store_text'} = 1;

    # what level (section/document) the database - indexer intersection is
    $self->{'db_level'} = "section";
    # used by browse interface
    $self->{'doclist'} = [];

    $self->{'indexing_text'} = 0;

    return bless $self, $class;
}
133
# Restore the running counters to their starting values and clear the
# processed-bytes count.
sub reset {
    my ($self) = @_;

    @{$self}{qw(num_docs num_sections num_bytes)} =
        @{$self}{qw(starting_num_docs starting_num_sections starting_num_bytes)};

    $self->{'num_processed_bytes'} = 0;
}
143
# Set every running counter to zero, ignoring the starting values.
sub zero_reset {
    my ($self) = @_;

    foreach my $counter (qw(num_docs num_sections)) {
        $self->{$counter} = 0;
    }
    # reconstructed docs have no text, just metadata, so we need to
    # remember how many bytes we had initially
    #$self->{'num_bytes'} = $self->{'starting_num_bytes'};
    $self->{'num_bytes'} = 0; # we'll store num bytes in db for reconstructed docs.
    $self->{'num_processed_bytes'} = 0;
}
155
# Report whether incremental building is supported. The base answer is
# 'no' (0): it is safer to assume non-incremental and let inherited
# classes that are incremental override this.
sub is_incremental_capable { return 0; }
164
# Return the current document count.
sub get_num_docs {
    my ($self) = @_;
    return $self->{'num_docs'};
}
170
# Return the current section count.
sub get_num_sections {
    my ($self) = @_;
    return $self->{'num_sections'};
}
176
# Return num_bytes, the actual number of bytes in the collection. This is
# normally the same as what's processed during text compression.
sub get_num_bytes {
    my ($self) = @_;
    return $self->{'num_bytes'};
}
184
# Return num_processed_bytes, the number of bytes actually passed to mg
# for the current index.
sub get_num_processed_bytes {
    my ($self) = @_;
    return $self->{'num_processed_bytes'};
}
192
# Store the handle that plugin output is written to.
#
# The output handle isn't always an actual handle. In a couple of the
# database drivers (MSSQL and GDBMServer) it's actually a reference to an
# object, so the utf8 layer is applied only to real GLOB references.
# [jmt12]
sub set_output_handle {
    my ($self, $handle) = @_;

    $self->{'output_handle'} = $handle;
    binmode($handle, ":utf8") if (ref($handle) eq "GLOB");
}
207
208
# Record the current mode on this object and mirror it into
# $doc::processor_mode, because doc.pm needs to know what buildcol pass
# we're at.
sub set_mode {
    my ($self, $mode) = @_;

    $self->{'mode'} = $mode;
    $doc::processor_mode = $self->{'mode'};
}
216
# Return the mode most recently stored on this object.
sub get_mode {
    my ($self) = @_;
    return $self->{'mode'};
}
222
# Set the directory under which associated files are placed.
sub set_assocdir {
    my ($self, $assocdir) = @_;
    $self->{'assocdir'} = $assocdir;
}
229
# Set the hash of metadata field names that must not be written to the
# info database (infodbedit skips any field that is a defined key here).
sub set_dontdb {
    my ($self, $dontdb) = @_;
    $self->{'dontdb'} = $dontdb;
}
236
# Set the info database type that is passed to dbutil when writing entries.
sub set_infodbtype
{
    my ($self, $infodbtype) = @_;
    $self->{'infodbtype'} = $infodbtype;
}
243
# Set the current index specification. The list of subcollection
# expressions is replaced only when one is supplied.
sub set_index {
    my ($self, $index, $indexexparr) = @_;

    $self->{'index'} = $index;
    if (defined $indexexparr) {
        $self->{'indexexparr'} = $indexexparr;
    }
}
251
# Set the language metadata field name and the list of language
# expressions consulted by is_subcollection_doc.
sub set_index_languages {
    my ($self, $lang_meta, $langarr) = @_;

    # strip any ex. namespace iff it's the only namespace prefix
    # (will leave ex.dc.* intact)
    $lang_meta =~ s/^ex\.([^.]+)$/$1/;

    $self->{'lang_meta'} = $lang_meta;
    $self->{'langarr'} = $langarr;
}
260
# Return the current index specification.
sub get_index {
    my ($self) = @_;
    return $self->{'index'};
}
266
# Set the classifiers that infodbedit hands to classify::classify_doc.
sub set_classifiers {
    my ($self, $classifiers) = @_;
    $self->{'classifiers'} = $classifiers;
}
273
# Set the indexing_text flag.
sub set_indexing_text {
    my ($self, $indexing_text) = @_;
    $self->{'indexing_text'} = $indexing_text;
}
280
# Return the indexing_text flag.
sub get_indexing_text {
    my ($self) = @_;
    return $self->{'indexing_text'};
}
286
# Set the store_text flag.
sub set_store_text {
    my ($self, $store_text) = @_;
    $self->{'store_text'} = $store_text;
}
293
# Set the store_metadata_coverage option. infodbedit treats the values
# "true" (any case) and "1" as enabled. Any false value (undef, 0, "")
# is stored as the empty string so later string comparisons never see
# undef.
sub set_store_metadata_coverage {
    my ($self, $store_metadata_coverage) = @_;

    $self->{'store_metadata_coverage'} = $store_metadata_coverage || "";
}
300
# Return the list of document OIDs collected so far (the contents of
# 'doclist', not a reference to it).
sub get_doc_list {
    my ($self) = @_;

    my $doclist = $self->{'doclist'};
    return @$doclist;
}
306
# Set the database level. The standard level is section, but you may
# want to change it to document.
sub set_db_level {
    my ($self, $db_level) = @_;
    $self->{'db_level'} = $db_level;
}
314
# Store the sections_index_document_metadata setting on this object.
sub set_sections_index_document_metadata {
    my ($self, $index_type) = @_;
    $self->{'sections_index_document_metadata'} = $index_type;
}
321
# Set the separate_cjk flag; when it is true, filter_text runs text
# through cnseg::segment.
sub set_separate_cjk {
    my ($self, $sep_cjk) = @_;
    $self->{'separate_cjk'} = $sep_cjk;
}
328
# Dispatch to the method named by the current mode, passing on all
# remaining arguments.
sub process {
    my $self = shift (@_);

    my $pass = $self->{'mode'};
    return $self->$pass(@_);
}
335
# Post-process text depending on field. Currently the only processing is
# cjk separation; $field is accepted but not consulted.
# This should only be done for indexed text (if $self->{'indexing_text'}),
# but currently search term highlighting doesn't work if you do that.
# Once that is fixed up, fix this.
sub filter_text {
    my ($self, $field, $text) = @_;

    # cjk segmentation happens here when enabled; otherwise the text is
    # passed through unchanged
    my $filtered = $self->{'separate_cjk'} ? &cnseg::segment($text) : $text;
    return $filtered;
}
352
353
# Keep some statistics relating to metadata sets used and the frequency
# of particular metadata fields within each set.
#
# Counts are kept both for the current document ('doc_mdprefix_fields')
# and across the whole collection ('mdprefix_fields'), keyed first by
# metadata-set prefix and then by field name.
#
# Arguments:
#   $field     - metadata field name, e.g. "dc.Title" or "Title"
#   $edit_mode - "add" and "update" increment the counts; anything else
#                is treated as a delete and decrements them
#
# A field with no "." that does not start with an upper-case letter is
# not counted.
sub infodb_metadata_stats
{
    my ($self, $field, $edit_mode) = @_;

    my ($prefix, $core_field);
    if ($field =~ m/^(.+)\.(.*)$/) {
        # explicit metadata set: everything before the last "." is the prefix
        ($prefix, $core_field) = ($1, $2);
    }
    elsif ($field =~ m/^[[:upper:]]/) {
        # implicit 'ex' metadata set
        ($prefix, $core_field) = ('ex', $field);
    }
    else {
        return;
    }

    my $delta = (($edit_mode eq "add") || ($edit_mode eq "update")) ? 1 : -1;

    $self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field} += $delta;
    $self->{'mdprefix_fields'}->{$prefix}->{$core_field} += $delta;
    return;
}
396
397
# Add, update or delete a document in the info database pass.
#
# Arguments:
#   $doc_obj   - the document object
#   $filename  - file the document was derived from; undefined when the
#                doc_obj was reconstructed from the database (has
#                metadata and doc structure but no text)
#   $edit_mode - "add", "update" or "delete"
#
# Only "indexed_doc" and "info_doc" (database only) documents are
# processed. For a delete, the associated files are removed and nothing
# is written. Otherwise the document is classified and one infodb entry
# is written per section (or only for the top section when db_level is
# "document").
sub infodbedit {
    my $self = shift (@_);
    my ($doc_obj, $filename, $edit_mode) = @_;

    # only output this document if it is a "indexed_doc" or "info_doc" (database only) document
    my $doctype = $doc_obj->get_doc_type();
    return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");

    my $archivedir = "";
    if (defined $filename)
    {
        # doc_obj derived directly from file: the archive dir is the
        # filename minus its last path component (either slash style)
        my ($dir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
        $dir = "" unless defined $dir;
        $dir =~ s/\\/\//g;
        $dir =~ s/^\/+//;
        $dir =~ s/\/+$//;

        $archivedir = $dir;

        if ($edit_mode eq "delete") {
            # record this doc so we don't process the reconstructed doc later
            $self->{'dont_process_reconstructed'}->{$doc_obj->get_OID()} = 1;
            # we don't need to do anything else for the info database for a
            # deleted document. The infodb starts from scratch each time, so
            # no deletion is necessary
            $self->delete_assoc_files ($archivedir, "delete");
            return;
        }
        if ($edit_mode eq "update") {
            # we don't want to process the reconstructed doc later, but we
            # will process this version now.
            $self->{'dont_process_reconstructed'}->{$doc_obj->get_OID()} = 1;
            # delete the old assoc files as they may have changed
            $self->delete_assoc_files ($archivedir, "update");
        }

        # resolve the final filenames of the files associated with this document
        # now save the new assoc files for an update/new doc.
        $self->assoc_files ($doc_obj, $archivedir);
    }
    else
    {
        # doc_obj reconstructed from database (has metadata, doc structure but no text)
        my $top_section = $doc_obj->get_top_section();
        $archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
    }

    # rest of code used for add and update. In both cases, we add to the
    # classifiers and to the info database.

    # add this document to the browse structure
    push(@{$self->{'doclist'}},$doc_obj->get_OID())
        unless ($doctype eq "classification");
    $self->{'num_docs'} += 1 unless ($doctype eq "classification");

    if (!defined $filename) {
        # a reconstructed doc
        my $num_reconstructed_bytes = $doc_obj->get_metadata_element ($doc_obj->get_top_section (), "total_numbytes");
        if (defined $num_reconstructed_bytes) {
            $self->{'num_bytes'} += $num_reconstructed_bytes;
        }
    }
    # classify the document
    &classify::classify_doc ($self->{'classifiers'}, $doc_obj);

    # now add all the sections to the infodb.

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    my $section = $doc_obj->get_top_section ();
    my $doc_OID = $doc_obj->get_OID();
    my $first = 1;
    my $infodb_handle = $self->{'output_handle'};

    $self->{'doc_mdprefix_fields'} = {};

    while (defined $section)
    {
        my $section_OID = $doc_OID;
        if ($section ne "")
        {
            $section_OID = $doc_OID . "." . $section;
        }
        my %section_infodb = ();

        # update a few statistics
        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        $self->{'num_sections'} += 1 unless ($doctype eq "classification");

        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        if (!defined $dtype || $dtype !~ /\w/) {
            $section_infodb{"doctype"} = [ "doc" ];
        }

        if ($first && defined $filename) {
            # if we are at the top level of the document, and we are not a
            # reconstructed document, set the total_text_length - used to
            # count bytes when we reconstruct later
            my $length = $doc_obj->get_total_text_length();
            $section_infodb{"total_numbytes"} = [ $length ];
        }
        # Output whether this node contains text
        #
        # If doc_obj reconstructed from database file then no need to
        # explicitly add <hastxt> as this is preserved as metadata when
        # the database file is loaded in
        if (defined $filename)
        {
            # doc_obj derived directly from file
            if ($doc_obj->get_text_length($section) > 0) {
                $section_infodb{"hastxt"} = [ "1" ];
            } else {
                $section_infodb{"hastxt"} = [ "0" ];
            }
        }

        # output all the section metadata
        my $metadata = $doc_obj->get_all_metadata ($section);
        foreach my $pair (@$metadata) {
            my ($field, $value) = (@$pair);

            if ($field ne "Identifier" && $field !~ /^gsdl/ &&
                defined $value && $value ne "") {

                # escape problematic stuff
                $value =~ s/([^\\])\\([^\\])/$1\\\\$2/g;
                $value =~ s/\n/\\n/g;
                $value =~ s/\r/\\r/g;
                # remove any ex. iff it's the only namespace prefix (will leave ex.dc.* intact)
                $field =~ s/^ex\.([^.]+)$/$1/; # $field =~ s/^ex\.//;

                # special case for UTF8URL metadata
                if ($field =~ m/^UTF8URL$/i) {
                    &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle,
                                                $value, { 'section' => [ $section_OID ] });
                }

                if (!defined $self->{'dontdb'}->{$field}) {
                    push(@{$section_infodb{$field}}, $value);

                    if ($section eq ""
                        && (($self->{'store_metadata_coverage'} =~ /^true$/i)
                            || $self->{'store_metadata_coverage'} eq "1"))
                    {
                        $self->infodb_metadata_stats($field,$edit_mode);
                    }
                }
            }
        }

        if ($section eq "")
        {
            my $doc_mdprefix_fields = $self->{'doc_mdprefix_fields'};

            foreach my $prefix (keys %$doc_mdprefix_fields)
            {
                push(@{$section_infodb{"metadataset"}}, $prefix);

                foreach my $field (keys %{$doc_mdprefix_fields->{$prefix}})
                {
                    push(@{$section_infodb{"metadatalist-$prefix"}}, $field);

                    my $val = $doc_mdprefix_fields->{$prefix}->{$field};
                    push(@{$section_infodb{"metadatafreq-$prefix-$field"}}, $val);
                }
            }
        }

        # If doc_obj reconstructed from database file then no need to
        # explicitly add <archivedir> as this is preserved as metadata when
        # the database file is loaded in
        if (defined $filename)
        {
            # output archivedir if at top level
            if ($section eq $doc_obj->get_top_section()) {
                $section_infodb{"archivedir"} = [ $archivedir ];
            }
        }

        # output document display type
        if ($first) {
            $section_infodb{"thistype"} = [ $thistype ];
        }

        if ($self->{'db_level'} eq "document") {
            # doc num is num_docs not num_sections
            # output the matching document number
            $section_infodb{"docnum"} = [ $self->{'num_docs'} ];
        }
        else {
            # output a list of children
            my $children = $doc_obj->get_children ($section);
            if (scalar(@$children) > 0) {
                $section_infodb{"childtype"} = [ $childtype ];
                my $contains = "";
                foreach my $child (@$children)
                {
                    $contains .= ";" unless ($contains eq "");
                    if ($child =~ /^.*?\.(\d+)$/)
                    {
                        $contains .= "\".$1";
                    }
                    else
                    {
                        $contains .= "\".$child";
                    }
                }
                $section_infodb{"contains"} = [ $contains ];
            }
            # output the matching doc number
            $section_infodb{"docnum"} = [ $self->{'num_sections'} ];
        }

        if (defined $section_infodb{'assocfilepath'})
        {
            &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_infodb{'assocfilepath'}[0], { 'contains' => [ $section_OID ]});
        }
        &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID, \%section_infodb);

        # output a database entry for the document number, unless we are incremental
        unless ($self->is_incremental_capable())
        {
            if ($self->{'db_level'} eq "document") {
                &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'}, { 'section' => [ $doc_OID ] });
            }
            else {
                &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'}, { 'section' => [ $section_OID ] });
            }
        }

        $first = 0;
        $section = $doc_obj->get_next_section($section);
        last if ($self->{'db_level'} eq "document"); # if no sections wanted, only add the docs
    } # while defined section

}
632
633
634
635
# Info database pass for a new document: delegates to infodbedit in
# "add" mode.
sub infodb {
    my ($self, $doc_obj, $filename) = @_;

    return $self->infodbedit($doc_obj, $filename, "add");
}
642
# Info database pass for a changed document: delegates to infodbedit in
# "update" mode.
sub infodbreindex {
    my ($self, $doc_obj, $filename) = @_;

    return $self->infodbedit($doc_obj, $filename, "update");
}
649
# Info database pass for a removed document: delegates to infodbedit in
# "delete" mode.
sub infodbdelete {
    my ($self, $doc_obj, $filename) = @_;

    return $self->infodbedit($doc_obj, $filename, "delete");
}
656
657
# Placeholder for the text pass: reports on outhandle that sub classes
# must implement it, then dies.
sub text {
    my ($self, $doc_obj) = @_;

    print {$self->{'outhandle'}} "basebuildproc::text function must be implemented in sub classes\n";
    die "\n";
}
666
# Placeholder for the text re-index pass: reports on outhandle that sub
# classes must implement it (with an extra hint when this class is not
# incremental capable), then dies.
sub textreindex
{
    my ($self) = @_;

    my $log = $self->{'outhandle'};
    print $log "basebuildproc::textreindex function must be implemented in sub classes\n";
    unless ($self->is_incremental_capable()) {
        print $log " This operation is only possible with indexing tools with that support\n",
                   " incremental building\n";
    }
    die "\n";
}
680
# Placeholder for the text delete pass: reports on outhandle that sub
# classes must implement it (with an extra hint when this class is not
# incremental capable), then dies.
sub textdelete
{
    my ($self) = @_;

    my $log = $self->{'outhandle'};
    print $log "basebuildproc::textdelete function must be implemented in sub classes\n";
    unless ($self->is_incremental_capable()) {
        print $log " This operation is only possible with indexing tools with that support\n",
                   " incremental building\n";
    }
    die "\n";
}
694
695
# Should the document be indexed - according to the subcollection and
# language specification.
#
# Subcollection expressions ('indexexparr') have the form
# "field/regex/options". A leading "!" on the field negates the test,
# the field "filename" tests the source filename instead of metadata,
# and the option "i" makes the match case-insensitive. The document
# passes as soon as one expression is satisfied by one of its values.
# With no expressions at all, every document passes.
#
# If the document passes and 'lang_meta' is set, it must also satisfy at
# least one entry of 'langarr' (again, a leading "!" negates).
#
# Returns 1 if the document should be indexed, 0 otherwise.
sub is_subcollection_doc {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $indexed_doc = 1;
    foreach my $indexexp (@{$self->{'indexexparr'}}) {
        $indexed_doc = 0;
        my ($field, $exp, $options) = split /\//, $indexexp;
        if (defined ($field) && defined ($exp)) {
            # $field is a copy produced by split, so stripping the "!" here
            # does not touch the stored expression
            my $negated = ($field =~ s/^!//) ? 1 : 0;

            # Precompile: an empty $exp then matches everything, instead of
            # silently reusing the last successfully matched pattern
            my $ignore_case = (defined $options && $options =~ /^i$/i);
            my $regex = $ignore_case ? qr/$exp/i : qr/$exp/;

            my @metadata_values;
            if ($field =~ /^filename$/i) {
                push(@metadata_values, $doc_obj->get_source_filename());
            }
            else {
                $field =~ s/^ex\.([^.]+)$/$1/; # remove any ex. iff it's the only namespace prefix (will leave ex.dc.* intact)
                @metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
            }
            next unless @metadata_values;
            foreach my $metadata_value (@metadata_values) {
                my $matched = ($metadata_value =~ $regex) ? 1 : 0;
                # satisfied when a plain test matches or a negated test does not
                if ($matched != $negated) {
                    $indexed_doc = 1;
                    last;
                }
            }

            last if ($indexed_doc == 1);
        }
    }

    # if this doc is so far in the sub collection, and we have lang info,
    # now we check the languages to see if it matches
    if ($indexed_doc && defined $self->{'lang_meta'}) {
        $indexed_doc = 0;
        my $doc_lang = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
        if (defined $doc_lang) {
            foreach my $lang_spec (@{$self->{'langarr'}}) {
                # Work on a copy: the loop variable aliases the stored array
                # element, so editing it in place would strip the "!" for
                # every later document
                my $pattern = $lang_spec;
                my $negated = ($pattern =~ s/^!//) ? 1 : 0;
                my $regex = qr/$pattern/;
                my $matched = ($doc_lang =~ $regex) ? 1 : 0;
                if ($matched != $negated) {
                    $indexed_doc = 1;
                    last;
                }
            }
        }
    }
    return $indexed_doc;

}
762
# Work out how a document and its children should be displayed.
#
# Use 'Paged' if the document has no more than 2 levels and each section
# at second level has a number for Title metadata. Also use Paged (or
# PagedHierarchy, the gs3 option) if gsdlthistype metadata on the top
# section says so; gsdlthistype "hierarchy" forces VList.
#
# For the paged types, the document's own type is "Invisible" when the
# top section has no text.
#
# Returns the list ($thistype, $childtype).
sub get_document_type {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my @vlist = ("VList", "VList");
    my $top_section = $doc_obj->get_top_section ();

    # An explicit gsdlthistype on the top section overrides the structural test
    my $gsdlthistype = $doc_obj->get_metadata_element ($top_section, "gsdlthistype");
    if (defined $gsdlthistype) {
        my $paged_type;
        if ($gsdlthistype =~ /^paged$/i) {
            $paged_type = "Paged";
        }
        # gs3 pagedhierarchy option
        elsif ($gsdlthistype =~ /^pagedhierarchy$/i) {
            $paged_type = "PagedHierarchy";
        }
        elsif ($gsdlthistype =~ /^hierarchy$/i) {
            return @vlist; # use VList, VList
        }

        if (defined $paged_type) {
            my $thistype = $doc_obj->get_text_length ($top_section) ? $paged_type : "Invisible";
            return ($thistype, $paged_type);
        }
    }

    # Structural test: walk every section in order
    my $first = 1;
    my $section = $top_section;
    while (defined $section) {
        # a dotted section number means a third level exists
        my @levels = split /\./, $section;
        return @vlist if (scalar(@levels) > 1);

        if (!$first) {
            my $title = $doc_obj->get_metadata_element ($section, "Title");
            if (!defined $title || $title !~ /^\d+$/) {
                return @vlist;
            }
        }
        $first = 0;
        $section = $doc_obj->get_next_section($section);
    }

    my $thistype = $doc_obj->get_text_length ($top_section) ? "Paged" : "Invisible";
    return ($thistype, "Paged");
}
827
# Hard-link every file associated with $doc_obj into the assoc directory.
#
# Each associated file is a pair: element 0 is the source path and
# element 1 the target name. If the target name starts with a slash it
# is placed relative to the assoc dir itself, otherwise relative to
# $archivedir (the HASH... directory) beneath it.
sub assoc_files
{
    my ($self, $doc_obj, $archivedir) = @_;

    foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
        #print STDERR "Processing associated file - copy " . $assoc_file->[0] . " to " . $assoc_file->[1] . "\n";
        my @path_parts = ($self->{'assocdir'});
        push(@path_parts, $archivedir) unless ($assoc_file->[1] =~ m@^[/\\]@);
        push(@path_parts, $assoc_file->[1]);

        my $afile = &FileUtils::filenameConcatenate(@path_parts);
        &FileUtils::hardLink($assoc_file->[0], $afile, $self->{'verbosity'});
    }
}
847
# Remove the associated-files directory for $archivedir (beneath the
# assoc dir), if it exists. $edit_mode is accepted but not consulted.
sub delete_assoc_files
{
    my ($self, $archivedir, $edit_mode) = @_;

    my $assoc_dir = &FileUtils::filenameConcatenate($self->{'assocdir'}, $archivedir);
    &FileUtils::removeFilesRecursive($assoc_dir) if (-d $assoc_dir);
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: