Context Navigation

source: main/trunk/greenstone2/perllib/basebuildproc.pm@ 27306

Last change on this file since 27306 was 27306, checked in by jmt12, 11 years ago
Moving the critical file-related functions (copy, rm, etc) out of util.pm into their own proper class FileUtils. Use of the old functions in util.pm will prompt deprecated warning messages. There may be further functions that could be moved across in the future, but these are the critical ones when considering supporting other filesystems (HTTP, HDFS, WebDav, etc). Updated some key files to use the new functions so now deprecated messages thrown when importing/building demo collection 'out of the box'
Property svn:keywords set to `Author Date Id Revision`
File size: 23.4 KB

Line
1	##########################################################################
2	#
3	# basebuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document for indexing (should be
27	# implemented by subclass) and storing in the database
28
29	package basebuildproc;
30
31	eval {require bytes};
32
33	use classify;
34	use dbutil;
35	use doc;
36	use docproc;
37	use strict;
38	no strict 'subs';
39	no strict 'refs';
40	use util;
41	use FileUtils;
42
43	BEGIN {
44	@basebuildproc::ISA = ('docproc');
45	}
46
47	sub new()
48	{
49	my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;
50	my $self = new docproc ();
51
52	# outhandle is where all the debugging info goes
53	# output_handle is where the output of the plugins is piped
54	# to (i.e. mg, database etc.)
55	$outhandle = STDERR unless defined $outhandle;
56
57	$self->{'collection'} = $collection;
58	$self->{'source_dir'} = $source_dir;
59	$self->{'build_dir'} = $build_dir;
60	$self->{'keepold'} = $keepold;
61	$self->{'verbosity'} = $verbosity;
62	$self->{'outhandle'} = $outhandle;
63
64	$self->{'classifiers'} = [];
65	$self->{'mode'} = "text";
66	$self->{'assocdir'} = $build_dir;
67	$self->{'dontdb'} = {};
68	$self->{'store_metadata_coverage'} = "false";
69
70	$self->{'index'} = "section:text";
71	$self->{'indexexparr'} = [];
72
73	$self->{'separate_cjk'} = 0;
74
75	my $found_num_data = 0;
76	my $buildconfigfile = undef;
77
78	if ($keepold) {
79	# For incremental building need to seed num_docs etc from values
80	# stored in build.cfg (if present)
81	$buildconfigfile = &FileUtils::filenameConcatenate($build_dir, "build.cfg");
82	if (-e $buildconfigfile) {
83	$found_num_data = 1;
84	}
85	else {
86	# try the index dir
87	$buildconfigfile = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},
88	"index", "build.cfg");
89	if (-e $buildconfigfile) {
90	$found_num_data = 1;
91	}
92	}
93
94	}
95
96	if ($found_num_data)
97	{
98	#print STDERR "Found_Num_Data!\n";
99	my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
100	$self->{'starting_num_docs'} = $buildcfg->{'numdocs'};
101	#print STDERR "- num_docs: $self->{'starting_num_docs'}\n";
102	$self->{'starting_num_sections'} = $buildcfg->{'numsections'};
103	#print STDERR "- num_sections: $self->{'starting_num_sections'}\n";
104	$self->{'starting_num_bytes'} = $buildcfg->{'numbytes'};
105	#print STDERR "- num_bytes: $self->{'starting_num_bytes'}\n";
106	}
107	else
108	{
109	#print STDERR "NOT Found_Num_Data!\n";
110	$self->{'starting_num_docs'} = 0;
111	$self->{'starting_num_sections'} = 0;
112	$self->{'starting_num_bytes'} = 0;
113	}
114
115	$self->{'output_handle'} = "STDOUT";
116	$self->{'num_docs'} = $self->{'starting_num_docs'};
117	$self->{'num_sections'} = $self->{'starting_num_sections'};
118	$self->{'num_bytes'} = $self->{'starting_num_bytes'};
119
120	$self->{'num_processed_bytes'} = 0;
121	$self->{'store_text'} = 1;
122
123	# what level (section/document) the database - indexer intersection is
124	$self->{'db_level'} = "section";
125	#used by browse interface
126	$self->{'doclist'} = [];
127
128	$self->{'indexing_text'} = 0;
129
130	return bless $self, $class;
131
132	}
133
134	sub reset {
135	my $self = shift (@_);
136
137	$self->{'num_docs'} = $self->{'starting_num_docs'};
138	$self->{'num_sections'} = $self->{'starting_num_sections'};
139	$self->{'num_bytes'} = $self->{'starting_num_bytes'};
140
141	$self->{'num_processed_bytes'} = 0;
142	}
143
144	sub zero_reset {
145	my $self = shift (@_);
146
147	$self->{'num_docs'} = 0;
148	$self->{'num_sections'} = 0;
149	# reconstructed docs have no text, just metadata, so we need to
150	# remember how many bytes we had initially
151	#$self->{'num_bytes'} = $self->{'starting_num_bytes'};
152	$self->{'num_bytes'} = 0; # we'll store num bytes in db for reconstructed docs.
153	$self->{'num_processed_bytes'} = 0;
154	}
155
156	sub is_incremental_capable
157	{
158	# By default we return 'no' as the answer
159	# Safer to assume non-incremental to start with, and then override in
160	# inherited classes that are.
161
162	return 0;
163	}
164
165	sub get_num_docs {
166	my $self = shift (@_);
167
168	return $self->{'num_docs'};
169	}
170
171	sub get_num_sections {
172	my $self = shift (@_);
173
174	return $self->{'num_sections'};
175	}
176
177	# num_bytes is the actual number of bytes in the collection
178	# this is normally the same as what's processed during text compression
179	sub get_num_bytes {
180	my $self = shift (@_);
181
182	return $self->{'num_bytes'};
183	}
184
185	# num_processed_bytes is the number of bytes actually passed
186	# to mg for the current index
187	sub get_num_processed_bytes {
188	my $self = shift (@_);
189
190	return $self->{'num_processed_bytes'};
191	}
192
193	sub set_output_handle {
194	my $self = shift (@_);
195	my ($handle) = @_;
196
197	$self->{'output_handle'} = $handle;
198	# The output handle isn't always an actual handle. In a couple of the
199	# database drivers (MSSQL and GDBMServer) it's actually a reference
200	# to an object. Thus we need to test the type before setting binmode.
201	# [jmt12]
202	if (ref $handle eq "GLOB")
203	{
204	binmode($handle,":utf8");
205	}
206	}
207
208
209	sub set_mode {
210	my $self = shift (@_);
211	my ($mode) = @_;
212
213	$self->{'mode'} = $mode;
214	}
215
216	sub get_mode {
217	my $self = shift (@_);
218
219	return $self->{'mode'};
220	}
221
222	sub set_assocdir {
223	my $self = shift (@_);
224	my ($assocdir) = @_;
225
226	$self->{'assocdir'} = $assocdir;
227	}
228
229	sub set_dontdb {
230	my $self = shift (@_);
231	my ($dontdb) = @_;
232
233	$self->{'dontdb'} = $dontdb;
234	}
235
236	sub set_infodbtype
237	{
238	my $self = shift(@_);
239	my $infodbtype = shift(@_);
240	$self->{'infodbtype'} = $infodbtype;
241	}
242
243	sub set_index {
244	my $self = shift (@_);
245	my ($index, $indexexparr) = @_;
246
247	$self->{'index'} = $index;
248	$self->{'indexexparr'} = $indexexparr if defined $indexexparr;
249	}
250
251	sub set_index_languages {
252	my $self = shift (@_);
253	my ($lang_meta, $langarr) = @_;
254	$lang_meta =~ s/^ex\.([^.]+)$/$1/; # strip any ex. namespace iff it's the only namespace prefix (will leave ex.dc.* intact)
255
256	$self->{'lang_meta'} = $lang_meta;
257	$self->{'langarr'} = $langarr;
258	}
259
260	sub get_index {
261	my $self = shift (@_);
262
263	return $self->{'index'};
264	}
265
266	sub set_classifiers {
267	my $self = shift (@_);
268	my ($classifiers) = @_;
269
270	$self->{'classifiers'} = $classifiers;
271	}
272
273	sub set_indexing_text {
274	my $self = shift (@_);
275	my ($indexing_text) = @_;
276
277	$self->{'indexing_text'} = $indexing_text;
278	}
279
280	sub get_indexing_text {
281	my $self = shift (@_);
282
283	return $self->{'indexing_text'};
284	}
285
286	sub set_store_text {
287	my $self = shift (@_);
288	my ($store_text) = @_;
289
290	$self->{'store_text'} = $store_text;
291	}
292
293	sub set_store_metadata_coverage {
294	my $self = shift (@_);
295	my ($store_metadata_coverage) = @_;
296
297	$self->{'store_metadata_coverage'} = $store_metadata_coverage \|\| "";
298	}
299
300	sub get_doc_list {
301	my $self = shift(@_);
302
303	return @{$self->{'doclist'}};
304	}
305
306	# the standard database level is section, but you may want to change it to document
307	sub set_db_level {
308	my $self= shift (@_);
309	my ($db_level) = @_;
310
311	$self->{'db_level'} = $db_level;
312	}
313
314	sub set_sections_index_document_metadata {
315	my $self= shift (@_);
316	my ($index_type) = @_;
317
318	$self->{'sections_index_document_metadata'} = $index_type;
319	}
320
321	sub set_separate_cjk {
322	my $self = shift (@_);
323	my ($sep_cjk) = @_;
324
325	$self->{'separate_cjk'} = $sep_cjk;
326	}
327
328	sub process {
329	my $self = shift (@_);
330	my $method = $self->{'mode'};
331
332	$self->$method(@_);
333	}
334
335	# post process text depending on field. Currently don't do anything here
336	# except cjk separation, and only for indexing
337	# should only do this for indexed text (if $self->{'indexing_text'}),
338	# but currently search term highlighting doesn't work if you do that.
339	# once thats fixed up, then fix this.
340	sub filter_text {
341	my $self = shift (@_);
342	my ($field, $text) = @_;
343
344	# lets do cjk seg here
345	my $new_text =$text;
346	if ($self->{'separate_cjk'}) {
347	$new_text = &cnseg::segment($text);
348	}
349	return $new_text;
350	}
351
352
353	sub infodb_metadata_stats
354	{
355	my $self = shift (@_);
356	my ($field,$edit_mode) = @_;
357
358	# Keep some statistics relating to metadata sets used and
359	# frequency of particular metadata fields within each set
360
361	# Union of metadata prefixes and frequency of fields
362	# (both scoped for this document alone, and across whole collection)
363
364	if ($field =~ m/^(.+)\.(.*)$/) {
365	my $prefix = $1;
366	my $core_field = $2;
367
368	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
369	$self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field}++;
370	$self->{'mdprefix_fields'}->{$prefix}->{$core_field}++;
371	}
372	else {
373	# delete
374	$self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field}--;
375	$self->{'mdprefix_fields'}->{$prefix}->{$core_field}--;
376	}
377
378	}
379	elsif ($field =~ m/^[[:upper:]]/) {
380	# implicit 'ex' metadata set
381
382	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
383
384	$self->{'doc_mdprefix_fields'}->{'ex'}->{$field}++;
385	$self->{'mdprefix_fields'}->{'ex'}->{$field}++;
386	}
387	else {
388	# delete
389	$self->{'doc_mdprefix_fields'}->{'ex'}->{$field}--;
390	$self->{'mdprefix_fields'}->{'ex'}->{$field}--;
391	}
392	}
393
394	}
395
396
397	sub infodbedit {
398	my $self = shift (@_);
399	my ($doc_obj, $filename, $edit_mode) = @_;
400
401	# only output this document if it is a "indexed_doc" or "info_doc" (database only) document
402	my $doctype = $doc_obj->get_doc_type();
403	return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");
404
405	my $archivedir = "";
406	if (defined $filename)
407	{
408	# doc_obj derived directly from file
409	my ($dir) = $filename =~ /^(.?)(?:\/\|\\)[^\/\\]$/;
410	$dir = "" unless defined $dir;
411	$dir =~ s/\\/\//g;
412	$dir =~ s/^\/+//;
413	$dir =~ s/\/+$//;
414
415	$archivedir = $dir;
416
417	if ($edit_mode eq "delete") {
418	# record this doc so we don't process the reconstructed doc later
419	$self->{'dont_process_reconstructed'}->{$doc_obj->get_OID()} = 1;
420	# we don't need to do anything else for the info database for a deleted document. The infodb starts from scratch each time, so no deletion is necessary
421	$self->delete_assoc_files ($archivedir, "delete");
422	return;
423	}
424	if ($edit_mode eq "update") {
425	# we don't want to process the reconstructed doc later, but we will process this version now.
426	$self->{'dont_process_reconstructed'}->{$doc_obj->get_OID()} = 1;
427	# delete the old assoc files as they may have changed
428	$self->delete_assoc_files ($archivedir, "update");
429	}
430
431	# resolve the final filenames of the files associated with this document
432	# now save the new assoc files for an update/new doc.
433	$self->assoc_files ($doc_obj, $archivedir);
434	}
435	else
436	{
437	# doc_obj reconstructed from database (has metadata, doc structure but no text)
438	my $top_section = $doc_obj->get_top_section();
439	$archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
440	}
441
442	# rest of code used for add and update. In both cases, we add to the classifiers and to the info database.
443
444	#add this document to the browse structure
445	push(@{$self->{'doclist'}},$doc_obj->get_OID())
446	unless ($doctype eq "classification");
447	$self->{'num_docs'} += 1 unless ($doctype eq "classification");
448
449	if (!defined $filename) {
450	# a reconstructed doc
451	my $num_reconstructed_bytes = $doc_obj->get_metadata_element ($doc_obj->get_top_section (), "total_numbytes");
452	if (defined $num_reconstructed_bytes) {
453	$self->{'num_bytes'} += $num_reconstructed_bytes;
454	}
455	}
456	# classify the document
457	&classify::classify_doc ($self->{'classifiers'}, $doc_obj);
458
459	# now add all the sections to the infodb.
460
461	# is this a paged or a hierarchical document
462	my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
463
464	my $section = $doc_obj->get_top_section ();
465	my $doc_OID = $doc_obj->get_OID();
466	my $first = 1;
467	my $infodb_handle = $self->{'output_handle'};
468
469	$self->{'doc_mdprefix_fields'} = {};
470
471	while (defined $section)
472	{
473	my $section_OID = $doc_OID;
474	if ($section ne "")
475	{
476	$section_OID = $doc_OID . "." . $section;
477	}
478	my %section_infodb = ();
479
480	# update a few statistics
481	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
482	$self->{'num_sections'} += 1 unless ($doctype eq "classification");
483
484	# output the fact that this document is a document (unless doctype
485	# has been set to something else from within a plugin
486	my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
487	if (!defined $dtype \|\| $dtype !~ /\w/) {
488	$section_infodb{"doctype"} = [ "doc" ];
489	}
490
491	if ($first && defined $filename) {
492	# if we are at the top level of the document, and we are not a reconstructed document, set the total_text_length - used to count bytes when we reconstruct later
493	my $length = $doc_obj->get_total_text_length();
494	$section_infodb{"total_numbytes"} = [ $length ];
495	}
496	# Output whether this node contains text
497	#
498	# If doc_obj reconstructed from database file then no need to
499	# explicitly add <hastxt> as this is preserved as metadata when
500	# the database file is loaded in
501	if (defined $filename)
502	{
503	# doc_obj derived directly from file
504	if ($doc_obj->get_text_length($section) > 0) {
505	$section_infodb{"hastxt"} = [ "1" ];
506	} else {
507	$section_infodb{"hastxt"} = [ "0" ];
508	}
509	}
510
511	# output all the section metadata
512	my $metadata = $doc_obj->get_all_metadata ($section);
513	foreach my $pair (@$metadata) {
514	my ($field, $value) = (@$pair);
515
516	if ($field ne "Identifier" && $field !~ /^gsdl/ &&
517	defined $value && $value ne "") {
518
519	# escape problematic stuff
520	$value =~ s/([^\\])\\([^\\])/$1\\\\$2/g;
521	$value =~ s/\n/\\n/g;
522	$value =~ s/\r/\\r/g;
523	# remove any ex. iff it's the only namespace prefix (will leave ex.dc.* intact)
524	$field =~ s/^ex\.([^.]+)$/$1/; # $field =~ s/^ex\.//;
525
526	# special case for UTF8URL metadata
527	if ($field =~ m/^UTF8URL$/i) {
528	&dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle,
529	$value, { 'section' => [ $section_OID ] });
530	}
531
532	if (!defined $self->{'dontdb'}->{$field}) {
533	push(@{$section_infodb{$field}}, $value);
534
535	if ($section eq ""
536	&& (($self->{'store_metadata_coverage'} =~ /^true$/i)
537	\|\| $self->{'store_metadata_coverage'} eq "1"))
538	{
539	$self->infodb_metadata_stats($field,$edit_mode);
540	}
541	}
542	}
543	}
544
545	if ($section eq "")
546	{
547	my $doc_mdprefix_fields = $self->{'doc_mdprefix_fields'};
548
549	foreach my $prefix (keys %$doc_mdprefix_fields)
550	{
551	push(@{$section_infodb{"metadataset"}}, $prefix);
552
553	foreach my $field (keys %{$doc_mdprefix_fields->{$prefix}})
554	{
555	push(@{$section_infodb{"metadatalist-$prefix"}}, $field);
556
557	my $val = $doc_mdprefix_fields->{$prefix}->{$field};
558	push(@{$section_infodb{"metadatafreq-$prefix-$field"}}, $val);
559	}
560	}
561	}
562
563	# If doc_obj reconstructed from database file then no need to
564	# explicitly add <archivedir> as this is preserved as metadata when
565	# the database file is loaded in
566	if (defined $filename)
567	{
568	# output archivedir if at top level
569	if ($section eq $doc_obj->get_top_section()) {
570	$section_infodb{"archivedir"} = [ $archivedir ];
571	}
572	}
573
574	# output document display type
575	if ($first) {
576	$section_infodb{"thistype"} = [ $thistype ];
577	}
578
579	if ($self->{'db_level'} eq "document") {
580	# doc num is num_docs not num_sections
581	# output the matching document number
582	$section_infodb{"docnum"} = [ $self->{'num_docs'} ];
583	}
584	else {
585	# output a list of children
586	my $children = $doc_obj->get_children ($section);
587	if (scalar(@$children) > 0) {
588	$section_infodb{"childtype"} = [ $childtype ];
589	my $contains = "";
590	foreach my $child (@$children)
591	{
592	$contains .= ";" unless ($contains eq "");
593	if ($child =~ /^.*?\.(\d+)$/)
594	{
595	$contains .= "\".$1";
596	}
597	else
598	{
599	$contains .= "\".$child";
600	}
601	}
602	$section_infodb{"contains"} = [ $contains ];
603	}
604	# output the matching doc number
605	$section_infodb{"docnum"} = [ $self->{'num_sections'} ];
606	}
607
608	if(defined $section_infodb{'assocfilepath'})
609	{
610	&dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_infodb{'assocfilepath'}[0], { 'contains' => [ $section_OID ]});
611	}
612	&dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID, \%section_infodb);
613
614	# output a database entry for the document number, unless we are incremental
615	unless ($self->is_incremental_capable())
616	{
617	if ($self->{'db_level'} eq "document") {
618	&dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'}, { 'section' => [ $doc_OID ] });
619	}
620	else {
621	&dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'}, { 'section' => [ $section_OID ] });
622	}
623	}
624
625	$first = 0;
626	$section = $doc_obj->get_next_section($section);
627	last if ($self->{'db_level'} eq "document"); # if no sections wanted, only add the docs
628	} # while defined section
629
630	}
631
632
633
634
635	sub infodb {
636	my $self = shift (@_);
637	my ($doc_obj, $filename) = @_;
638
639	$self->infodbedit($doc_obj,$filename,"add");
640	}
641
642	sub infodbreindex {
643	my $self = shift (@_);
644	my ($doc_obj, $filename) = @_;
645
646	$self->infodbedit($doc_obj,$filename,"update");
647	}
648
649	sub infodbdelete {
650	my $self = shift (@_);
651	my ($doc_obj, $filename) = @_;
652
653	$self->infodbedit($doc_obj,$filename,"delete");
654	}
655
656
657	sub text {
658	my $self = shift (@_);
659	my ($doc_obj) = @_;
660
661	my $handle = $self->{'outhandle'};
662	print $handle "basebuildproc::text function must be implemented in sub classes\n";
663	die "\n";
664	}
665
666	sub textreindex
667	{
668	my $self = shift @_;
669
670	my $outhandle = $self->{'outhandle'};
671	print $outhandle "basebuildproc::textreindex function must be implemented in sub classes\n";
672	if (!$self->is_incremental_capable()) {
673
674	print $outhandle " This operation is only possible with indexing tools with that support\n";
675	print $outhandle " incremental building\n";
676	}
677	die "\n";
678	}
679
680	sub textdelete
681	{
682	my $self = shift @_;
683
684	my $outhandle = $self->{'outhandle'};
685	print $outhandle "basebuildproc::textdelete function must be implemented in sub classes\n";
686	if (!$self->is_incremental_capable()) {
687
688	print $outhandle " This operation is only possible with indexing tools with that support\n";
689	print $outhandle " incremental building\n";
690	}
691	die "\n";
692	}
693
694
695	# should the document be indexed - according to the subcollection and language
696	# specification.
697	sub is_subcollection_doc {
698	my $self = shift (@_);
699	my ($doc_obj) = @_;
700
701	my $indexed_doc = 1;
702	foreach my $indexexp (@{$self->{'indexexparr'}}) {
703	$indexed_doc = 0;
704	my ($field, $exp, $options) = split /\//, $indexexp;
705	if (defined ($field) && defined ($exp)) {
706	my ($bool) = $field =~ /^(.)/;
707	$field =~ s/^.// if $bool eq '!';
708	my @metadata_values;
709	if ($field =~ /^filename$/i) {
710	push(@metadata_values, $doc_obj->get_source_filename());
711	}
712	else {
713	$field =~ s/^ex\.([^.]+)$/$1/; # remove any ex. iff it's the only namespace prefix (will leave ex.dc.* intact)
714	@metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
715	}
716	next unless @metadata_values;
717	foreach my $metadata_value (@metadata_values) {
718	if ($bool eq '!') {
719	if ($options =~ /^i$/i) {
720	if ($metadata_value !~ /$exp/i) {$indexed_doc = 1; last;}
721	} else {
722	if ($metadata_value !~ /$exp/) {$indexed_doc = 1; last;}
723	}
724	} else {
725	if ($options =~ /^i$/i) {
726	if ($metadata_value =~ /$exp/i) {$indexed_doc = 1; last;}
727	} else {
728	if ($metadata_value =~ /$exp/) {$indexed_doc = 1; last;}
729	}
730	}
731	}
732
733	last if ($indexed_doc == 1);
734	}
735	}
736
737	# if this doc is so far in the sub collection, and we have lang info,
738	# now we check the languages to see if it matches
739	if($indexed_doc && defined $self->{'lang_meta'}) {
740	$indexed_doc = 0;
741	my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
742	if (defined $field) {
743	foreach my $lang (@{$self->{'langarr'}}) {
744	my ($bool) = $lang =~ /^(.)/;
745	if ($bool eq '!') {
746	$lang =~ s/^.//;
747	if ($field !~ /$lang/) {
748	$indexed_doc = 1; last;
749	}
750	} else {
751	if ($field =~ /$lang/) {
752	$indexed_doc = 1; last;
753	}
754	}
755	}
756	}
757	}
758	return $indexed_doc;
759
760	}
761
762	# use 'Paged' if document has no more than 2 levels
763	# and each section at second level has a number for
764	# Title metadata
765	# also use Paged if gsdlthistype metadata is set to Paged
766	sub get_document_type {
767	my $self = shift (@_);
768	my ($doc_obj) = @_;
769
770	my $thistype = "VList";
771	my $childtype = "VList";
772	my $title;
773	my @tmp = ();
774
775	my $section = $doc_obj->get_top_section ();
776
777	my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
778	if (defined $gsdlthistype) {
779	if ($gsdlthistype =~ /^paged$/i) {
780	$childtype = "Paged";
781	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
782	$thistype = "Paged";
783	} else {
784	$thistype = "Invisible";
785	}
786
787	return ($thistype, $childtype);
788	}
789	# gs3 pagedhierarchy option
790	elsif ($gsdlthistype =~ /^pagedhierarchy$/i) {
791	$childtype = "PagedHierarchy";
792	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
793	$thistype = "PagedHierarchy";
794	} else {
795	$thistype = "Invisible";
796	}
797
798	return ($thistype, $childtype);
799	} elsif ($gsdlthistype =~ /^hierarchy$/i) {
800	return ($thistype, $childtype); # use VList, VList
801	}
802	}
803	my $first = 1;
804	while (defined $section) {
805	@tmp = split /\./, $section;
806	if (scalar(@tmp) > 1) {
807	return ($thistype, $childtype);
808	}
809	if (!$first) {
810	$title = $doc_obj->get_metadata_element ($section, "Title");
811	if (!defined $title \|\| $title !~ /^\d+$/) {
812	return ($thistype, $childtype);
813	}
814	}
815	$first = 0;
816	$section = $doc_obj->get_next_section($section);
817	}
818	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
819	$thistype = "Paged";
820	} else {
821	$thistype = "Invisible";
822	}
823	$childtype = "Paged";
824	return ($thistype, $childtype);
825	}
826
827	sub assoc_files
828	{
829	my $self = shift (@_);
830	my ($doc_obj, $archivedir) = @_;
831	my ($afile);
832
833	foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
834	#rint STDERR "Processing associated file - copy " . $assoc_file->[0] . " to " . $assoc_file->[1] . "\n";
835	# if assoc file starts with a slash, we put it relative to the assoc
836	# dir, otherwise it is relative to the HASH... directory
837	if ($assoc_file->[1] =~ m@^[/\\]@) {
838	$afile = &FileUtils::filenameConcatenate($self->{'assocdir'}, $assoc_file->[1]);
839	} else {
840	$afile = &FileUtils::filenameConcatenate($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
841	}
842
843	&FileUtils::hardLink($assoc_file->[0], $afile, $self->{'verbosity'});
844	}
845	}
846
847	sub delete_assoc_files
848	{
849	my $self = shift (@_);
850	my ($archivedir, $edit_mode) = @_;
851
852	my $assoc_dir = &FileUtils::filenameConcatenate($self->{'assocdir'}, $archivedir);
853	if (-d $assoc_dir) {
854	&FileUtils::removeFilesRecursive($assoc_dir);
855	}
856	}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: