Context Navigation

source: main/trunk/greenstone2/perllib/basebuildproc.pm@ 22296

Last change on this file since 22296 was 22296, checked in by kjdon, 14 years ago
remove ex. from any metadata before writing it to gdbm file. EmbeddedMetadataPlugin now saves its metadata as ex.
Property svn:keywords set to `Author Date Id Revision`
File size: 22.2 KB

Line
1	##########################################################################
2	#
3	# basebuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document for indexing (should be
27	# implemented by subclass) and storing in the database
28
29	package basebuildproc;
30
31	eval {require bytes};
32
33	use classify;
34	use dbutil;
35	use doc;
36	use docproc;
37	use strict;
38	no strict 'subs';
39	no strict 'refs';
40	use util;
41
42	BEGIN {
43	@basebuildproc::ISA = ('docproc');
44	}
45
46	sub new()
47	{
48	my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;
49	my $self = new docproc ();
50
51	# outhandle is where all the debugging info goes
52	# output_handle is where the output of the plugins is piped
53	# to (i.e. mg, database etc.)
54	$outhandle = STDERR unless defined $outhandle;
55
56	$self->{'collection'} = $collection;
57	$self->{'source_dir'} = $source_dir;
58	$self->{'build_dir'} = $build_dir;
59	$self->{'keepold'} = $keepold;
60	$self->{'verbosity'} = $verbosity;
61	$self->{'outhandle'} = $outhandle;
62
63	$self->{'classifiers'} = [];
64	$self->{'mode'} = "text";
65	$self->{'assocdir'} = $build_dir;
66	$self->{'dontdb'} = {};
67	$self->{'store_metadata_coverage'} = "false";
68
69	$self->{'index'} = "section:text";
70	$self->{'indexexparr'} = [];
71
72	$self->{'separate_cjk'} = 0;
73
74	my $found_num_data = 0;
75	my $buildconfigfile = undef;
76
77	if ($keepold) {
78	# For incremental building need to seed num_docs etc from values
79	# stored in build.cfg (if present)
80	$buildconfigfile = &util::filename_cat($build_dir, "build.cfg");
81	if (-e $buildconfigfile) {
82	$found_num_data = 1;
83	}
84	else {
85	# try the index dir
86	$buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
87	"index", "build.cfg");
88	if (-e $buildconfigfile) {
89	$found_num_data = 1;
90	}
91	}
92
93	}
94
95	if ($found_num_data)
96	{
97	#print STDERR "Found_Num_Data!\n";
98	my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
99	$self->{'starting_num_docs'} = $buildcfg->{'numdocs'};
100	#print STDERR "- num_docs: $self->{'starting_num_docs'}\n";
101	$self->{'starting_num_sections'} = $buildcfg->{'numsections'};
102	#print STDERR "- num_sections: $self->{'starting_num_sections'}\n";
103	$self->{'starting_num_bytes'} = $buildcfg->{'numbytes'};
104	#print STDERR "- num_bytes: $self->{'starting_num_bytes'}\n";
105	}
106	else
107	{
108	#print STDERR "NOT Found_Num_Data!\n";
109	$self->{'starting_num_docs'} = 0;
110	$self->{'starting_num_sections'} = 0;
111	$self->{'starting_num_bytes'} = 0;
112	}
113
114	$self->{'output_handle'} = "STDOUT";
115	$self->{'num_docs'} = $self->{'starting_num_docs'};
116	$self->{'num_sections'} = $self->{'starting_num_sections'};
117	$self->{'num_bytes'} = $self->{'starting_num_bytes'};
118
119	$self->{'num_processed_bytes'} = 0;
120	$self->{'store_text'} = 1;
121
122	# what level (section/document) the database - indexer intersection is
123	$self->{'db_level'} = "section";
124	#used by browse interface
125	$self->{'doclist'} = [];
126
127	$self->{'indexing_text'} = 0;
128
129	return bless $self, $class;
130
131	}
132
133	sub reset {
134	my $self = shift (@_);
135
136	$self->{'num_docs'} = $self->{'starting_num_docs'};
137	$self->{'num_sections'} = $self->{'starting_num_sections'};
138	$self->{'num_bytes'} = $self->{'starting_num_bytes'};
139
140	$self->{'num_processed_bytes'} = 0;
141	}
142
143	sub zero_reset {
144	my $self = shift (@_);
145
146	$self->{'num_docs'} = 0;
147	$self->{'num_sections'} = 0;
148	# reconstructed docs have no text, just metadata, so we need to
149	# remember how many bytes we had initially
150	$self->{'num_bytes'} = $self->{'starting_num_bytes'};
151
152	$self->{'num_processed_bytes'} = 0;
153	}
154
155	sub is_incremental_capable
156	{
157	# By default we return 'no' as the answer
158	# Safer to assume non-incremental to start with, and then override in
159	# inherited classes that are.
160
161	return 0;
162	}
163
164	sub get_num_docs {
165	my $self = shift (@_);
166
167	return $self->{'num_docs'};
168	}
169
170	sub get_num_sections {
171	my $self = shift (@_);
172
173	return $self->{'num_sections'};
174	}
175
176	# num_bytes is the actual number of bytes in the collection
177	# this is normally the same as what's processed during text compression
178	sub get_num_bytes {
179	my $self = shift (@_);
180
181	return $self->{'num_bytes'};
182	}
183
184	# num_processed_bytes is the number of bytes actually passed
185	# to mg for the current index
186	sub get_num_processed_bytes {
187	my $self = shift (@_);
188
189	return $self->{'num_processed_bytes'};
190	}
191
192	sub set_output_handle {
193	my $self = shift (@_);
194	my ($handle) = @_;
195
196	$self->{'output_handle'} = $handle;
197	}
198
199
200	sub set_mode {
201	my $self = shift (@_);
202	my ($mode) = @_;
203
204	$self->{'mode'} = $mode;
205	}
206
207	sub get_mode {
208	my $self = shift (@_);
209
210	return $self->{'mode'};
211	}
212
213	sub set_assocdir {
214	my $self = shift (@_);
215	my ($assocdir) = @_;
216
217	$self->{'assocdir'} = $assocdir;
218	}
219
220	sub set_dontdb {
221	my $self = shift (@_);
222	my ($dontdb) = @_;
223
224	$self->{'dontdb'} = $dontdb;
225	}
226
227	sub set_infodbtype
228	{
229	my $self = shift(@_);
230	my $infodbtype = shift(@_);
231	$self->{'infodbtype'} = $infodbtype;
232	}
233
234	sub set_index {
235	my $self = shift (@_);
236	my ($index, $indexexparr) = @_;
237
238	$self->{'index'} = $index;
239	$self->{'indexexparr'} = $indexexparr if defined $indexexparr;
240	}
241
242	sub set_index_languages {
243	my $self = shift (@_);
244	my ($lang_meta, $langarr) = @_;
245	$lang_meta =~ s/^ex\.//; # strip ex. if there
246	$self->{'lang_meta'} = $lang_meta;
247	$self->{'langarr'} = $langarr;
248	}
249
250	sub get_index {
251	my $self = shift (@_);
252
253	return $self->{'index'};
254	}
255
256	sub set_classifiers {
257	my $self = shift (@_);
258	my ($classifiers) = @_;
259
260	$self->{'classifiers'} = $classifiers;
261	}
262
263	sub set_indexing_text {
264	my $self = shift (@_);
265	my ($indexing_text) = @_;
266
267	$self->{'indexing_text'} = $indexing_text;
268	}
269
270	sub get_indexing_text {
271	my $self = shift (@_);
272
273	return $self->{'indexing_text'};
274	}
275
276	sub set_store_text {
277	my $self = shift (@_);
278	my ($store_text) = @_;
279
280	$self->{'store_text'} = $store_text;
281	}
282
283	sub set_store_metadata_coverage {
284	my $self = shift (@_);
285	my ($store_metadata_coverage) = @_;
286
287	$self->{'store_metadata_coverage'} = $store_metadata_coverage \|\| "";
288	}
289
290	sub get_doc_list {
291	my $self = shift(@_);
292
293	return @{$self->{'doclist'}};
294	}
295
296	# the standard database level is section, but you may want to change it to document
297	sub set_db_level {
298	my $self= shift (@_);
299	my ($db_level) = @_;
300
301	$self->{'db_level'} = $db_level;
302	}
303
304	sub set_sections_index_document_metadata {
305	my $self= shift (@_);
306	my ($index_type) = @_;
307
308	$self->{'sections_index_document_metadata'} = $index_type;
309	}
310
311	sub set_separate_cjk {
312	my $self = shift (@_);
313	my ($sep_cjk) = @_;
314
315	$self->{'separate_cjk'} = $sep_cjk;
316	}
317
318	sub process {
319	my $self = shift (@_);
320	my $method = $self->{'mode'};
321
322	$self->$method(@_);
323	}
324
325	# post process text depending on field. Currently don't do anything here
326	# except cjk separation, and only for indexing
327	# should only do this for indexed text (if $self->{'indexing_text'}),
328	# but currently search term highlighting doesn't work if you do that.
329	# once thats fixed up, then fix this.
330	sub filter_text {
331	my $self = shift (@_);
332	my ($field, $text) = @_;
333
334	# lets do cjk seg here
335	my $new_text =$text;
336	if ($self->{'separate_cjk'}) {
337	$new_text = &cnseg::segment($text);
338	}
339	return $new_text;
340	}
341
342
343	sub infodb_metadata_stats
344	{
345	my $self = shift (@_);
346	my ($field,$edit_mode) = @_;
347
348	# Keep some statistics relating to metadata sets used and
349	# frequency of particular metadata fields within each set
350
351	# Union of metadata prefixes and frequency of fields
352	# (both scoped for this document alone, and across whole collection)
353
354	if ($field =~ m/^(.+)\.(.*)$/) {
355	my $prefix = $1;
356	my $core_field = $2;
357
358	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
359	$self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field}++;
360	$self->{'mdprefix_fields'}->{$prefix}->{$core_field}++;
361	}
362	else {
363	# delete
364	$self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field}--;
365	$self->{'mdprefix_fields'}->{$prefix}->{$core_field}--;
366	}
367
368	}
369	elsif ($field =~ m/^[[:upper:]]/) {
370	# implicit 'ex' metadata set
371
372	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
373
374	$self->{'doc_mdprefix_fields'}->{'ex'}->{$field}++;
375	$self->{'mdprefix_fields'}->{'ex'}->{$field}++;
376	}
377	else {
378	# delete
379	$self->{'doc_mdprefix_fields'}->{'ex'}->{$field}--;
380	$self->{'mdprefix_fields'}->{'ex'}->{$field}--;
381	}
382	}
383
384	}
385
386
387	sub infodbedit {
388	my $self = shift (@_);
389	my ($doc_obj, $filename, $edit_mode) = @_;
390
391	# only output this document if it is a "indexed_doc" or "info_doc" (database only) document
392	my $doctype = $doc_obj->get_doc_type();
393	return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");
394
395	my $archivedir = "";
396	if (defined $filename)
397	{
398	# doc_obj derived directly from file
399	my ($dir) = $filename =~ /^(.?)(?:\/\|\\)[^\/\\]$/;
400	$dir = "" unless defined $dir;
401	$dir =~ s/\\/\//g;
402	$dir =~ s/^\/+//;
403	$dir =~ s/\/+$//;
404
405	$archivedir = $dir;
406
407	# resolve the final filenames of the files associated with this document
408	$self->assoc_files ($doc_obj, $archivedir);
409	}
410	else
411	{
412	# doc_obj reconstructed from database (has metadata, doc structure but no text)
413	my $top_section = $doc_obj->get_top_section();
414	$archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
415	}
416
417	if (($edit_mode eq "add")) {# \|\| ($edit_mode eq "update")) {
418	#add this document to the browse structure
419	push(@{$self->{'doclist'}},$doc_obj->get_OID())
420	unless ($doctype eq "classification");
421	}
422	# elsif ($edit_mode eq "update") {
423	# don't do anything. this assumes it will keep the same id.
424	# }
425	elsif ($edit_mode eq "delete") {
426	# delete => remove this doc from browse structure
427	my $del_doc_oid = $doc_obj->get_OID();
428
429	my @filtered_doc_list = ();
430	foreach my $oid (@{$self->{'doclist'}}) {
431	push(@filtered_doc_list,$oid) if ($oid ne $del_doc_oid);
432	}
433	$self->{'doclist'} = \@filtered_doc_list;
434	}
435
436
437	# classify this document
438	&classify::classify_doc ($self->{'classifiers'}, $doc_obj, $edit_mode);
439
440	if (($edit_mode eq "add")) {# \|\| ($edit_mode eq "update")) {
441	# this is another document
442	$self->{'num_docs'} += 1 unless ($doctype eq "classification");
443	}
444	elsif ($edit_mode eq "delete") {
445	# delete
446	$self->{'num_docs'} -= 1 unless ($doctype eq "classification");
447	}
448
449	# is this a paged or a hierarchical document
450	my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
451
452	my $section = $doc_obj->get_top_section ();
453	my $doc_OID = $doc_obj->get_OID();
454	my $first = 1;
455	my $infodb_handle = $self->{'output_handle'};
456
457	$self->{'doc_mdprefix_fields'} = {};
458
459	while (defined $section)
460	{
461	my $section_OID = $doc_OID;
462	if ($section ne "")
463	{
464	$section_OID = $doc_OID . "." . $section;
465	}
466	my %section_infodb = ();
467
468	# update a few statistics - how do we do update????
469	if (($edit_mode eq "add")) {# \|\| ($edit_mode eq "update")) {
470
471	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
472	$self->{'num_sections'} += 1 unless ($doctype eq "classification");
473	}
474	elsif ($edit_mode eq "delete") {
475	# delete
476	$self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
477	$self->{'num_sections'} -= 1 unless ($doctype eq "classification");
478	}
479
480	# output the fact that this document is a document (unless doctype
481	# has been set to something else from within a plugin
482	my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
483	if (!defined $dtype \|\| $dtype !~ /\w/) {
484	$section_infodb{"doctype"} = [ "doc" ];
485	}
486
487	# Output whether this node contains text
488	#
489	# If doc_obj reconstructed from database file then no need to
490	# explicitly add <hastxt> as this is preserved as metadata when
491	# the database file is loaded in
492	if (defined $filename)
493	{
494	# doc_obj derived directly from file
495	if ($doc_obj->get_text_length($section) > 0) {
496	$section_infodb{"hastxt"} = [ "1" ];
497	} else {
498	$section_infodb{"hastxt"} = [ "0" ];
499	}
500	}
501
502	# output all the section metadata
503	my $metadata = $doc_obj->get_all_metadata ($section);
504	foreach my $pair (@$metadata) {
505	my ($field, $value) = (@$pair);
506
507	if ($field ne "Identifier" && $field !~ /^gsdl/ &&
508	defined $value && $value ne "") {
509
510	# escape problematic stuff
511	$value =~ s/\\/\\\\/g;
512	$value =~ s/\n/\\n/g;
513	$value =~ s/\r/\\r/g;
514	# remove ex. if there
515	$field =~ s/^ex\.//;
516
517	# special case for URL metadata
518	if ($field =~ /^URL$/i) {
519	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
520
521	&dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value, { 'section' => [ $section_OID ] });
522	}
523	else {
524	# delete
525	&dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value);
526	}
527
528
529	}
530
531	if (!defined $self->{'dontdb'}->{$field}) {
532	push(@{$section_infodb{$field}}, $value);
533
534	if ($section eq "" && $self->{'store_metadata_coverage'} =~ /^true$/i)
535	{
536	$self->infodb_metadata_stats($field,$edit_mode);
537	}
538	}
539	}
540	}
541
542	if ($section eq "")
543	{
544	my $doc_mdprefix_fields = $self->{'doc_mdprefix_fields'};
545
546	foreach my $prefix (keys %$doc_mdprefix_fields)
547	{
548	push(@{$section_infodb{"metadataset"}}, $prefix);
549
550	foreach my $field (keys %{$doc_mdprefix_fields->{$prefix}})
551	{
552	push(@{$section_infodb{"metadatalist-$prefix"}}, $field);
553
554	my $val = $doc_mdprefix_fields->{$prefix}->{$field};
555	push(@{$section_infodb{"metadatafreq-$prefix-$field"}}, $val);
556	}
557	}
558	}
559
560	# If doc_obj reconstructed from database file then no need to
561	# explicitly add <archivedir> as this is preserved as metadata when
562	# the database file is loaded in
563	if (defined $filename)
564	{
565	# output archivedir if at top level
566	if ($section eq $doc_obj->get_top_section()) {
567	$section_infodb{"archivedir"} = [ $archivedir ];
568	}
569	}
570
571	# output document display type
572	if ($first) {
573	$section_infodb{"thistype"} = [ $thistype ];
574	}
575
576	if ($self->{'db_level'} eq "document") {
577	# doc num is num_docs not num_sections
578	# output the matching document number
579	$section_infodb{"docnum"} = [ $self->{'num_docs'} ];
580	}
581	else {
582	# output a list of children
583	my $children = $doc_obj->get_children ($section);
584	if (scalar(@$children) > 0) {
585	$section_infodb{"childtype"} = [ $childtype ];
586	my $contains = "";
587	foreach my $child (@$children)
588	{
589	$contains .= ";" unless ($contains eq "");
590	if ($child =~ /^.*?\.(\d+)$/)
591	{
592	$contains .= "\".$1";
593	}
594	else
595	{
596	$contains .= "\".$child";
597	}
598	}
599	$section_infodb{"contains"} = [ $contains ];
600	}
601	# output the matching doc number
602	$section_infodb{"docnum"} = [ $self->{'num_sections'} ];
603	}
604
605	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
606	# in case of update, this will overwrite old entry??
607	&dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID, \%section_infodb);
608	}
609	else {
610	# delete
611	&dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID);
612	}
613
614
615	# output a database entry for the document number, except for Lucene (which no longer needs this information)
616	unless (ref($self) eq "lucenebuildproc")
617	{
618	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
619
620	if ($self->{'db_level'} eq "document") {
621	&dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'}, { 'section' => [ $doc_OID ] });
622	}
623	else {
624	&dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'}, { 'section' => [ $section_OID ] });
625	}
626	}
627	else {
628
629	if ($self->{'db_level'} eq "document") {
630	&dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'});
631	}
632	else {
633	&dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'});
634	}
635
636	}
637	}
638
639	$first = 0;
640	$section = $doc_obj->get_next_section($section);
641	last if ($self->{'db_level'} eq "document"); # if no sections wanted, only add the docs
642	}
643	}
644
645
646
647
648	sub infodb {
649	my $self = shift (@_);
650	my ($doc_obj, $filename) = @_;
651
652	$self->infodbedit($doc_obj,$filename,"add");
653	}
654
655	sub infodbreindex {
656	my $self = shift (@_);
657	my ($doc_obj, $filename) = @_;
658
659	$self->infodbedit($doc_obj,$filename,"update");
660	}
661
662	sub infodbdelete {
663	my $self = shift (@_);
664	my ($doc_obj, $filename) = @_;
665
666	$self->infodbedit($doc_obj,$filename,"delete");
667	}
668
669
670	sub text {
671	my $self = shift (@_);
672	my ($doc_obj) = @_;
673
674	my $handle = $self->{'outhandle'};
675	print $handle "basebuildproc::text function must be implemented in sub classes\n";
676	die "\n";
677	}
678
679	sub textreindex
680	{
681	my $self = shift @_;
682
683	my $outhandle = $self->{'outhandle'};
684	print $outhandle "basebuildproc::textreindex function must be implemented in sub classes\n";
685	if (!$self->is_incremental_capable()) {
686
687	print $outhandle " This operation is only possible with indexing tools with that support\n";
688	print $outhandle " incremental building\n";
689	}
690	die "\n";
691	}
692
693	sub textdelete
694	{
695	my $self = shift @_;
696
697	my $outhandle = $self->{'outhandle'};
698	print $outhandle "basebuildproc::textdelete function must be implemented in sub classes\n";
699	if (!$self->is_incremental_capable()) {
700
701	print $outhandle " This operation is only possible with indexing tools with that support\n";
702	print $outhandle " incremental building\n";
703	}
704	die "\n";
705	}
706
707
708	# should the document be indexed - according to the subcollection and language
709	# specification.
710	sub is_subcollection_doc {
711	my $self = shift (@_);
712	my ($doc_obj) = @_;
713
714	my $indexed_doc = 1;
715	foreach my $indexexp (@{$self->{'indexexparr'}}) {
716	$indexed_doc = 0;
717	my ($field, $exp, $options) = split /\//, $indexexp;
718	if (defined ($field) && defined ($exp)) {
719	my ($bool) = $field =~ /^(.)/;
720	$field =~ s/^.// if $bool eq '!';
721	my @metadata_values;
722	if ($field =~ /^filename$/i) {
723	push(@metadata_values, $doc_obj->get_source_filename());
724	}
725	else {
726	$field =~ s/^ex\.//; #strip ex. if present
727	@metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
728	}
729	next unless @metadata_values;
730	foreach my $metadata_value (@metadata_values) {
731	if ($bool eq '!') {
732	if ($options =~ /^i$/i) {
733	if ($metadata_value !~ /$exp/i) {$indexed_doc = 1; last;}
734	} else {
735	if ($metadata_value !~ /$exp/) {$indexed_doc = 1; last;}
736	}
737	} else {
738	if ($options =~ /^i$/i) {
739	if ($metadata_value =~ /$exp/i) {$indexed_doc = 1; last;}
740	} else {
741	if ($metadata_value =~ /$exp/) {$indexed_doc = 1; last;}
742	}
743	}
744	}
745
746	last if ($indexed_doc == 1);
747	}
748	}
749
750	# if this doc is so far in the sub collection, and we have lang info,
751	# now we check the languages to see if it matches
752	if($indexed_doc && defined $self->{'lang_meta'}) {
753	$indexed_doc = 0;
754	my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
755	if (defined $field) {
756	foreach my $lang (@{$self->{'langarr'}}) {
757	my ($bool) = $lang =~ /^(.)/;
758	if ($bool eq '!') {
759	$lang =~ s/^.//;
760	if ($field !~ /$lang/) {
761	$indexed_doc = 1; last;
762	}
763	} else {
764	if ($field =~ /$lang/) {
765	$indexed_doc = 1; last;
766	}
767	}
768	}
769	}
770	}
771	return $indexed_doc;
772
773	}
774
775	# use 'Paged' if document has no more than 2 levels
776	# and each section at second level has a number for
777	# Title metadata
778	# also use Paged if gsdlthistype metadata is set to Paged
779	sub get_document_type {
780	my $self = shift (@_);
781	my ($doc_obj) = @_;
782
783	my $thistype = "VList";
784	my $childtype = "VList";
785	my $title;
786	my @tmp = ();
787
788	my $section = $doc_obj->get_top_section ();
789
790	my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
791	if (defined $gsdlthistype) {
792	if ($gsdlthistype eq "Paged") {
793	$childtype = "Paged";
794	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
795	$thistype = "Paged";
796	} else {
797	$thistype = "Invisible";
798	}
799
800	return ($thistype, $childtype);
801	} elsif ($gsdlthistype eq "Hierarchy") {
802	return ($thistype, $childtype); # use VList, VList
803	}
804	}
805	my $first = 1;
806	while (defined $section) {
807	@tmp = split /\./, $section;
808	if (scalar(@tmp) > 1) {
809	return ($thistype, $childtype);
810	}
811	if (!$first) {
812	$title = $doc_obj->get_metadata_element ($section, "Title");
813	if (!defined $title \|\| $title !~ /^\d+$/) {
814	return ($thistype, $childtype);
815	}
816	}
817	$first = 0;
818	$section = $doc_obj->get_next_section($section);
819	}
820	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
821	$thistype = "Paged";
822	} else {
823	$thistype = "Invisible";
824	}
825	$childtype = "Paged";
826	return ($thistype, $childtype);
827	}
828
829	sub assoc_files
830	{
831	my $self = shift (@_);
832	my ($doc_obj, $archivedir) = @_;
833	my ($afile);
834
835	foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
836	#rint STDERR "Processing associated file - copy " . $assoc_file->[0] . " to " . $assoc_file->[1] . "\n";
837	# if assoc file starts with a slash, we put it relative to the assoc
838	# dir, otherwise it is relative to the HASH... directory
839	if ($assoc_file->[1] =~ m@^[/\\]@) {
840	$afile = &util::filename_cat($self->{'assocdir'}, $assoc_file->[1]);
841	} else {
842	$afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
843	}
844	&util::hard_link ($assoc_file->[0], $afile, $self->{'verbosity'});
845	}
846	}
847

Note: See TracBrowser for help on using the repository browser.

Download in other formats: