Context Navigation

source: main/trunk/greenstone2/perllib/basebuildproc.pm@ 22843

Last change on this file since 22843 was 22843, checked in by davidb, 14 years ago
More explicit use of utf8 for input and output file handling. Relies on strings in Perl being Unicode aware (and not merely binary bytes) otherwise binary bytes will then be incorrectly re-incoded as UTF-8 (which is not what you want as they already are in UTF-8 form).
Property svn:keywords set to `Author Date Id Revision`
File size: 22.2 KB

Line
1	##########################################################################
2	#
3	# basebuildproc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# This document processor outputs a document for indexing (should be
27	# implemented by subclass) and storing in the database
28
29	package basebuildproc;
30
31	eval {require bytes};
32
33	use classify;
34	use dbutil;
35	use doc;
36	use docproc;
37	use strict;
38	no strict 'subs';
39	no strict 'refs';
40	use util;
41
42	BEGIN {
43	@basebuildproc::ISA = ('docproc');
44	}
45
46	sub new()
47	{
48	my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;
49	my $self = new docproc ();
50
51	# outhandle is where all the debugging info goes
52	# output_handle is where the output of the plugins is piped
53	# to (i.e. mg, database etc.)
54	$outhandle = STDERR unless defined $outhandle;
55
56	$self->{'collection'} = $collection;
57	$self->{'source_dir'} = $source_dir;
58	$self->{'build_dir'} = $build_dir;
59	$self->{'keepold'} = $keepold;
60	$self->{'verbosity'} = $verbosity;
61	$self->{'outhandle'} = $outhandle;
62
63	$self->{'classifiers'} = [];
64	$self->{'mode'} = "text";
65	$self->{'assocdir'} = $build_dir;
66	$self->{'dontdb'} = {};
67	$self->{'store_metadata_coverage'} = "false";
68
69	$self->{'index'} = "section:text";
70	$self->{'indexexparr'} = [];
71
72	$self->{'separate_cjk'} = 0;
73
74	my $found_num_data = 0;
75	my $buildconfigfile = undef;
76
77	if ($keepold) {
78	# For incremental building need to seed num_docs etc from values
79	# stored in build.cfg (if present)
80	$buildconfigfile = &util::filename_cat($build_dir, "build.cfg");
81	if (-e $buildconfigfile) {
82	$found_num_data = 1;
83	}
84	else {
85	# try the index dir
86	$buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
87	"index", "build.cfg");
88	if (-e $buildconfigfile) {
89	$found_num_data = 1;
90	}
91	}
92
93	}
94
95	if ($found_num_data)
96	{
97	#print STDERR "Found_Num_Data!\n";
98	my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
99	$self->{'starting_num_docs'} = $buildcfg->{'numdocs'};
100	#print STDERR "- num_docs: $self->{'starting_num_docs'}\n";
101	$self->{'starting_num_sections'} = $buildcfg->{'numsections'};
102	#print STDERR "- num_sections: $self->{'starting_num_sections'}\n";
103	$self->{'starting_num_bytes'} = $buildcfg->{'numbytes'};
104	#print STDERR "- num_bytes: $self->{'starting_num_bytes'}\n";
105	}
106	else
107	{
108	#print STDERR "NOT Found_Num_Data!\n";
109	$self->{'starting_num_docs'} = 0;
110	$self->{'starting_num_sections'} = 0;
111	$self->{'starting_num_bytes'} = 0;
112	}
113
114	$self->{'output_handle'} = "STDOUT";
115	$self->{'num_docs'} = $self->{'starting_num_docs'};
116	$self->{'num_sections'} = $self->{'starting_num_sections'};
117	$self->{'num_bytes'} = $self->{'starting_num_bytes'};
118
119	$self->{'num_processed_bytes'} = 0;
120	$self->{'store_text'} = 1;
121
122	# what level (section/document) the database - indexer intersection is
123	$self->{'db_level'} = "section";
124	#used by browse interface
125	$self->{'doclist'} = [];
126
127	$self->{'indexing_text'} = 0;
128
129	return bless $self, $class;
130
131	}
132
133	sub reset {
134	my $self = shift (@_);
135
136	$self->{'num_docs'} = $self->{'starting_num_docs'};
137	$self->{'num_sections'} = $self->{'starting_num_sections'};
138	$self->{'num_bytes'} = $self->{'starting_num_bytes'};
139
140	$self->{'num_processed_bytes'} = 0;
141	}
142
143	sub zero_reset {
144	my $self = shift (@_);
145
146	$self->{'num_docs'} = 0;
147	$self->{'num_sections'} = 0;
148	# reconstructed docs have no text, just metadata, so we need to
149	# remember how many bytes we had initially
150	$self->{'num_bytes'} = $self->{'starting_num_bytes'};
151
152	$self->{'num_processed_bytes'} = 0;
153	}
154
155	sub is_incremental_capable
156	{
157	# By default we return 'no' as the answer
158	# Safer to assume non-incremental to start with, and then override in
159	# inherited classes that are.
160
161	return 0;
162	}
163
164	sub get_num_docs {
165	my $self = shift (@_);
166
167	return $self->{'num_docs'};
168	}
169
170	sub get_num_sections {
171	my $self = shift (@_);
172
173	return $self->{'num_sections'};
174	}
175
176	# num_bytes is the actual number of bytes in the collection
177	# this is normally the same as what's processed during text compression
178	sub get_num_bytes {
179	my $self = shift (@_);
180
181	return $self->{'num_bytes'};
182	}
183
184	# num_processed_bytes is the number of bytes actually passed
185	# to mg for the current index
186	sub get_num_processed_bytes {
187	my $self = shift (@_);
188
189	return $self->{'num_processed_bytes'};
190	}
191
192	sub set_output_handle {
193	my $self = shift (@_);
194	my ($handle) = @_;
195
196	$self->{'output_handle'} = $handle;
197	binmode($handle,":utf8");
198	}
199
200
201	sub set_mode {
202	my $self = shift (@_);
203	my ($mode) = @_;
204
205	$self->{'mode'} = $mode;
206	}
207
208	sub get_mode {
209	my $self = shift (@_);
210
211	return $self->{'mode'};
212	}
213
214	sub set_assocdir {
215	my $self = shift (@_);
216	my ($assocdir) = @_;
217
218	$self->{'assocdir'} = $assocdir;
219	}
220
221	sub set_dontdb {
222	my $self = shift (@_);
223	my ($dontdb) = @_;
224
225	$self->{'dontdb'} = $dontdb;
226	}
227
228	sub set_infodbtype
229	{
230	my $self = shift(@_);
231	my $infodbtype = shift(@_);
232	$self->{'infodbtype'} = $infodbtype;
233	}
234
235	sub set_index {
236	my $self = shift (@_);
237	my ($index, $indexexparr) = @_;
238
239	$self->{'index'} = $index;
240	$self->{'indexexparr'} = $indexexparr if defined $indexexparr;
241	}
242
243	sub set_index_languages {
244	my $self = shift (@_);
245	my ($lang_meta, $langarr) = @_;
246	$lang_meta =~ s/^ex\.//; # strip ex. if there
247	$self->{'lang_meta'} = $lang_meta;
248	$self->{'langarr'} = $langarr;
249	}
250
251	sub get_index {
252	my $self = shift (@_);
253
254	return $self->{'index'};
255	}
256
257	sub set_classifiers {
258	my $self = shift (@_);
259	my ($classifiers) = @_;
260
261	$self->{'classifiers'} = $classifiers;
262	}
263
264	sub set_indexing_text {
265	my $self = shift (@_);
266	my ($indexing_text) = @_;
267
268	$self->{'indexing_text'} = $indexing_text;
269	}
270
271	sub get_indexing_text {
272	my $self = shift (@_);
273
274	return $self->{'indexing_text'};
275	}
276
277	sub set_store_text {
278	my $self = shift (@_);
279	my ($store_text) = @_;
280
281	$self->{'store_text'} = $store_text;
282	}
283
284	sub set_store_metadata_coverage {
285	my $self = shift (@_);
286	my ($store_metadata_coverage) = @_;
287
288	$self->{'store_metadata_coverage'} = $store_metadata_coverage \|\| "";
289	}
290
291	sub get_doc_list {
292	my $self = shift(@_);
293
294	return @{$self->{'doclist'}};
295	}
296
297	# the standard database level is section, but you may want to change it to document
298	sub set_db_level {
299	my $self= shift (@_);
300	my ($db_level) = @_;
301
302	$self->{'db_level'} = $db_level;
303	}
304
305	sub set_sections_index_document_metadata {
306	my $self= shift (@_);
307	my ($index_type) = @_;
308
309	$self->{'sections_index_document_metadata'} = $index_type;
310	}
311
312	sub set_separate_cjk {
313	my $self = shift (@_);
314	my ($sep_cjk) = @_;
315
316	$self->{'separate_cjk'} = $sep_cjk;
317	}
318
319	sub process {
320	my $self = shift (@_);
321	my $method = $self->{'mode'};
322
323	$self->$method(@_);
324	}
325
326	# post process text depending on field. Currently don't do anything here
327	# except cjk separation, and only for indexing
328	# should only do this for indexed text (if $self->{'indexing_text'}),
329	# but currently search term highlighting doesn't work if you do that.
330	# once thats fixed up, then fix this.
331	sub filter_text {
332	my $self = shift (@_);
333	my ($field, $text) = @_;
334
335	# lets do cjk seg here
336	my $new_text =$text;
337	if ($self->{'separate_cjk'}) {
338	$new_text = &cnseg::segment($text);
339	}
340	return $new_text;
341	}
342
343
344	sub infodb_metadata_stats
345	{
346	my $self = shift (@_);
347	my ($field,$edit_mode) = @_;
348
349	# Keep some statistics relating to metadata sets used and
350	# frequency of particular metadata fields within each set
351
352	# Union of metadata prefixes and frequency of fields
353	# (both scoped for this document alone, and across whole collection)
354
355	if ($field =~ m/^(.+)\.(.*)$/) {
356	my $prefix = $1;
357	my $core_field = $2;
358
359	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
360	$self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field}++;
361	$self->{'mdprefix_fields'}->{$prefix}->{$core_field}++;
362	}
363	else {
364	# delete
365	$self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field}--;
366	$self->{'mdprefix_fields'}->{$prefix}->{$core_field}--;
367	}
368
369	}
370	elsif ($field =~ m/^[[:upper:]]/) {
371	# implicit 'ex' metadata set
372
373	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
374
375	$self->{'doc_mdprefix_fields'}->{'ex'}->{$field}++;
376	$self->{'mdprefix_fields'}->{'ex'}->{$field}++;
377	}
378	else {
379	# delete
380	$self->{'doc_mdprefix_fields'}->{'ex'}->{$field}--;
381	$self->{'mdprefix_fields'}->{'ex'}->{$field}--;
382	}
383	}
384
385	}
386
387
388	sub infodbedit {
389	my $self = shift (@_);
390	my ($doc_obj, $filename, $edit_mode) = @_;
391
392	# only output this document if it is a "indexed_doc" or "info_doc" (database only) document
393	my $doctype = $doc_obj->get_doc_type();
394	return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");
395
396	my $archivedir = "";
397	if (defined $filename)
398	{
399	# doc_obj derived directly from file
400	my ($dir) = $filename =~ /^(.?)(?:\/\|\\)[^\/\\]$/;
401	$dir = "" unless defined $dir;
402	$dir =~ s/\\/\//g;
403	$dir =~ s/^\/+//;
404	$dir =~ s/\/+$//;
405
406	$archivedir = $dir;
407
408	# resolve the final filenames of the files associated with this document
409	$self->assoc_files ($doc_obj, $archivedir);
410	}
411	else
412	{
413	# doc_obj reconstructed from database (has metadata, doc structure but no text)
414	my $top_section = $doc_obj->get_top_section();
415	$archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
416	}
417
418	if (($edit_mode eq "add")) {# \|\| ($edit_mode eq "update")) {
419	#add this document to the browse structure
420	push(@{$self->{'doclist'}},$doc_obj->get_OID())
421	unless ($doctype eq "classification");
422	}
423	# elsif ($edit_mode eq "update") {
424	# don't do anything. this assumes it will keep the same id.
425	# }
426	elsif ($edit_mode eq "delete") {
427	# delete => remove this doc from browse structure
428	my $del_doc_oid = $doc_obj->get_OID();
429
430	my @filtered_doc_list = ();
431	foreach my $oid (@{$self->{'doclist'}}) {
432	push(@filtered_doc_list,$oid) if ($oid ne $del_doc_oid);
433	}
434	$self->{'doclist'} = \@filtered_doc_list;
435	}
436
437
438	# classify this document
439	&classify::classify_doc ($self->{'classifiers'}, $doc_obj, $edit_mode);
440
441	if (($edit_mode eq "add")) {# \|\| ($edit_mode eq "update")) {
442	# this is another document
443	$self->{'num_docs'} += 1 unless ($doctype eq "classification");
444	}
445	elsif ($edit_mode eq "delete") {
446	# delete
447	$self->{'num_docs'} -= 1 unless ($doctype eq "classification");
448	}
449
450	# is this a paged or a hierarchical document
451	my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
452
453	my $section = $doc_obj->get_top_section ();
454	my $doc_OID = $doc_obj->get_OID();
455	my $first = 1;
456	my $infodb_handle = $self->{'output_handle'};
457
458	$self->{'doc_mdprefix_fields'} = {};
459
460	while (defined $section)
461	{
462	my $section_OID = $doc_OID;
463	if ($section ne "")
464	{
465	$section_OID = $doc_OID . "." . $section;
466	}
467	my %section_infodb = ();
468
469	# update a few statistics - how do we do update????
470	if (($edit_mode eq "add")) {# \|\| ($edit_mode eq "update")) {
471
472	$self->{'num_bytes'} += $doc_obj->get_text_length ($section);
473	$self->{'num_sections'} += 1 unless ($doctype eq "classification");
474	}
475	elsif ($edit_mode eq "delete") {
476	# delete
477	$self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
478	$self->{'num_sections'} -= 1 unless ($doctype eq "classification");
479	}
480
481	# output the fact that this document is a document (unless doctype
482	# has been set to something else from within a plugin
483	my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
484	if (!defined $dtype \|\| $dtype !~ /\w/) {
485	$section_infodb{"doctype"} = [ "doc" ];
486	}
487
488	# Output whether this node contains text
489	#
490	# If doc_obj reconstructed from database file then no need to
491	# explicitly add <hastxt> as this is preserved as metadata when
492	# the database file is loaded in
493	if (defined $filename)
494	{
495	# doc_obj derived directly from file
496	if ($doc_obj->get_text_length($section) > 0) {
497	$section_infodb{"hastxt"} = [ "1" ];
498	} else {
499	$section_infodb{"hastxt"} = [ "0" ];
500	}
501	}
502
503	# output all the section metadata
504	my $metadata = $doc_obj->get_all_metadata ($section);
505	foreach my $pair (@$metadata) {
506	my ($field, $value) = (@$pair);
507
508	if ($field ne "Identifier" && $field !~ /^gsdl/ &&
509	defined $value && $value ne "") {
510
511	# escape problematic stuff
512	$value =~ s/([^\\])\\([^\\])/$1\\\\$2/g;
513	$value =~ s/\n/\\n/g;
514	$value =~ s/\r/\\r/g;
515	# remove ex. if there
516	$field =~ s/^ex\.//;
517
518	# special case for URL metadata
519	if ($field =~ /^URL$/i) {
520	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
521
522	&dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value, { 'section' => [ $section_OID ] });
523	}
524	else {
525	# delete
526	&dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value);
527	}
528
529
530	}
531
532	if (!defined $self->{'dontdb'}->{$field}) {
533	push(@{$section_infodb{$field}}, $value);
534
535	if ($section eq "" && $self->{'store_metadata_coverage'} =~ /^true$/i)
536	{
537	$self->infodb_metadata_stats($field,$edit_mode);
538	}
539	}
540	}
541	}
542
543	if ($section eq "")
544	{
545	my $doc_mdprefix_fields = $self->{'doc_mdprefix_fields'};
546
547	foreach my $prefix (keys %$doc_mdprefix_fields)
548	{
549	push(@{$section_infodb{"metadataset"}}, $prefix);
550
551	foreach my $field (keys %{$doc_mdprefix_fields->{$prefix}})
552	{
553	push(@{$section_infodb{"metadatalist-$prefix"}}, $field);
554
555	my $val = $doc_mdprefix_fields->{$prefix}->{$field};
556	push(@{$section_infodb{"metadatafreq-$prefix-$field"}}, $val);
557	}
558	}
559	}
560
561	# If doc_obj reconstructed from database file then no need to
562	# explicitly add <archivedir> as this is preserved as metadata when
563	# the database file is loaded in
564	if (defined $filename)
565	{
566	# output archivedir if at top level
567	if ($section eq $doc_obj->get_top_section()) {
568	$section_infodb{"archivedir"} = [ $archivedir ];
569	}
570	}
571
572	# output document display type
573	if ($first) {
574	$section_infodb{"thistype"} = [ $thistype ];
575	}
576
577	if ($self->{'db_level'} eq "document") {
578	# doc num is num_docs not num_sections
579	# output the matching document number
580	$section_infodb{"docnum"} = [ $self->{'num_docs'} ];
581	}
582	else {
583	# output a list of children
584	my $children = $doc_obj->get_children ($section);
585	if (scalar(@$children) > 0) {
586	$section_infodb{"childtype"} = [ $childtype ];
587	my $contains = "";
588	foreach my $child (@$children)
589	{
590	$contains .= ";" unless ($contains eq "");
591	if ($child =~ /^.*?\.(\d+)$/)
592	{
593	$contains .= "\".$1";
594	}
595	else
596	{
597	$contains .= "\".$child";
598	}
599	}
600	$section_infodb{"contains"} = [ $contains ];
601	}
602	# output the matching doc number
603	$section_infodb{"docnum"} = [ $self->{'num_sections'} ];
604	}
605
606	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
607	# in case of update, this will overwrite old entry??
608	&dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID, \%section_infodb);
609	}
610	else {
611	# delete
612	&dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID);
613	}
614
615
616	# output a database entry for the document number, except for Lucene (which no longer needs this information)
617	unless (ref($self) eq "lucenebuildproc")
618	{
619	if (($edit_mode eq "add") \|\| ($edit_mode eq "update")) {
620
621	if ($self->{'db_level'} eq "document") {
622	&dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'}, { 'section' => [ $doc_OID ] });
623	}
624	else {
625	&dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'}, { 'section' => [ $section_OID ] });
626	}
627	}
628	else {
629
630	if ($self->{'db_level'} eq "document") {
631	&dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'});
632	}
633	else {
634	&dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'});
635	}
636
637	}
638	}
639
640	$first = 0;
641	$section = $doc_obj->get_next_section($section);
642	last if ($self->{'db_level'} eq "document"); # if no sections wanted, only add the docs
643	}
644	}
645
646
647
648
649	sub infodb {
650	my $self = shift (@_);
651	my ($doc_obj, $filename) = @_;
652
653	$self->infodbedit($doc_obj,$filename,"add");
654	}
655
656	sub infodbreindex {
657	my $self = shift (@_);
658	my ($doc_obj, $filename) = @_;
659
660	$self->infodbedit($doc_obj,$filename,"update");
661	}
662
663	sub infodbdelete {
664	my $self = shift (@_);
665	my ($doc_obj, $filename) = @_;
666
667	$self->infodbedit($doc_obj,$filename,"delete");
668	}
669
670
671	sub text {
672	my $self = shift (@_);
673	my ($doc_obj) = @_;
674
675	my $handle = $self->{'outhandle'};
676	print $handle "basebuildproc::text function must be implemented in sub classes\n";
677	die "\n";
678	}
679
680	sub textreindex
681	{
682	my $self = shift @_;
683
684	my $outhandle = $self->{'outhandle'};
685	print $outhandle "basebuildproc::textreindex function must be implemented in sub classes\n";
686	if (!$self->is_incremental_capable()) {
687
688	print $outhandle " This operation is only possible with indexing tools with that support\n";
689	print $outhandle " incremental building\n";
690	}
691	die "\n";
692	}
693
694	sub textdelete
695	{
696	my $self = shift @_;
697
698	my $outhandle = $self->{'outhandle'};
699	print $outhandle "basebuildproc::textdelete function must be implemented in sub classes\n";
700	if (!$self->is_incremental_capable()) {
701
702	print $outhandle " This operation is only possible with indexing tools with that support\n";
703	print $outhandle " incremental building\n";
704	}
705	die "\n";
706	}
707
708
709	# should the document be indexed - according to the subcollection and language
710	# specification.
711	sub is_subcollection_doc {
712	my $self = shift (@_);
713	my ($doc_obj) = @_;
714
715	my $indexed_doc = 1;
716	foreach my $indexexp (@{$self->{'indexexparr'}}) {
717	$indexed_doc = 0;
718	my ($field, $exp, $options) = split /\//, $indexexp;
719	if (defined ($field) && defined ($exp)) {
720	my ($bool) = $field =~ /^(.)/;
721	$field =~ s/^.// if $bool eq '!';
722	my @metadata_values;
723	if ($field =~ /^filename$/i) {
724	push(@metadata_values, $doc_obj->get_source_filename());
725	}
726	else {
727	$field =~ s/^ex\.//; #strip ex. if present
728	@metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
729	}
730	next unless @metadata_values;
731	foreach my $metadata_value (@metadata_values) {
732	if ($bool eq '!') {
733	if ($options =~ /^i$/i) {
734	if ($metadata_value !~ /$exp/i) {$indexed_doc = 1; last;}
735	} else {
736	if ($metadata_value !~ /$exp/) {$indexed_doc = 1; last;}
737	}
738	} else {
739	if ($options =~ /^i$/i) {
740	if ($metadata_value =~ /$exp/i) {$indexed_doc = 1; last;}
741	} else {
742	if ($metadata_value =~ /$exp/) {$indexed_doc = 1; last;}
743	}
744	}
745	}
746
747	last if ($indexed_doc == 1);
748	}
749	}
750
751	# if this doc is so far in the sub collection, and we have lang info,
752	# now we check the languages to see if it matches
753	if($indexed_doc && defined $self->{'lang_meta'}) {
754	$indexed_doc = 0;
755	my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
756	if (defined $field) {
757	foreach my $lang (@{$self->{'langarr'}}) {
758	my ($bool) = $lang =~ /^(.)/;
759	if ($bool eq '!') {
760	$lang =~ s/^.//;
761	if ($field !~ /$lang/) {
762	$indexed_doc = 1; last;
763	}
764	} else {
765	if ($field =~ /$lang/) {
766	$indexed_doc = 1; last;
767	}
768	}
769	}
770	}
771	}
772	return $indexed_doc;
773
774	}
775
776	# use 'Paged' if document has no more than 2 levels
777	# and each section at second level has a number for
778	# Title metadata
779	# also use Paged if gsdlthistype metadata is set to Paged
780	sub get_document_type {
781	my $self = shift (@_);
782	my ($doc_obj) = @_;
783
784	my $thistype = "VList";
785	my $childtype = "VList";
786	my $title;
787	my @tmp = ();
788
789	my $section = $doc_obj->get_top_section ();
790
791	my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
792	if (defined $gsdlthistype) {
793	if ($gsdlthistype eq "Paged") {
794	$childtype = "Paged";
795	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
796	$thistype = "Paged";
797	} else {
798	$thistype = "Invisible";
799	}
800
801	return ($thistype, $childtype);
802	} elsif ($gsdlthistype eq "Hierarchy") {
803	return ($thistype, $childtype); # use VList, VList
804	}
805	}
806	my $first = 1;
807	while (defined $section) {
808	@tmp = split /\./, $section;
809	if (scalar(@tmp) > 1) {
810	return ($thistype, $childtype);
811	}
812	if (!$first) {
813	$title = $doc_obj->get_metadata_element ($section, "Title");
814	if (!defined $title \|\| $title !~ /^\d+$/) {
815	return ($thistype, $childtype);
816	}
817	}
818	$first = 0;
819	$section = $doc_obj->get_next_section($section);
820	}
821	if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
822	$thistype = "Paged";
823	} else {
824	$thistype = "Invisible";
825	}
826	$childtype = "Paged";
827	return ($thistype, $childtype);
828	}
829
830	sub assoc_files
831	{
832	my $self = shift (@_);
833	my ($doc_obj, $archivedir) = @_;
834	my ($afile);
835
836	foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
837	#rint STDERR "Processing associated file - copy " . $assoc_file->[0] . " to " . $assoc_file->[1] . "\n";
838	# if assoc file starts with a slash, we put it relative to the assoc
839	# dir, otherwise it is relative to the HASH... directory
840	if ($assoc_file->[1] =~ m@^[/\\]@) {
841	$afile = &util::filename_cat($self->{'assocdir'}, $assoc_file->[1]);
842	} else {
843	$afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
844	}
845	&util::hard_link ($assoc_file->[0], $afile, $self->{'verbosity'});
846	}
847	}
848

Note: See TracBrowser for help on using the repository browser.

Download in other formats: