source: main/trunk/greenstone2/perllib/basebuildproc.pm@ 22843

Last change on this file since 22843 was 22843, checked in by davidb, 14 years ago

More explicit use of utf8 for input and output file handling. Relies on strings in Perl being Unicode aware (and not merely binary bytes) otherwise binary bytes will then be incorrectly re-incoded as UTF-8 (which is not what you want as they already are in UTF-8 form).

  • Property svn:keywords set to Author Date Id Revision
File size: 22.2 KB
Line 
1##########################################################################
2#
3# basebuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document for indexing (should be
27# implemented by subclass) and storing in the database
28
29package basebuildproc;
30
31eval {require bytes};
32
33use classify;
34use dbutil;
35use doc;
36use docproc;
37use strict;
38no strict 'subs';
39no strict 'refs';
40use util;
41
42BEGIN {
43 @basebuildproc::ISA = ('docproc');
44}
45
46sub new()
47 {
48 my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;
49 my $self = new docproc ();
50
51 # outhandle is where all the debugging info goes
52 # output_handle is where the output of the plugins is piped
53 # to (i.e. mg, database etc.)
54 $outhandle = STDERR unless defined $outhandle;
55
56 $self->{'collection'} = $collection;
57 $self->{'source_dir'} = $source_dir;
58 $self->{'build_dir'} = $build_dir;
59 $self->{'keepold'} = $keepold;
60 $self->{'verbosity'} = $verbosity;
61 $self->{'outhandle'} = $outhandle;
62
63 $self->{'classifiers'} = [];
64 $self->{'mode'} = "text";
65 $self->{'assocdir'} = $build_dir;
66 $self->{'dontdb'} = {};
67 $self->{'store_metadata_coverage'} = "false";
68
69 $self->{'index'} = "section:text";
70 $self->{'indexexparr'} = [];
71
72 $self->{'separate_cjk'} = 0;
73
74 my $found_num_data = 0;
75 my $buildconfigfile = undef;
76
77 if ($keepold) {
78 # For incremental building need to seed num_docs etc from values
79 # stored in build.cfg (if present)
80 $buildconfigfile = &util::filename_cat($build_dir, "build.cfg");
81 if (-e $buildconfigfile) {
82 $found_num_data = 1;
83 }
84 else {
85 # try the index dir
86 $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
87 "index", "build.cfg");
88 if (-e $buildconfigfile) {
89 $found_num_data = 1;
90 }
91 }
92
93 }
94
95 if ($found_num_data)
96 {
97 #print STDERR "Found_Num_Data!\n";
98 my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
99 $self->{'starting_num_docs'} = $buildcfg->{'numdocs'};
100 #print STDERR "- num_docs: $self->{'starting_num_docs'}\n";
101 $self->{'starting_num_sections'} = $buildcfg->{'numsections'};
102 #print STDERR "- num_sections: $self->{'starting_num_sections'}\n";
103 $self->{'starting_num_bytes'} = $buildcfg->{'numbytes'};
104 #print STDERR "- num_bytes: $self->{'starting_num_bytes'}\n";
105 }
106 else
107 {
108 #print STDERR "NOT Found_Num_Data!\n";
109 $self->{'starting_num_docs'} = 0;
110 $self->{'starting_num_sections'} = 0;
111 $self->{'starting_num_bytes'} = 0;
112 }
113
114 $self->{'output_handle'} = "STDOUT";
115 $self->{'num_docs'} = $self->{'starting_num_docs'};
116 $self->{'num_sections'} = $self->{'starting_num_sections'};
117 $self->{'num_bytes'} = $self->{'starting_num_bytes'};
118
119 $self->{'num_processed_bytes'} = 0;
120 $self->{'store_text'} = 1;
121
122 # what level (section/document) the database - indexer intersection is
123 $self->{'db_level'} = "section";
124 #used by browse interface
125 $self->{'doclist'} = [];
126
127 $self->{'indexing_text'} = 0;
128
129 return bless $self, $class;
130
131}
132
133sub reset {
134 my $self = shift (@_);
135
136 $self->{'num_docs'} = $self->{'starting_num_docs'};
137 $self->{'num_sections'} = $self->{'starting_num_sections'};
138 $self->{'num_bytes'} = $self->{'starting_num_bytes'};
139
140 $self->{'num_processed_bytes'} = 0;
141}
142
143sub zero_reset {
144 my $self = shift (@_);
145
146 $self->{'num_docs'} = 0;
147 $self->{'num_sections'} = 0;
148 # reconstructed docs have no text, just metadata, so we need to
149 # remember how many bytes we had initially
150 $self->{'num_bytes'} = $self->{'starting_num_bytes'};
151
152 $self->{'num_processed_bytes'} = 0;
153}
154
155sub is_incremental_capable
156{
157 # By default we return 'no' as the answer
158 # Safer to assume non-incremental to start with, and then override in
159 # inherited classes that are.
160
161 return 0;
162}
163
164sub get_num_docs {
165 my $self = shift (@_);
166
167 return $self->{'num_docs'};
168}
169
170sub get_num_sections {
171 my $self = shift (@_);
172
173 return $self->{'num_sections'};
174}
175
176# num_bytes is the actual number of bytes in the collection
177# this is normally the same as what's processed during text compression
178sub get_num_bytes {
179 my $self = shift (@_);
180
181 return $self->{'num_bytes'};
182}
183
184# num_processed_bytes is the number of bytes actually passed
185# to mg for the current index
186sub get_num_processed_bytes {
187 my $self = shift (@_);
188
189 return $self->{'num_processed_bytes'};
190}
191
192sub set_output_handle {
193 my $self = shift (@_);
194 my ($handle) = @_;
195
196 $self->{'output_handle'} = $handle;
197 binmode($handle,":utf8");
198}
199
200
201sub set_mode {
202 my $self = shift (@_);
203 my ($mode) = @_;
204
205 $self->{'mode'} = $mode;
206}
207
208sub get_mode {
209 my $self = shift (@_);
210
211 return $self->{'mode'};
212}
213
214sub set_assocdir {
215 my $self = shift (@_);
216 my ($assocdir) = @_;
217
218 $self->{'assocdir'} = $assocdir;
219}
220
221sub set_dontdb {
222 my $self = shift (@_);
223 my ($dontdb) = @_;
224
225 $self->{'dontdb'} = $dontdb;
226}
227
228sub set_infodbtype
229{
230 my $self = shift(@_);
231 my $infodbtype = shift(@_);
232 $self->{'infodbtype'} = $infodbtype;
233}
234
235sub set_index {
236 my $self = shift (@_);
237 my ($index, $indexexparr) = @_;
238
239 $self->{'index'} = $index;
240 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
241}
242
243sub set_index_languages {
244 my $self = shift (@_);
245 my ($lang_meta, $langarr) = @_;
246 $lang_meta =~ s/^ex\.//; # strip ex. if there
247 $self->{'lang_meta'} = $lang_meta;
248 $self->{'langarr'} = $langarr;
249}
250
251sub get_index {
252 my $self = shift (@_);
253
254 return $self->{'index'};
255}
256
257sub set_classifiers {
258 my $self = shift (@_);
259 my ($classifiers) = @_;
260
261 $self->{'classifiers'} = $classifiers;
262}
263
264sub set_indexing_text {
265 my $self = shift (@_);
266 my ($indexing_text) = @_;
267
268 $self->{'indexing_text'} = $indexing_text;
269}
270
271sub get_indexing_text {
272 my $self = shift (@_);
273
274 return $self->{'indexing_text'};
275}
276
277sub set_store_text {
278 my $self = shift (@_);
279 my ($store_text) = @_;
280
281 $self->{'store_text'} = $store_text;
282}
283
284sub set_store_metadata_coverage {
285 my $self = shift (@_);
286 my ($store_metadata_coverage) = @_;
287
288 $self->{'store_metadata_coverage'} = $store_metadata_coverage || "";
289}
290
291sub get_doc_list {
292 my $self = shift(@_);
293
294 return @{$self->{'doclist'}};
295}
296
297# the standard database level is section, but you may want to change it to document
298sub set_db_level {
299 my $self= shift (@_);
300 my ($db_level) = @_;
301
302 $self->{'db_level'} = $db_level;
303}
304
305sub set_sections_index_document_metadata {
306 my $self= shift (@_);
307 my ($index_type) = @_;
308
309 $self->{'sections_index_document_metadata'} = $index_type;
310}
311
312sub set_separate_cjk {
313 my $self = shift (@_);
314 my ($sep_cjk) = @_;
315
316 $self->{'separate_cjk'} = $sep_cjk;
317}
318
319sub process {
320 my $self = shift (@_);
321 my $method = $self->{'mode'};
322
323 $self->$method(@_);
324}
325
326# post process text depending on field. Currently don't do anything here
327# except cjk separation, and only for indexing
328# should only do this for indexed text (if $self->{'indexing_text'}),
329# but currently search term highlighting doesn't work if you do that.
330# once thats fixed up, then fix this.
331sub filter_text {
332 my $self = shift (@_);
333 my ($field, $text) = @_;
334
335 # lets do cjk seg here
336 my $new_text =$text;
337 if ($self->{'separate_cjk'}) {
338 $new_text = &cnseg::segment($text);
339 }
340 return $new_text;
341}
342
343
344sub infodb_metadata_stats
345{
346 my $self = shift (@_);
347 my ($field,$edit_mode) = @_;
348
349 # Keep some statistics relating to metadata sets used and
350 # frequency of particular metadata fields within each set
351
352 # Union of metadata prefixes and frequency of fields
353 # (both scoped for this document alone, and across whole collection)
354
355 if ($field =~ m/^(.+)\.(.*)$/) {
356 my $prefix = $1;
357 my $core_field = $2;
358
359 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
360 $self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field}++;
361 $self->{'mdprefix_fields'}->{$prefix}->{$core_field}++;
362 }
363 else {
364 # delete
365 $self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field}--;
366 $self->{'mdprefix_fields'}->{$prefix}->{$core_field}--;
367 }
368
369 }
370 elsif ($field =~ m/^[[:upper:]]/) {
371 # implicit 'ex' metadata set
372
373 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
374
375 $self->{'doc_mdprefix_fields'}->{'ex'}->{$field}++;
376 $self->{'mdprefix_fields'}->{'ex'}->{$field}++;
377 }
378 else {
379 # delete
380 $self->{'doc_mdprefix_fields'}->{'ex'}->{$field}--;
381 $self->{'mdprefix_fields'}->{'ex'}->{$field}--;
382 }
383 }
384
385}
386
387
388sub infodbedit {
389 my $self = shift (@_);
390 my ($doc_obj, $filename, $edit_mode) = @_;
391
392 # only output this document if it is a "indexed_doc" or "info_doc" (database only) document
393 my $doctype = $doc_obj->get_doc_type();
394 return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");
395
396 my $archivedir = "";
397 if (defined $filename)
398 {
399 # doc_obj derived directly from file
400 my ($dir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
401 $dir = "" unless defined $dir;
402 $dir =~ s/\\/\//g;
403 $dir =~ s/^\/+//;
404 $dir =~ s/\/+$//;
405
406 $archivedir = $dir;
407
408 # resolve the final filenames of the files associated with this document
409 $self->assoc_files ($doc_obj, $archivedir);
410 }
411 else
412 {
413 # doc_obj reconstructed from database (has metadata, doc structure but no text)
414 my $top_section = $doc_obj->get_top_section();
415 $archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
416 }
417
418 if (($edit_mode eq "add")) {# || ($edit_mode eq "update")) {
419 #add this document to the browse structure
420 push(@{$self->{'doclist'}},$doc_obj->get_OID())
421 unless ($doctype eq "classification");
422 }
423# elsif ($edit_mode eq "update") {
424 # don't do anything. this assumes it will keep the same id.
425# }
426 elsif ($edit_mode eq "delete") {
427 # delete => remove this doc from browse structure
428 my $del_doc_oid = $doc_obj->get_OID();
429
430 my @filtered_doc_list = ();
431 foreach my $oid (@{$self->{'doclist'}}) {
432 push(@filtered_doc_list,$oid) if ($oid ne $del_doc_oid);
433 }
434 $self->{'doclist'} = \@filtered_doc_list;
435 }
436
437
438 # classify this document
439 &classify::classify_doc ($self->{'classifiers'}, $doc_obj, $edit_mode);
440
441 if (($edit_mode eq "add")) {# || ($edit_mode eq "update")) {
442 # this is another document
443 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
444 }
445 elsif ($edit_mode eq "delete") {
446 # delete
447 $self->{'num_docs'} -= 1 unless ($doctype eq "classification");
448 }
449
450 # is this a paged or a hierarchical document
451 my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
452
453 my $section = $doc_obj->get_top_section ();
454 my $doc_OID = $doc_obj->get_OID();
455 my $first = 1;
456 my $infodb_handle = $self->{'output_handle'};
457
458 $self->{'doc_mdprefix_fields'} = {};
459
460 while (defined $section)
461 {
462 my $section_OID = $doc_OID;
463 if ($section ne "")
464 {
465 $section_OID = $doc_OID . "." . $section;
466 }
467 my %section_infodb = ();
468
469 # update a few statistics - how do we do update????
470 if (($edit_mode eq "add")) {# || ($edit_mode eq "update")) {
471
472 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
473 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
474 }
475 elsif ($edit_mode eq "delete") {
476 # delete
477 $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
478 $self->{'num_sections'} -= 1 unless ($doctype eq "classification");
479 }
480
481 # output the fact that this document is a document (unless doctype
482 # has been set to something else from within a plugin
483 my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
484 if (!defined $dtype || $dtype !~ /\w/) {
485 $section_infodb{"doctype"} = [ "doc" ];
486 }
487
488 # Output whether this node contains text
489 #
490 # If doc_obj reconstructed from database file then no need to
491 # explicitly add <hastxt> as this is preserved as metadata when
492 # the database file is loaded in
493 if (defined $filename)
494 {
495 # doc_obj derived directly from file
496 if ($doc_obj->get_text_length($section) > 0) {
497 $section_infodb{"hastxt"} = [ "1" ];
498 } else {
499 $section_infodb{"hastxt"} = [ "0" ];
500 }
501 }
502
503 # output all the section metadata
504 my $metadata = $doc_obj->get_all_metadata ($section);
505 foreach my $pair (@$metadata) {
506 my ($field, $value) = (@$pair);
507
508 if ($field ne "Identifier" && $field !~ /^gsdl/ &&
509 defined $value && $value ne "") {
510
511 # escape problematic stuff
512 $value =~ s/([^\\])\\([^\\])/$1\\\\$2/g;
513 $value =~ s/\n/\\n/g;
514 $value =~ s/\r/\\r/g;
515 # remove ex. if there
516 $field =~ s/^ex\.//;
517
518 # special case for URL metadata
519 if ($field =~ /^URL$/i) {
520 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
521
522 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value, { 'section' => [ $section_OID ] });
523 }
524 else {
525 # delete
526 &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value);
527 }
528
529
530 }
531
532 if (!defined $self->{'dontdb'}->{$field}) {
533 push(@{$section_infodb{$field}}, $value);
534
535 if ($section eq "" && $self->{'store_metadata_coverage'} =~ /^true$/i)
536 {
537 $self->infodb_metadata_stats($field,$edit_mode);
538 }
539 }
540 }
541 }
542
543 if ($section eq "")
544 {
545 my $doc_mdprefix_fields = $self->{'doc_mdprefix_fields'};
546
547 foreach my $prefix (keys %$doc_mdprefix_fields)
548 {
549 push(@{$section_infodb{"metadataset"}}, $prefix);
550
551 foreach my $field (keys %{$doc_mdprefix_fields->{$prefix}})
552 {
553 push(@{$section_infodb{"metadatalist-$prefix"}}, $field);
554
555 my $val = $doc_mdprefix_fields->{$prefix}->{$field};
556 push(@{$section_infodb{"metadatafreq-$prefix-$field"}}, $val);
557 }
558 }
559 }
560
561 # If doc_obj reconstructed from database file then no need to
562 # explicitly add <archivedir> as this is preserved as metadata when
563 # the database file is loaded in
564 if (defined $filename)
565 {
566 # output archivedir if at top level
567 if ($section eq $doc_obj->get_top_section()) {
568 $section_infodb{"archivedir"} = [ $archivedir ];
569 }
570 }
571
572 # output document display type
573 if ($first) {
574 $section_infodb{"thistype"} = [ $thistype ];
575 }
576
577 if ($self->{'db_level'} eq "document") {
578 # doc num is num_docs not num_sections
579 # output the matching document number
580 $section_infodb{"docnum"} = [ $self->{'num_docs'} ];
581 }
582 else {
583 # output a list of children
584 my $children = $doc_obj->get_children ($section);
585 if (scalar(@$children) > 0) {
586 $section_infodb{"childtype"} = [ $childtype ];
587 my $contains = "";
588 foreach my $child (@$children)
589 {
590 $contains .= ";" unless ($contains eq "");
591 if ($child =~ /^.*?\.(\d+)$/)
592 {
593 $contains .= "\".$1";
594 }
595 else
596 {
597 $contains .= "\".$child";
598 }
599 }
600 $section_infodb{"contains"} = [ $contains ];
601 }
602 # output the matching doc number
603 $section_infodb{"docnum"} = [ $self->{'num_sections'} ];
604 }
605
606 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
607 # in case of update, this will overwrite old entry??
608 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID, \%section_infodb);
609 }
610 else {
611 # delete
612 &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID);
613 }
614
615
616 # output a database entry for the document number, except for Lucene (which no longer needs this information)
617 unless (ref($self) eq "lucenebuildproc")
618 {
619 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
620
621 if ($self->{'db_level'} eq "document") {
622 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'}, { 'section' => [ $doc_OID ] });
623 }
624 else {
625 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'}, { 'section' => [ $section_OID ] });
626 }
627 }
628 else {
629
630 if ($self->{'db_level'} eq "document") {
631 &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'});
632 }
633 else {
634 &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'});
635 }
636
637 }
638 }
639
640 $first = 0;
641 $section = $doc_obj->get_next_section($section);
642 last if ($self->{'db_level'} eq "document"); # if no sections wanted, only add the docs
643 }
644}
645
646
647
648
649sub infodb {
650 my $self = shift (@_);
651 my ($doc_obj, $filename) = @_;
652
653 $self->infodbedit($doc_obj,$filename,"add");
654}
655
656sub infodbreindex {
657 my $self = shift (@_);
658 my ($doc_obj, $filename) = @_;
659
660 $self->infodbedit($doc_obj,$filename,"update");
661}
662
663sub infodbdelete {
664 my $self = shift (@_);
665 my ($doc_obj, $filename) = @_;
666
667 $self->infodbedit($doc_obj,$filename,"delete");
668}
669
670
671sub text {
672 my $self = shift (@_);
673 my ($doc_obj) = @_;
674
675 my $handle = $self->{'outhandle'};
676 print $handle "basebuildproc::text function must be implemented in sub classes\n";
677 die "\n";
678}
679
680sub textreindex
681{
682 my $self = shift @_;
683
684 my $outhandle = $self->{'outhandle'};
685 print $outhandle "basebuildproc::textreindex function must be implemented in sub classes\n";
686 if (!$self->is_incremental_capable()) {
687
688 print $outhandle " This operation is only possible with indexing tools with that support\n";
689 print $outhandle " incremental building\n";
690 }
691 die "\n";
692}
693
694sub textdelete
695{
696 my $self = shift @_;
697
698 my $outhandle = $self->{'outhandle'};
699 print $outhandle "basebuildproc::textdelete function must be implemented in sub classes\n";
700 if (!$self->is_incremental_capable()) {
701
702 print $outhandle " This operation is only possible with indexing tools with that support\n";
703 print $outhandle " incremental building\n";
704 }
705 die "\n";
706}
707
708
709# should the document be indexed - according to the subcollection and language
710# specification.
711sub is_subcollection_doc {
712 my $self = shift (@_);
713 my ($doc_obj) = @_;
714
715 my $indexed_doc = 1;
716 foreach my $indexexp (@{$self->{'indexexparr'}}) {
717 $indexed_doc = 0;
718 my ($field, $exp, $options) = split /\//, $indexexp;
719 if (defined ($field) && defined ($exp)) {
720 my ($bool) = $field =~ /^(.)/;
721 $field =~ s/^.// if $bool eq '!';
722 my @metadata_values;
723 if ($field =~ /^filename$/i) {
724 push(@metadata_values, $doc_obj->get_source_filename());
725 }
726 else {
727 $field =~ s/^ex\.//; #strip ex. if present
728 @metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
729 }
730 next unless @metadata_values;
731 foreach my $metadata_value (@metadata_values) {
732 if ($bool eq '!') {
733 if ($options =~ /^i$/i) {
734 if ($metadata_value !~ /$exp/i) {$indexed_doc = 1; last;}
735 } else {
736 if ($metadata_value !~ /$exp/) {$indexed_doc = 1; last;}
737 }
738 } else {
739 if ($options =~ /^i$/i) {
740 if ($metadata_value =~ /$exp/i) {$indexed_doc = 1; last;}
741 } else {
742 if ($metadata_value =~ /$exp/) {$indexed_doc = 1; last;}
743 }
744 }
745 }
746
747 last if ($indexed_doc == 1);
748 }
749 }
750
751 # if this doc is so far in the sub collection, and we have lang info,
752 # now we check the languages to see if it matches
753 if($indexed_doc && defined $self->{'lang_meta'}) {
754 $indexed_doc = 0;
755 my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
756 if (defined $field) {
757 foreach my $lang (@{$self->{'langarr'}}) {
758 my ($bool) = $lang =~ /^(.)/;
759 if ($bool eq '!') {
760 $lang =~ s/^.//;
761 if ($field !~ /$lang/) {
762 $indexed_doc = 1; last;
763 }
764 } else {
765 if ($field =~ /$lang/) {
766 $indexed_doc = 1; last;
767 }
768 }
769 }
770 }
771 }
772 return $indexed_doc;
773
774}
775
776# use 'Paged' if document has no more than 2 levels
777# and each section at second level has a number for
778# Title metadata
779# also use Paged if gsdlthistype metadata is set to Paged
780sub get_document_type {
781 my $self = shift (@_);
782 my ($doc_obj) = @_;
783
784 my $thistype = "VList";
785 my $childtype = "VList";
786 my $title;
787 my @tmp = ();
788
789 my $section = $doc_obj->get_top_section ();
790
791 my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
792 if (defined $gsdlthistype) {
793 if ($gsdlthistype eq "Paged") {
794 $childtype = "Paged";
795 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
796 $thistype = "Paged";
797 } else {
798 $thistype = "Invisible";
799 }
800
801 return ($thistype, $childtype);
802 } elsif ($gsdlthistype eq "Hierarchy") {
803 return ($thistype, $childtype); # use VList, VList
804 }
805 }
806 my $first = 1;
807 while (defined $section) {
808 @tmp = split /\./, $section;
809 if (scalar(@tmp) > 1) {
810 return ($thistype, $childtype);
811 }
812 if (!$first) {
813 $title = $doc_obj->get_metadata_element ($section, "Title");
814 if (!defined $title || $title !~ /^\d+$/) {
815 return ($thistype, $childtype);
816 }
817 }
818 $first = 0;
819 $section = $doc_obj->get_next_section($section);
820 }
821 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
822 $thistype = "Paged";
823 } else {
824 $thistype = "Invisible";
825 }
826 $childtype = "Paged";
827 return ($thistype, $childtype);
828}
829
830sub assoc_files
831{
832 my $self = shift (@_);
833 my ($doc_obj, $archivedir) = @_;
834 my ($afile);
835
836 foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
837 #rint STDERR "Processing associated file - copy " . $assoc_file->[0] . " to " . $assoc_file->[1] . "\n";
838 # if assoc file starts with a slash, we put it relative to the assoc
839 # dir, otherwise it is relative to the HASH... directory
840 if ($assoc_file->[1] =~ m@^[/\\]@) {
841 $afile = &util::filename_cat($self->{'assocdir'}, $assoc_file->[1]);
842 } else {
843 $afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
844 }
845 &util::hard_link ($assoc_file->[0], $afile, $self->{'verbosity'});
846 }
847}
848
Note: See TracBrowser for help on using the repository browser.