source: main/trunk/greenstone2/perllib/basebuildproc.pm@ 22296

Last change on this file since 22296 was 22296, checked in by kjdon, 14 years ago

remove ex. from any metadata before writing it to gdbm file. EmbeddedMetadataPlugin now saves its metadata as ex.

  • Property svn:keywords set to Author Date Id Revision
File size: 22.2 KB
Line 
1##########################################################################
2#
3# basebuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document for indexing (should be
27# implemented by subclass) and storing in the database
28
29package basebuildproc;
30
31eval {require bytes};
32
33use classify;
34use dbutil;
35use doc;
36use docproc;
37use strict;
38no strict 'subs';
39no strict 'refs';
40use util;
41
42BEGIN {
43 @basebuildproc::ISA = ('docproc');
44}
45
46sub new()
47 {
48 my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;
49 my $self = new docproc ();
50
51 # outhandle is where all the debugging info goes
52 # output_handle is where the output of the plugins is piped
53 # to (i.e. mg, database etc.)
54 $outhandle = STDERR unless defined $outhandle;
55
56 $self->{'collection'} = $collection;
57 $self->{'source_dir'} = $source_dir;
58 $self->{'build_dir'} = $build_dir;
59 $self->{'keepold'} = $keepold;
60 $self->{'verbosity'} = $verbosity;
61 $self->{'outhandle'} = $outhandle;
62
63 $self->{'classifiers'} = [];
64 $self->{'mode'} = "text";
65 $self->{'assocdir'} = $build_dir;
66 $self->{'dontdb'} = {};
67 $self->{'store_metadata_coverage'} = "false";
68
69 $self->{'index'} = "section:text";
70 $self->{'indexexparr'} = [];
71
72 $self->{'separate_cjk'} = 0;
73
74 my $found_num_data = 0;
75 my $buildconfigfile = undef;
76
77 if ($keepold) {
78 # For incremental building need to seed num_docs etc from values
79 # stored in build.cfg (if present)
80 $buildconfigfile = &util::filename_cat($build_dir, "build.cfg");
81 if (-e $buildconfigfile) {
82 $found_num_data = 1;
83 }
84 else {
85 # try the index dir
86 $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
87 "index", "build.cfg");
88 if (-e $buildconfigfile) {
89 $found_num_data = 1;
90 }
91 }
92
93 }
94
95 if ($found_num_data)
96 {
97 #print STDERR "Found_Num_Data!\n";
98 my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
99 $self->{'starting_num_docs'} = $buildcfg->{'numdocs'};
100 #print STDERR "- num_docs: $self->{'starting_num_docs'}\n";
101 $self->{'starting_num_sections'} = $buildcfg->{'numsections'};
102 #print STDERR "- num_sections: $self->{'starting_num_sections'}\n";
103 $self->{'starting_num_bytes'} = $buildcfg->{'numbytes'};
104 #print STDERR "- num_bytes: $self->{'starting_num_bytes'}\n";
105 }
106 else
107 {
108 #print STDERR "NOT Found_Num_Data!\n";
109 $self->{'starting_num_docs'} = 0;
110 $self->{'starting_num_sections'} = 0;
111 $self->{'starting_num_bytes'} = 0;
112 }
113
114 $self->{'output_handle'} = "STDOUT";
115 $self->{'num_docs'} = $self->{'starting_num_docs'};
116 $self->{'num_sections'} = $self->{'starting_num_sections'};
117 $self->{'num_bytes'} = $self->{'starting_num_bytes'};
118
119 $self->{'num_processed_bytes'} = 0;
120 $self->{'store_text'} = 1;
121
122 # what level (section/document) the database - indexer intersection is
123 $self->{'db_level'} = "section";
124 #used by browse interface
125 $self->{'doclist'} = [];
126
127 $self->{'indexing_text'} = 0;
128
129 return bless $self, $class;
130
131}
132
133sub reset {
134 my $self = shift (@_);
135
136 $self->{'num_docs'} = $self->{'starting_num_docs'};
137 $self->{'num_sections'} = $self->{'starting_num_sections'};
138 $self->{'num_bytes'} = $self->{'starting_num_bytes'};
139
140 $self->{'num_processed_bytes'} = 0;
141}
142
143sub zero_reset {
144 my $self = shift (@_);
145
146 $self->{'num_docs'} = 0;
147 $self->{'num_sections'} = 0;
148 # reconstructed docs have no text, just metadata, so we need to
149 # remember how many bytes we had initially
150 $self->{'num_bytes'} = $self->{'starting_num_bytes'};
151
152 $self->{'num_processed_bytes'} = 0;
153}
154
155sub is_incremental_capable
156{
157 # By default we return 'no' as the answer
158 # Safer to assume non-incremental to start with, and then override in
159 # inherited classes that are.
160
161 return 0;
162}
163
164sub get_num_docs {
165 my $self = shift (@_);
166
167 return $self->{'num_docs'};
168}
169
170sub get_num_sections {
171 my $self = shift (@_);
172
173 return $self->{'num_sections'};
174}
175
176# num_bytes is the actual number of bytes in the collection
177# this is normally the same as what's processed during text compression
178sub get_num_bytes {
179 my $self = shift (@_);
180
181 return $self->{'num_bytes'};
182}
183
184# num_processed_bytes is the number of bytes actually passed
185# to mg for the current index
186sub get_num_processed_bytes {
187 my $self = shift (@_);
188
189 return $self->{'num_processed_bytes'};
190}
191
192sub set_output_handle {
193 my $self = shift (@_);
194 my ($handle) = @_;
195
196 $self->{'output_handle'} = $handle;
197}
198
199
200sub set_mode {
201 my $self = shift (@_);
202 my ($mode) = @_;
203
204 $self->{'mode'} = $mode;
205}
206
207sub get_mode {
208 my $self = shift (@_);
209
210 return $self->{'mode'};
211}
212
213sub set_assocdir {
214 my $self = shift (@_);
215 my ($assocdir) = @_;
216
217 $self->{'assocdir'} = $assocdir;
218}
219
220sub set_dontdb {
221 my $self = shift (@_);
222 my ($dontdb) = @_;
223
224 $self->{'dontdb'} = $dontdb;
225}
226
227sub set_infodbtype
228{
229 my $self = shift(@_);
230 my $infodbtype = shift(@_);
231 $self->{'infodbtype'} = $infodbtype;
232}
233
234sub set_index {
235 my $self = shift (@_);
236 my ($index, $indexexparr) = @_;
237
238 $self->{'index'} = $index;
239 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
240}
241
242sub set_index_languages {
243 my $self = shift (@_);
244 my ($lang_meta, $langarr) = @_;
245 $lang_meta =~ s/^ex\.//; # strip ex. if there
246 $self->{'lang_meta'} = $lang_meta;
247 $self->{'langarr'} = $langarr;
248}
249
250sub get_index {
251 my $self = shift (@_);
252
253 return $self->{'index'};
254}
255
256sub set_classifiers {
257 my $self = shift (@_);
258 my ($classifiers) = @_;
259
260 $self->{'classifiers'} = $classifiers;
261}
262
263sub set_indexing_text {
264 my $self = shift (@_);
265 my ($indexing_text) = @_;
266
267 $self->{'indexing_text'} = $indexing_text;
268}
269
270sub get_indexing_text {
271 my $self = shift (@_);
272
273 return $self->{'indexing_text'};
274}
275
276sub set_store_text {
277 my $self = shift (@_);
278 my ($store_text) = @_;
279
280 $self->{'store_text'} = $store_text;
281}
282
283sub set_store_metadata_coverage {
284 my $self = shift (@_);
285 my ($store_metadata_coverage) = @_;
286
287 $self->{'store_metadata_coverage'} = $store_metadata_coverage || "";
288}
289
290sub get_doc_list {
291 my $self = shift(@_);
292
293 return @{$self->{'doclist'}};
294}
295
296# the standard database level is section, but you may want to change it to document
297sub set_db_level {
298 my $self= shift (@_);
299 my ($db_level) = @_;
300
301 $self->{'db_level'} = $db_level;
302}
303
304sub set_sections_index_document_metadata {
305 my $self= shift (@_);
306 my ($index_type) = @_;
307
308 $self->{'sections_index_document_metadata'} = $index_type;
309}
310
311sub set_separate_cjk {
312 my $self = shift (@_);
313 my ($sep_cjk) = @_;
314
315 $self->{'separate_cjk'} = $sep_cjk;
316}
317
318sub process {
319 my $self = shift (@_);
320 my $method = $self->{'mode'};
321
322 $self->$method(@_);
323}
324
325# post process text depending on field. Currently don't do anything here
326# except cjk separation, and only for indexing
327# should only do this for indexed text (if $self->{'indexing_text'}),
328# but currently search term highlighting doesn't work if you do that.
329# once thats fixed up, then fix this.
330sub filter_text {
331 my $self = shift (@_);
332 my ($field, $text) = @_;
333
334 # lets do cjk seg here
335 my $new_text =$text;
336 if ($self->{'separate_cjk'}) {
337 $new_text = &cnseg::segment($text);
338 }
339 return $new_text;
340}
341
342
343sub infodb_metadata_stats
344{
345 my $self = shift (@_);
346 my ($field,$edit_mode) = @_;
347
348 # Keep some statistics relating to metadata sets used and
349 # frequency of particular metadata fields within each set
350
351 # Union of metadata prefixes and frequency of fields
352 # (both scoped for this document alone, and across whole collection)
353
354 if ($field =~ m/^(.+)\.(.*)$/) {
355 my $prefix = $1;
356 my $core_field = $2;
357
358 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
359 $self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field}++;
360 $self->{'mdprefix_fields'}->{$prefix}->{$core_field}++;
361 }
362 else {
363 # delete
364 $self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field}--;
365 $self->{'mdprefix_fields'}->{$prefix}->{$core_field}--;
366 }
367
368 }
369 elsif ($field =~ m/^[[:upper:]]/) {
370 # implicit 'ex' metadata set
371
372 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
373
374 $self->{'doc_mdprefix_fields'}->{'ex'}->{$field}++;
375 $self->{'mdprefix_fields'}->{'ex'}->{$field}++;
376 }
377 else {
378 # delete
379 $self->{'doc_mdprefix_fields'}->{'ex'}->{$field}--;
380 $self->{'mdprefix_fields'}->{'ex'}->{$field}--;
381 }
382 }
383
384}
385
386
387sub infodbedit {
388 my $self = shift (@_);
389 my ($doc_obj, $filename, $edit_mode) = @_;
390
391 # only output this document if it is a "indexed_doc" or "info_doc" (database only) document
392 my $doctype = $doc_obj->get_doc_type();
393 return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");
394
395 my $archivedir = "";
396 if (defined $filename)
397 {
398 # doc_obj derived directly from file
399 my ($dir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
400 $dir = "" unless defined $dir;
401 $dir =~ s/\\/\//g;
402 $dir =~ s/^\/+//;
403 $dir =~ s/\/+$//;
404
405 $archivedir = $dir;
406
407 # resolve the final filenames of the files associated with this document
408 $self->assoc_files ($doc_obj, $archivedir);
409 }
410 else
411 {
412 # doc_obj reconstructed from database (has metadata, doc structure but no text)
413 my $top_section = $doc_obj->get_top_section();
414 $archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
415 }
416
417 if (($edit_mode eq "add")) {# || ($edit_mode eq "update")) {
418 #add this document to the browse structure
419 push(@{$self->{'doclist'}},$doc_obj->get_OID())
420 unless ($doctype eq "classification");
421 }
422# elsif ($edit_mode eq "update") {
423 # don't do anything. this assumes it will keep the same id.
424# }
425 elsif ($edit_mode eq "delete") {
426 # delete => remove this doc from browse structure
427 my $del_doc_oid = $doc_obj->get_OID();
428
429 my @filtered_doc_list = ();
430 foreach my $oid (@{$self->{'doclist'}}) {
431 push(@filtered_doc_list,$oid) if ($oid ne $del_doc_oid);
432 }
433 $self->{'doclist'} = \@filtered_doc_list;
434 }
435
436
437 # classify this document
438 &classify::classify_doc ($self->{'classifiers'}, $doc_obj, $edit_mode);
439
440 if (($edit_mode eq "add")) {# || ($edit_mode eq "update")) {
441 # this is another document
442 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
443 }
444 elsif ($edit_mode eq "delete") {
445 # delete
446 $self->{'num_docs'} -= 1 unless ($doctype eq "classification");
447 }
448
449 # is this a paged or a hierarchical document
450 my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
451
452 my $section = $doc_obj->get_top_section ();
453 my $doc_OID = $doc_obj->get_OID();
454 my $first = 1;
455 my $infodb_handle = $self->{'output_handle'};
456
457 $self->{'doc_mdprefix_fields'} = {};
458
459 while (defined $section)
460 {
461 my $section_OID = $doc_OID;
462 if ($section ne "")
463 {
464 $section_OID = $doc_OID . "." . $section;
465 }
466 my %section_infodb = ();
467
468 # update a few statistics - how do we do update????
469 if (($edit_mode eq "add")) {# || ($edit_mode eq "update")) {
470
471 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
472 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
473 }
474 elsif ($edit_mode eq "delete") {
475 # delete
476 $self->{'num_bytes'} -= $doc_obj->get_text_length ($section);
477 $self->{'num_sections'} -= 1 unless ($doctype eq "classification");
478 }
479
480 # output the fact that this document is a document (unless doctype
481 # has been set to something else from within a plugin
482 my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
483 if (!defined $dtype || $dtype !~ /\w/) {
484 $section_infodb{"doctype"} = [ "doc" ];
485 }
486
487 # Output whether this node contains text
488 #
489 # If doc_obj reconstructed from database file then no need to
490 # explicitly add <hastxt> as this is preserved as metadata when
491 # the database file is loaded in
492 if (defined $filename)
493 {
494 # doc_obj derived directly from file
495 if ($doc_obj->get_text_length($section) > 0) {
496 $section_infodb{"hastxt"} = [ "1" ];
497 } else {
498 $section_infodb{"hastxt"} = [ "0" ];
499 }
500 }
501
502 # output all the section metadata
503 my $metadata = $doc_obj->get_all_metadata ($section);
504 foreach my $pair (@$metadata) {
505 my ($field, $value) = (@$pair);
506
507 if ($field ne "Identifier" && $field !~ /^gsdl/ &&
508 defined $value && $value ne "") {
509
510 # escape problematic stuff
511 $value =~ s/\\/\\\\/g;
512 $value =~ s/\n/\\n/g;
513 $value =~ s/\r/\\r/g;
514 # remove ex. if there
515 $field =~ s/^ex\.//;
516
517 # special case for URL metadata
518 if ($field =~ /^URL$/i) {
519 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
520
521 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value, { 'section' => [ $section_OID ] });
522 }
523 else {
524 # delete
525 &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $value);
526 }
527
528
529 }
530
531 if (!defined $self->{'dontdb'}->{$field}) {
532 push(@{$section_infodb{$field}}, $value);
533
534 if ($section eq "" && $self->{'store_metadata_coverage'} =~ /^true$/i)
535 {
536 $self->infodb_metadata_stats($field,$edit_mode);
537 }
538 }
539 }
540 }
541
542 if ($section eq "")
543 {
544 my $doc_mdprefix_fields = $self->{'doc_mdprefix_fields'};
545
546 foreach my $prefix (keys %$doc_mdprefix_fields)
547 {
548 push(@{$section_infodb{"metadataset"}}, $prefix);
549
550 foreach my $field (keys %{$doc_mdprefix_fields->{$prefix}})
551 {
552 push(@{$section_infodb{"metadatalist-$prefix"}}, $field);
553
554 my $val = $doc_mdprefix_fields->{$prefix}->{$field};
555 push(@{$section_infodb{"metadatafreq-$prefix-$field"}}, $val);
556 }
557 }
558 }
559
560 # If doc_obj reconstructed from database file then no need to
561 # explicitly add <archivedir> as this is preserved as metadata when
562 # the database file is loaded in
563 if (defined $filename)
564 {
565 # output archivedir if at top level
566 if ($section eq $doc_obj->get_top_section()) {
567 $section_infodb{"archivedir"} = [ $archivedir ];
568 }
569 }
570
571 # output document display type
572 if ($first) {
573 $section_infodb{"thistype"} = [ $thistype ];
574 }
575
576 if ($self->{'db_level'} eq "document") {
577 # doc num is num_docs not num_sections
578 # output the matching document number
579 $section_infodb{"docnum"} = [ $self->{'num_docs'} ];
580 }
581 else {
582 # output a list of children
583 my $children = $doc_obj->get_children ($section);
584 if (scalar(@$children) > 0) {
585 $section_infodb{"childtype"} = [ $childtype ];
586 my $contains = "";
587 foreach my $child (@$children)
588 {
589 $contains .= ";" unless ($contains eq "");
590 if ($child =~ /^.*?\.(\d+)$/)
591 {
592 $contains .= "\".$1";
593 }
594 else
595 {
596 $contains .= "\".$child";
597 }
598 }
599 $section_infodb{"contains"} = [ $contains ];
600 }
601 # output the matching doc number
602 $section_infodb{"docnum"} = [ $self->{'num_sections'} ];
603 }
604
605 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
606 # in case of update, this will overwrite old entry??
607 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID, \%section_infodb);
608 }
609 else {
610 # delete
611 &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID);
612 }
613
614
615 # output a database entry for the document number, except for Lucene (which no longer needs this information)
616 unless (ref($self) eq "lucenebuildproc")
617 {
618 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
619
620 if ($self->{'db_level'} eq "document") {
621 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'}, { 'section' => [ $doc_OID ] });
622 }
623 else {
624 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'}, { 'section' => [ $section_OID ] });
625 }
626 }
627 else {
628
629 if ($self->{'db_level'} eq "document") {
630 &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'});
631 }
632 else {
633 &dbutil::delete_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'});
634 }
635
636 }
637 }
638
639 $first = 0;
640 $section = $doc_obj->get_next_section($section);
641 last if ($self->{'db_level'} eq "document"); # if no sections wanted, only add the docs
642 }
643}
644
645
646
647
648sub infodb {
649 my $self = shift (@_);
650 my ($doc_obj, $filename) = @_;
651
652 $self->infodbedit($doc_obj,$filename,"add");
653}
654
655sub infodbreindex {
656 my $self = shift (@_);
657 my ($doc_obj, $filename) = @_;
658
659 $self->infodbedit($doc_obj,$filename,"update");
660}
661
662sub infodbdelete {
663 my $self = shift (@_);
664 my ($doc_obj, $filename) = @_;
665
666 $self->infodbedit($doc_obj,$filename,"delete");
667}
668
669
670sub text {
671 my $self = shift (@_);
672 my ($doc_obj) = @_;
673
674 my $handle = $self->{'outhandle'};
675 print $handle "basebuildproc::text function must be implemented in sub classes\n";
676 die "\n";
677}
678
679sub textreindex
680{
681 my $self = shift @_;
682
683 my $outhandle = $self->{'outhandle'};
684 print $outhandle "basebuildproc::textreindex function must be implemented in sub classes\n";
685 if (!$self->is_incremental_capable()) {
686
687 print $outhandle " This operation is only possible with indexing tools with that support\n";
688 print $outhandle " incremental building\n";
689 }
690 die "\n";
691}
692
693sub textdelete
694{
695 my $self = shift @_;
696
697 my $outhandle = $self->{'outhandle'};
698 print $outhandle "basebuildproc::textdelete function must be implemented in sub classes\n";
699 if (!$self->is_incremental_capable()) {
700
701 print $outhandle " This operation is only possible with indexing tools with that support\n";
702 print $outhandle " incremental building\n";
703 }
704 die "\n";
705}
706
707
708# should the document be indexed - according to the subcollection and language
709# specification.
710sub is_subcollection_doc {
711 my $self = shift (@_);
712 my ($doc_obj) = @_;
713
714 my $indexed_doc = 1;
715 foreach my $indexexp (@{$self->{'indexexparr'}}) {
716 $indexed_doc = 0;
717 my ($field, $exp, $options) = split /\//, $indexexp;
718 if (defined ($field) && defined ($exp)) {
719 my ($bool) = $field =~ /^(.)/;
720 $field =~ s/^.// if $bool eq '!';
721 my @metadata_values;
722 if ($field =~ /^filename$/i) {
723 push(@metadata_values, $doc_obj->get_source_filename());
724 }
725 else {
726 $field =~ s/^ex\.//; #strip ex. if present
727 @metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
728 }
729 next unless @metadata_values;
730 foreach my $metadata_value (@metadata_values) {
731 if ($bool eq '!') {
732 if ($options =~ /^i$/i) {
733 if ($metadata_value !~ /$exp/i) {$indexed_doc = 1; last;}
734 } else {
735 if ($metadata_value !~ /$exp/) {$indexed_doc = 1; last;}
736 }
737 } else {
738 if ($options =~ /^i$/i) {
739 if ($metadata_value =~ /$exp/i) {$indexed_doc = 1; last;}
740 } else {
741 if ($metadata_value =~ /$exp/) {$indexed_doc = 1; last;}
742 }
743 }
744 }
745
746 last if ($indexed_doc == 1);
747 }
748 }
749
750 # if this doc is so far in the sub collection, and we have lang info,
751 # now we check the languages to see if it matches
752 if($indexed_doc && defined $self->{'lang_meta'}) {
753 $indexed_doc = 0;
754 my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
755 if (defined $field) {
756 foreach my $lang (@{$self->{'langarr'}}) {
757 my ($bool) = $lang =~ /^(.)/;
758 if ($bool eq '!') {
759 $lang =~ s/^.//;
760 if ($field !~ /$lang/) {
761 $indexed_doc = 1; last;
762 }
763 } else {
764 if ($field =~ /$lang/) {
765 $indexed_doc = 1; last;
766 }
767 }
768 }
769 }
770 }
771 return $indexed_doc;
772
773}
774
775# use 'Paged' if document has no more than 2 levels
776# and each section at second level has a number for
777# Title metadata
778# also use Paged if gsdlthistype metadata is set to Paged
779sub get_document_type {
780 my $self = shift (@_);
781 my ($doc_obj) = @_;
782
783 my $thistype = "VList";
784 my $childtype = "VList";
785 my $title;
786 my @tmp = ();
787
788 my $section = $doc_obj->get_top_section ();
789
790 my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
791 if (defined $gsdlthistype) {
792 if ($gsdlthistype eq "Paged") {
793 $childtype = "Paged";
794 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
795 $thistype = "Paged";
796 } else {
797 $thistype = "Invisible";
798 }
799
800 return ($thistype, $childtype);
801 } elsif ($gsdlthistype eq "Hierarchy") {
802 return ($thistype, $childtype); # use VList, VList
803 }
804 }
805 my $first = 1;
806 while (defined $section) {
807 @tmp = split /\./, $section;
808 if (scalar(@tmp) > 1) {
809 return ($thistype, $childtype);
810 }
811 if (!$first) {
812 $title = $doc_obj->get_metadata_element ($section, "Title");
813 if (!defined $title || $title !~ /^\d+$/) {
814 return ($thistype, $childtype);
815 }
816 }
817 $first = 0;
818 $section = $doc_obj->get_next_section($section);
819 }
820 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
821 $thistype = "Paged";
822 } else {
823 $thistype = "Invisible";
824 }
825 $childtype = "Paged";
826 return ($thistype, $childtype);
827}
828
829sub assoc_files
830{
831 my $self = shift (@_);
832 my ($doc_obj, $archivedir) = @_;
833 my ($afile);
834
835 foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
836 #rint STDERR "Processing associated file - copy " . $assoc_file->[0] . " to " . $assoc_file->[1] . "\n";
837 # if assoc file starts with a slash, we put it relative to the assoc
838 # dir, otherwise it is relative to the HASH... directory
839 if ($assoc_file->[1] =~ m@^[/\\]@) {
840 $afile = &util::filename_cat($self->{'assocdir'}, $assoc_file->[1]);
841 } else {
842 $afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
843 }
844 &util::hard_link ($assoc_file->[0], $afile, $self->{'verbosity'});
845 }
846}
847
Note: See TracBrowser for help on using the repository browser.