source: main/trunk/greenstone2/perllib/basebuildproc.pm@ 24193

Last change on this file since 24193 was 23387, checked in by davidb, 13 years ago

Further changes to deal with documents that use different filename encodings on the file-system. Now sets UTF8URL metadata to perform the cross-document look up. Files stored in doc.pm as associated files are now always raw filenames (rather than potentially UTF8 encoded). Storing of filenames seen by HTMLPlug when scanning for files to block on is now done in Unicode aware strings rather than utf8 but unware strings.

  • Property svn:keywords set to Author Date Id Revision
File size: 22.3 KB
Line 
1##########################################################################
2#
3# basebuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document for indexing (should be
27# implemented by subclass) and storing in the database
28
29package basebuildproc;
30
31eval {require bytes};
32
33use classify;
34use dbutil;
35use doc;
36use docproc;
37use strict;
38no strict 'subs';
39no strict 'refs';
40use util;
41
42BEGIN {
43 @basebuildproc::ISA = ('docproc');
44}
45
46sub new()
47 {
48 my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;
49 my $self = new docproc ();
50
51 # outhandle is where all the debugging info goes
52 # output_handle is where the output of the plugins is piped
53 # to (i.e. mg, database etc.)
54 $outhandle = STDERR unless defined $outhandle;
55
56 $self->{'collection'} = $collection;
57 $self->{'source_dir'} = $source_dir;
58 $self->{'build_dir'} = $build_dir;
59 $self->{'keepold'} = $keepold;
60 $self->{'verbosity'} = $verbosity;
61 $self->{'outhandle'} = $outhandle;
62
63 $self->{'classifiers'} = [];
64 $self->{'mode'} = "text";
65 $self->{'assocdir'} = $build_dir;
66 $self->{'dontdb'} = {};
67 $self->{'store_metadata_coverage'} = "false";
68
69 $self->{'index'} = "section:text";
70 $self->{'indexexparr'} = [];
71
72 $self->{'separate_cjk'} = 0;
73
74 my $found_num_data = 0;
75 my $buildconfigfile = undef;
76
77 if ($keepold) {
78 # For incremental building need to seed num_docs etc from values
79 # stored in build.cfg (if present)
80 $buildconfigfile = &util::filename_cat($build_dir, "build.cfg");
81 if (-e $buildconfigfile) {
82 $found_num_data = 1;
83 }
84 else {
85 # try the index dir
86 $buildconfigfile = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},
87 "index", "build.cfg");
88 if (-e $buildconfigfile) {
89 $found_num_data = 1;
90 }
91 }
92
93 }
94
95 if ($found_num_data)
96 {
97 #print STDERR "Found_Num_Data!\n";
98 my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
99 $self->{'starting_num_docs'} = $buildcfg->{'numdocs'};
100 #print STDERR "- num_docs: $self->{'starting_num_docs'}\n";
101 $self->{'starting_num_sections'} = $buildcfg->{'numsections'};
102 #print STDERR "- num_sections: $self->{'starting_num_sections'}\n";
103 $self->{'starting_num_bytes'} = $buildcfg->{'numbytes'};
104 #print STDERR "- num_bytes: $self->{'starting_num_bytes'}\n";
105 }
106 else
107 {
108 #print STDERR "NOT Found_Num_Data!\n";
109 $self->{'starting_num_docs'} = 0;
110 $self->{'starting_num_sections'} = 0;
111 $self->{'starting_num_bytes'} = 0;
112 }
113
114 $self->{'output_handle'} = "STDOUT";
115 $self->{'num_docs'} = $self->{'starting_num_docs'};
116 $self->{'num_sections'} = $self->{'starting_num_sections'};
117 $self->{'num_bytes'} = $self->{'starting_num_bytes'};
118
119 $self->{'num_processed_bytes'} = 0;
120 $self->{'store_text'} = 1;
121
122 # what level (section/document) the database - indexer intersection is
123 $self->{'db_level'} = "section";
124 #used by browse interface
125 $self->{'doclist'} = [];
126
127 $self->{'indexing_text'} = 0;
128
129 return bless $self, $class;
130
131}
132
133sub reset {
134 my $self = shift (@_);
135
136 $self->{'num_docs'} = $self->{'starting_num_docs'};
137 $self->{'num_sections'} = $self->{'starting_num_sections'};
138 $self->{'num_bytes'} = $self->{'starting_num_bytes'};
139
140 $self->{'num_processed_bytes'} = 0;
141}
142
143sub zero_reset {
144 my $self = shift (@_);
145
146 $self->{'num_docs'} = 0;
147 $self->{'num_sections'} = 0;
148 # reconstructed docs have no text, just metadata, so we need to
149 # remember how many bytes we had initially
150 #$self->{'num_bytes'} = $self->{'starting_num_bytes'};
151 $self->{'num_bytes'} = 0; # we'll store num bytes in db for reconstructed docs.
152 $self->{'num_processed_bytes'} = 0;
153}
154
155sub is_incremental_capable
156{
157 # By default we return 'no' as the answer
158 # Safer to assume non-incremental to start with, and then override in
159 # inherited classes that are.
160
161 return 0;
162}
163
164sub get_num_docs {
165 my $self = shift (@_);
166
167 return $self->{'num_docs'};
168}
169
170sub get_num_sections {
171 my $self = shift (@_);
172
173 return $self->{'num_sections'};
174}
175
176# num_bytes is the actual number of bytes in the collection
177# this is normally the same as what's processed during text compression
178sub get_num_bytes {
179 my $self = shift (@_);
180
181 return $self->{'num_bytes'};
182}
183
184# num_processed_bytes is the number of bytes actually passed
185# to mg for the current index
186sub get_num_processed_bytes {
187 my $self = shift (@_);
188
189 return $self->{'num_processed_bytes'};
190}
191
192sub set_output_handle {
193 my $self = shift (@_);
194 my ($handle) = @_;
195
196 $self->{'output_handle'} = $handle;
197 binmode($handle,":utf8");
198}
199
200
201sub set_mode {
202 my $self = shift (@_);
203 my ($mode) = @_;
204
205 $self->{'mode'} = $mode;
206}
207
208sub get_mode {
209 my $self = shift (@_);
210
211 return $self->{'mode'};
212}
213
214sub set_assocdir {
215 my $self = shift (@_);
216 my ($assocdir) = @_;
217
218 $self->{'assocdir'} = $assocdir;
219}
220
221sub set_dontdb {
222 my $self = shift (@_);
223 my ($dontdb) = @_;
224
225 $self->{'dontdb'} = $dontdb;
226}
227
228sub set_infodbtype
229{
230 my $self = shift(@_);
231 my $infodbtype = shift(@_);
232 $self->{'infodbtype'} = $infodbtype;
233}
234
235sub set_index {
236 my $self = shift (@_);
237 my ($index, $indexexparr) = @_;
238
239 $self->{'index'} = $index;
240 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
241}
242
243sub set_index_languages {
244 my $self = shift (@_);
245 my ($lang_meta, $langarr) = @_;
246 $lang_meta =~ s/^ex\.//; # strip ex. if there
247 $self->{'lang_meta'} = $lang_meta;
248 $self->{'langarr'} = $langarr;
249}
250
251sub get_index {
252 my $self = shift (@_);
253
254 return $self->{'index'};
255}
256
257sub set_classifiers {
258 my $self = shift (@_);
259 my ($classifiers) = @_;
260
261 $self->{'classifiers'} = $classifiers;
262}
263
264sub set_indexing_text {
265 my $self = shift (@_);
266 my ($indexing_text) = @_;
267
268 $self->{'indexing_text'} = $indexing_text;
269}
270
271sub get_indexing_text {
272 my $self = shift (@_);
273
274 return $self->{'indexing_text'};
275}
276
277sub set_store_text {
278 my $self = shift (@_);
279 my ($store_text) = @_;
280
281 $self->{'store_text'} = $store_text;
282}
283
284sub set_store_metadata_coverage {
285 my $self = shift (@_);
286 my ($store_metadata_coverage) = @_;
287
288 $self->{'store_metadata_coverage'} = $store_metadata_coverage || "";
289}
290
291sub get_doc_list {
292 my $self = shift(@_);
293
294 return @{$self->{'doclist'}};
295}
296
297# the standard database level is section, but you may want to change it to document
298sub set_db_level {
299 my $self= shift (@_);
300 my ($db_level) = @_;
301
302 $self->{'db_level'} = $db_level;
303}
304
305sub set_sections_index_document_metadata {
306 my $self= shift (@_);
307 my ($index_type) = @_;
308
309 $self->{'sections_index_document_metadata'} = $index_type;
310}
311
312sub set_separate_cjk {
313 my $self = shift (@_);
314 my ($sep_cjk) = @_;
315
316 $self->{'separate_cjk'} = $sep_cjk;
317}
318
319sub process {
320 my $self = shift (@_);
321 my $method = $self->{'mode'};
322
323 $self->$method(@_);
324}
325
326# post process text depending on field. Currently don't do anything here
327# except cjk separation, and only for indexing
328# should only do this for indexed text (if $self->{'indexing_text'}),
329# but currently search term highlighting doesn't work if you do that.
330# once thats fixed up, then fix this.
331sub filter_text {
332 my $self = shift (@_);
333 my ($field, $text) = @_;
334
335 # lets do cjk seg here
336 my $new_text =$text;
337 if ($self->{'separate_cjk'}) {
338 $new_text = &cnseg::segment($text);
339 }
340 return $new_text;
341}
342
343
344sub infodb_metadata_stats
345{
346 my $self = shift (@_);
347 my ($field,$edit_mode) = @_;
348
349 # Keep some statistics relating to metadata sets used and
350 # frequency of particular metadata fields within each set
351
352 # Union of metadata prefixes and frequency of fields
353 # (both scoped for this document alone, and across whole collection)
354
355 if ($field =~ m/^(.+)\.(.*)$/) {
356 my $prefix = $1;
357 my $core_field = $2;
358
359 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
360 $self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field}++;
361 $self->{'mdprefix_fields'}->{$prefix}->{$core_field}++;
362 }
363 else {
364 # delete
365 $self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field}--;
366 $self->{'mdprefix_fields'}->{$prefix}->{$core_field}--;
367 }
368
369 }
370 elsif ($field =~ m/^[[:upper:]]/) {
371 # implicit 'ex' metadata set
372
373 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
374
375 $self->{'doc_mdprefix_fields'}->{'ex'}->{$field}++;
376 $self->{'mdprefix_fields'}->{'ex'}->{$field}++;
377 }
378 else {
379 # delete
380 $self->{'doc_mdprefix_fields'}->{'ex'}->{$field}--;
381 $self->{'mdprefix_fields'}->{'ex'}->{$field}--;
382 }
383 }
384
385}
386
387
388sub infodbedit {
389 my $self = shift (@_);
390 my ($doc_obj, $filename, $edit_mode) = @_;
391
392 # only output this document if it is a "indexed_doc" or "info_doc" (database only) document
393 my $doctype = $doc_obj->get_doc_type();
394 return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");
395
396 my $archivedir = "";
397 if (defined $filename)
398 {
399 # doc_obj derived directly from file
400 my ($dir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
401 $dir = "" unless defined $dir;
402 $dir =~ s/\\/\//g;
403 $dir =~ s/^\/+//;
404 $dir =~ s/\/+$//;
405
406 $archivedir = $dir;
407
408 if ($edit_mode eq "delete") {
409 # record this doc so we don't process the reconstructed doc later
410 $self->{'dont_process_reconstructed'}->{$doc_obj->get_OID()} = 1;
411 # we don't need to do anything else for the info database for a deleted document. The infodb starts from scratch each time, so no deletion is necessary
412 $self->delete_assoc_files ($archivedir, "delete");
413 return;
414 }
415 if ($edit_mode eq "update") {
416 # we don't want to process the reconstructed doc later, but we will process this version now.
417 $self->{'dont_process_reconstructed'}->{$doc_obj->get_OID()} = 1;
418 # delete the old assoc files as they may have changed
419 $self->delete_assoc_files ($archivedir, "update");
420 }
421
422 # resolve the final filenames of the files associated with this document
423 # now save the new assoc files for an update/new doc.
424 $self->assoc_files ($doc_obj, $archivedir);
425 }
426 else
427 {
428 # doc_obj reconstructed from database (has metadata, doc structure but no text)
429 my $top_section = $doc_obj->get_top_section();
430 $archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
431 }
432
433 # rest of code used for add and update. In both cases, we add to the classifiers and to the info database.
434
435 #add this document to the browse structure
436 push(@{$self->{'doclist'}},$doc_obj->get_OID())
437 unless ($doctype eq "classification");
438 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
439
440 if (!defined $filename) {
441 # a reconstructed doc
442 my $num_reconstructed_bytes = $doc_obj->get_metadata_element ($doc_obj->get_top_section (), "total_numbytes");
443 if (defined $num_reconstructed_bytes) {
444 $self->{'num_bytes'} += $num_reconstructed_bytes;
445 }
446 }
447 # classify the document
448 &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
449
450 # now add all the sections to the infodb.
451
452 # is this a paged or a hierarchical document
453 my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
454
455 my $section = $doc_obj->get_top_section ();
456 my $doc_OID = $doc_obj->get_OID();
457 my $first = 1;
458 my $infodb_handle = $self->{'output_handle'};
459
460 $self->{'doc_mdprefix_fields'} = {};
461
462 while (defined $section)
463 {
464 my $section_OID = $doc_OID;
465 if ($section ne "")
466 {
467 $section_OID = $doc_OID . "." . $section;
468 }
469 my %section_infodb = ();
470
471 # update a few statistics
472 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
473 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
474
475 # output the fact that this document is a document (unless doctype
476 # has been set to something else from within a plugin
477 my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
478 if (!defined $dtype || $dtype !~ /\w/) {
479 $section_infodb{"doctype"} = [ "doc" ];
480 }
481
482 if ($first && defined $filename) {
483 # if we are at the top level of the document, and we are not a reconstructed document, set the total_text_length - used to count bytes when we reconstruct later
484 my $length = $doc_obj->get_total_text_length();
485 $section_infodb{"total_numbytes"} = [ $length ];
486 }
487 # Output whether this node contains text
488 #
489 # If doc_obj reconstructed from database file then no need to
490 # explicitly add <hastxt> as this is preserved as metadata when
491 # the database file is loaded in
492 if (defined $filename)
493 {
494 # doc_obj derived directly from file
495 if ($doc_obj->get_text_length($section) > 0) {
496 $section_infodb{"hastxt"} = [ "1" ];
497 } else {
498 $section_infodb{"hastxt"} = [ "0" ];
499 }
500 }
501
502 # output all the section metadata
503 my $metadata = $doc_obj->get_all_metadata ($section);
504 foreach my $pair (@$metadata) {
505 my ($field, $value) = (@$pair);
506
507 if ($field ne "Identifier" && $field !~ /^gsdl/ &&
508 defined $value && $value ne "") {
509
510 # escape problematic stuff
511 $value =~ s/([^\\])\\([^\\])/$1\\\\$2/g;
512 $value =~ s/\n/\\n/g;
513 $value =~ s/\r/\\r/g;
514 # remove ex. if there
515 $field =~ s/^ex\.//;
516
517 # special case for UTF8URL metadata
518 if ($field =~ m/^UTF8URL$/i) {
519 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle,
520 $value, { 'section' => [ $section_OID ] });
521 }
522
523 if (!defined $self->{'dontdb'}->{$field}) {
524 push(@{$section_infodb{$field}}, $value);
525
526 if ($section eq "" && $self->{'store_metadata_coverage'} =~ /^true$/i)
527 {
528 $self->infodb_metadata_stats($field,$edit_mode);
529 }
530 }
531 }
532 }
533
534 if ($section eq "")
535 {
536 my $doc_mdprefix_fields = $self->{'doc_mdprefix_fields'};
537
538 foreach my $prefix (keys %$doc_mdprefix_fields)
539 {
540 push(@{$section_infodb{"metadataset"}}, $prefix);
541
542 foreach my $field (keys %{$doc_mdprefix_fields->{$prefix}})
543 {
544 push(@{$section_infodb{"metadatalist-$prefix"}}, $field);
545
546 my $val = $doc_mdprefix_fields->{$prefix}->{$field};
547 push(@{$section_infodb{"metadatafreq-$prefix-$field"}}, $val);
548 }
549 }
550 }
551
552 # If doc_obj reconstructed from database file then no need to
553 # explicitly add <archivedir> as this is preserved as metadata when
554 # the database file is loaded in
555 if (defined $filename)
556 {
557 # output archivedir if at top level
558 if ($section eq $doc_obj->get_top_section()) {
559 $section_infodb{"archivedir"} = [ $archivedir ];
560 }
561 }
562
563 # output document display type
564 if ($first) {
565 $section_infodb{"thistype"} = [ $thistype ];
566 }
567
568 if ($self->{'db_level'} eq "document") {
569 # doc num is num_docs not num_sections
570 # output the matching document number
571 $section_infodb{"docnum"} = [ $self->{'num_docs'} ];
572 }
573 else {
574 # output a list of children
575 my $children = $doc_obj->get_children ($section);
576 if (scalar(@$children) > 0) {
577 $section_infodb{"childtype"} = [ $childtype ];
578 my $contains = "";
579 foreach my $child (@$children)
580 {
581 $contains .= ";" unless ($contains eq "");
582 if ($child =~ /^.*?\.(\d+)$/)
583 {
584 $contains .= "\".$1";
585 }
586 else
587 {
588 $contains .= "\".$child";
589 }
590 }
591 $section_infodb{"contains"} = [ $contains ];
592 }
593 # output the matching doc number
594 $section_infodb{"docnum"} = [ $self->{'num_sections'} ];
595 }
596
597 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID, \%section_infodb);
598
599 # output a database entry for the document number, unless we are incremental
600 unless ($self->is_incremental_capable())
601 {
602 if ($self->{'db_level'} eq "document") {
603 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'}, { 'section' => [ $doc_OID ] });
604 }
605 else {
606 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'}, { 'section' => [ $section_OID ] });
607 }
608 }
609
610 $first = 0;
611 $section = $doc_obj->get_next_section($section);
612 last if ($self->{'db_level'} eq "document"); # if no sections wanted, only add the docs
613 } # while defined section
614
615}
616
617
618
619
620sub infodb {
621 my $self = shift (@_);
622 my ($doc_obj, $filename) = @_;
623
624 $self->infodbedit($doc_obj,$filename,"add");
625}
626
627sub infodbreindex {
628 my $self = shift (@_);
629 my ($doc_obj, $filename) = @_;
630
631 $self->infodbedit($doc_obj,$filename,"update");
632}
633
634sub infodbdelete {
635 my $self = shift (@_);
636 my ($doc_obj, $filename) = @_;
637
638 $self->infodbedit($doc_obj,$filename,"delete");
639}
640
641
642sub text {
643 my $self = shift (@_);
644 my ($doc_obj) = @_;
645
646 my $handle = $self->{'outhandle'};
647 print $handle "basebuildproc::text function must be implemented in sub classes\n";
648 die "\n";
649}
650
651sub textreindex
652{
653 my $self = shift @_;
654
655 my $outhandle = $self->{'outhandle'};
656 print $outhandle "basebuildproc::textreindex function must be implemented in sub classes\n";
657 if (!$self->is_incremental_capable()) {
658
659 print $outhandle " This operation is only possible with indexing tools with that support\n";
660 print $outhandle " incremental building\n";
661 }
662 die "\n";
663}
664
665sub textdelete
666{
667 my $self = shift @_;
668
669 my $outhandle = $self->{'outhandle'};
670 print $outhandle "basebuildproc::textdelete function must be implemented in sub classes\n";
671 if (!$self->is_incremental_capable()) {
672
673 print $outhandle " This operation is only possible with indexing tools with that support\n";
674 print $outhandle " incremental building\n";
675 }
676 die "\n";
677}
678
679
680# should the document be indexed - according to the subcollection and language
681# specification.
682sub is_subcollection_doc {
683 my $self = shift (@_);
684 my ($doc_obj) = @_;
685
686 my $indexed_doc = 1;
687 foreach my $indexexp (@{$self->{'indexexparr'}}) {
688 $indexed_doc = 0;
689 my ($field, $exp, $options) = split /\//, $indexexp;
690 if (defined ($field) && defined ($exp)) {
691 my ($bool) = $field =~ /^(.)/;
692 $field =~ s/^.// if $bool eq '!';
693 my @metadata_values;
694 if ($field =~ /^filename$/i) {
695 push(@metadata_values, $doc_obj->get_source_filename());
696 }
697 else {
698 $field =~ s/^ex\.//; #strip ex. if present
699 @metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
700 }
701 next unless @metadata_values;
702 foreach my $metadata_value (@metadata_values) {
703 if ($bool eq '!') {
704 if ($options =~ /^i$/i) {
705 if ($metadata_value !~ /$exp/i) {$indexed_doc = 1; last;}
706 } else {
707 if ($metadata_value !~ /$exp/) {$indexed_doc = 1; last;}
708 }
709 } else {
710 if ($options =~ /^i$/i) {
711 if ($metadata_value =~ /$exp/i) {$indexed_doc = 1; last;}
712 } else {
713 if ($metadata_value =~ /$exp/) {$indexed_doc = 1; last;}
714 }
715 }
716 }
717
718 last if ($indexed_doc == 1);
719 }
720 }
721
722 # if this doc is so far in the sub collection, and we have lang info,
723 # now we check the languages to see if it matches
724 if($indexed_doc && defined $self->{'lang_meta'}) {
725 $indexed_doc = 0;
726 my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
727 if (defined $field) {
728 foreach my $lang (@{$self->{'langarr'}}) {
729 my ($bool) = $lang =~ /^(.)/;
730 if ($bool eq '!') {
731 $lang =~ s/^.//;
732 if ($field !~ /$lang/) {
733 $indexed_doc = 1; last;
734 }
735 } else {
736 if ($field =~ /$lang/) {
737 $indexed_doc = 1; last;
738 }
739 }
740 }
741 }
742 }
743 return $indexed_doc;
744
745}
746
747# use 'Paged' if document has no more than 2 levels
748# and each section at second level has a number for
749# Title metadata
750# also use Paged if gsdlthistype metadata is set to Paged
751sub get_document_type {
752 my $self = shift (@_);
753 my ($doc_obj) = @_;
754
755 my $thistype = "VList";
756 my $childtype = "VList";
757 my $title;
758 my @tmp = ();
759
760 my $section = $doc_obj->get_top_section ();
761
762 my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
763 if (defined $gsdlthistype) {
764 if ($gsdlthistype eq "Paged") {
765 $childtype = "Paged";
766 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
767 $thistype = "Paged";
768 } else {
769 $thistype = "Invisible";
770 }
771
772 return ($thistype, $childtype);
773 } elsif ($gsdlthistype eq "Hierarchy") {
774 return ($thistype, $childtype); # use VList, VList
775 }
776 }
777 my $first = 1;
778 while (defined $section) {
779 @tmp = split /\./, $section;
780 if (scalar(@tmp) > 1) {
781 return ($thistype, $childtype);
782 }
783 if (!$first) {
784 $title = $doc_obj->get_metadata_element ($section, "Title");
785 if (!defined $title || $title !~ /^\d+$/) {
786 return ($thistype, $childtype);
787 }
788 }
789 $first = 0;
790 $section = $doc_obj->get_next_section($section);
791 }
792 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
793 $thistype = "Paged";
794 } else {
795 $thistype = "Invisible";
796 }
797 $childtype = "Paged";
798 return ($thistype, $childtype);
799}
800
801sub assoc_files
802{
803 my $self = shift (@_);
804 my ($doc_obj, $archivedir) = @_;
805 my ($afile);
806
807 foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
808 #rint STDERR "Processing associated file - copy " . $assoc_file->[0] . " to " . $assoc_file->[1] . "\n";
809 # if assoc file starts with a slash, we put it relative to the assoc
810 # dir, otherwise it is relative to the HASH... directory
811 if ($assoc_file->[1] =~ m@^[/\\]@) {
812 $afile = &util::filename_cat($self->{'assocdir'}, $assoc_file->[1]);
813 } else {
814 $afile = &util::filename_cat($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
815 }
816 &util::hard_link ($assoc_file->[0], $afile, $self->{'verbosity'});
817 }
818}
819
820sub delete_assoc_files
821{
822 my $self = shift (@_);
823 my ($archivedir, $edit_mode) = @_;
824
825 my $assoc_dir = &util::filename_cat($self->{'assocdir'}, $archivedir);
826 if (-d $assoc_dir) {
827 &util::rm_r($assoc_dir);
828 }
829}
Note: See TracBrowser for help on using the repository browser.