source: main/trunk/greenstone2/perllib/basebuildproc.pm@ 27306

Last change on this file since 27306 was 27306, checked in by jmt12, 11 years ago

Moving the critical file-related functions (copy, rm, etc) out of util.pm into their own proper class FileUtils. Use of the old functions in util.pm will prompt deprecated warning messages. There may be further functions that could be moved across in the future, but these are the critical ones when considering supporting other filesystems (HTTP, HDFS, WebDav, etc). Updated some key files to use the new functions so now deprecated messages thrown when importing/building demo collection 'out of the box'

  • Property svn:keywords set to Author Date Id Revision
File size: 23.4 KB
Line 
1##########################################################################
2#
3# basebuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document for indexing (should be
27# implemented by subclass) and storing in the database
28
29package basebuildproc;
30
31eval {require bytes};
32
33use classify;
34use dbutil;
35use doc;
36use docproc;
37use strict;
38no strict 'subs';
39no strict 'refs';
40use util;
41use FileUtils;
42
43BEGIN {
44 @basebuildproc::ISA = ('docproc');
45}
46
47sub new()
48 {
49 my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;
50 my $self = new docproc ();
51
52 # outhandle is where all the debugging info goes
53 # output_handle is where the output of the plugins is piped
54 # to (i.e. mg, database etc.)
55 $outhandle = STDERR unless defined $outhandle;
56
57 $self->{'collection'} = $collection;
58 $self->{'source_dir'} = $source_dir;
59 $self->{'build_dir'} = $build_dir;
60 $self->{'keepold'} = $keepold;
61 $self->{'verbosity'} = $verbosity;
62 $self->{'outhandle'} = $outhandle;
63
64 $self->{'classifiers'} = [];
65 $self->{'mode'} = "text";
66 $self->{'assocdir'} = $build_dir;
67 $self->{'dontdb'} = {};
68 $self->{'store_metadata_coverage'} = "false";
69
70 $self->{'index'} = "section:text";
71 $self->{'indexexparr'} = [];
72
73 $self->{'separate_cjk'} = 0;
74
75 my $found_num_data = 0;
76 my $buildconfigfile = undef;
77
78 if ($keepold) {
79 # For incremental building need to seed num_docs etc from values
80 # stored in build.cfg (if present)
81 $buildconfigfile = &FileUtils::filenameConcatenate($build_dir, "build.cfg");
82 if (-e $buildconfigfile) {
83 $found_num_data = 1;
84 }
85 else {
86 # try the index dir
87 $buildconfigfile = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},
88 "index", "build.cfg");
89 if (-e $buildconfigfile) {
90 $found_num_data = 1;
91 }
92 }
93
94 }
95
96 if ($found_num_data)
97 {
98 #print STDERR "Found_Num_Data!\n";
99 my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
100 $self->{'starting_num_docs'} = $buildcfg->{'numdocs'};
101 #print STDERR "- num_docs: $self->{'starting_num_docs'}\n";
102 $self->{'starting_num_sections'} = $buildcfg->{'numsections'};
103 #print STDERR "- num_sections: $self->{'starting_num_sections'}\n";
104 $self->{'starting_num_bytes'} = $buildcfg->{'numbytes'};
105 #print STDERR "- num_bytes: $self->{'starting_num_bytes'}\n";
106 }
107 else
108 {
109 #print STDERR "NOT Found_Num_Data!\n";
110 $self->{'starting_num_docs'} = 0;
111 $self->{'starting_num_sections'} = 0;
112 $self->{'starting_num_bytes'} = 0;
113 }
114
115 $self->{'output_handle'} = "STDOUT";
116 $self->{'num_docs'} = $self->{'starting_num_docs'};
117 $self->{'num_sections'} = $self->{'starting_num_sections'};
118 $self->{'num_bytes'} = $self->{'starting_num_bytes'};
119
120 $self->{'num_processed_bytes'} = 0;
121 $self->{'store_text'} = 1;
122
123 # what level (section/document) the database - indexer intersection is
124 $self->{'db_level'} = "section";
125 #used by browse interface
126 $self->{'doclist'} = [];
127
128 $self->{'indexing_text'} = 0;
129
130 return bless $self, $class;
131
132}
133
134sub reset {
135 my $self = shift (@_);
136
137 $self->{'num_docs'} = $self->{'starting_num_docs'};
138 $self->{'num_sections'} = $self->{'starting_num_sections'};
139 $self->{'num_bytes'} = $self->{'starting_num_bytes'};
140
141 $self->{'num_processed_bytes'} = 0;
142}
143
144sub zero_reset {
145 my $self = shift (@_);
146
147 $self->{'num_docs'} = 0;
148 $self->{'num_sections'} = 0;
149 # reconstructed docs have no text, just metadata, so we need to
150 # remember how many bytes we had initially
151 #$self->{'num_bytes'} = $self->{'starting_num_bytes'};
152 $self->{'num_bytes'} = 0; # we'll store num bytes in db for reconstructed docs.
153 $self->{'num_processed_bytes'} = 0;
154}
155
156sub is_incremental_capable
157{
158 # By default we return 'no' as the answer
159 # Safer to assume non-incremental to start with, and then override in
160 # inherited classes that are.
161
162 return 0;
163}
164
165sub get_num_docs {
166 my $self = shift (@_);
167
168 return $self->{'num_docs'};
169}
170
171sub get_num_sections {
172 my $self = shift (@_);
173
174 return $self->{'num_sections'};
175}
176
177# num_bytes is the actual number of bytes in the collection
178# this is normally the same as what's processed during text compression
179sub get_num_bytes {
180 my $self = shift (@_);
181
182 return $self->{'num_bytes'};
183}
184
185# num_processed_bytes is the number of bytes actually passed
186# to mg for the current index
187sub get_num_processed_bytes {
188 my $self = shift (@_);
189
190 return $self->{'num_processed_bytes'};
191}
192
193sub set_output_handle {
194 my $self = shift (@_);
195 my ($handle) = @_;
196
197 $self->{'output_handle'} = $handle;
198 # The output handle isn't always an actual handle. In a couple of the
199 # database drivers (MSSQL and GDBMServer) it's actually a reference
200 # to an object. Thus we need to test the type before setting binmode.
201 # [jmt12]
202 if (ref $handle eq "GLOB")
203 {
204 binmode($handle,":utf8");
205 }
206}
207
208
209sub set_mode {
210 my $self = shift (@_);
211 my ($mode) = @_;
212
213 $self->{'mode'} = $mode;
214}
215
216sub get_mode {
217 my $self = shift (@_);
218
219 return $self->{'mode'};
220}
221
222sub set_assocdir {
223 my $self = shift (@_);
224 my ($assocdir) = @_;
225
226 $self->{'assocdir'} = $assocdir;
227}
228
229sub set_dontdb {
230 my $self = shift (@_);
231 my ($dontdb) = @_;
232
233 $self->{'dontdb'} = $dontdb;
234}
235
236sub set_infodbtype
237{
238 my $self = shift(@_);
239 my $infodbtype = shift(@_);
240 $self->{'infodbtype'} = $infodbtype;
241}
242
243sub set_index {
244 my $self = shift (@_);
245 my ($index, $indexexparr) = @_;
246
247 $self->{'index'} = $index;
248 $self->{'indexexparr'} = $indexexparr if defined $indexexparr;
249}
250
251sub set_index_languages {
252 my $self = shift (@_);
253 my ($lang_meta, $langarr) = @_;
254 $lang_meta =~ s/^ex\.([^.]+)$/$1/; # strip any ex. namespace iff it's the only namespace prefix (will leave ex.dc.* intact)
255
256 $self->{'lang_meta'} = $lang_meta;
257 $self->{'langarr'} = $langarr;
258}
259
260sub get_index {
261 my $self = shift (@_);
262
263 return $self->{'index'};
264}
265
266sub set_classifiers {
267 my $self = shift (@_);
268 my ($classifiers) = @_;
269
270 $self->{'classifiers'} = $classifiers;
271}
272
273sub set_indexing_text {
274 my $self = shift (@_);
275 my ($indexing_text) = @_;
276
277 $self->{'indexing_text'} = $indexing_text;
278}
279
280sub get_indexing_text {
281 my $self = shift (@_);
282
283 return $self->{'indexing_text'};
284}
285
286sub set_store_text {
287 my $self = shift (@_);
288 my ($store_text) = @_;
289
290 $self->{'store_text'} = $store_text;
291}
292
293sub set_store_metadata_coverage {
294 my $self = shift (@_);
295 my ($store_metadata_coverage) = @_;
296
297 $self->{'store_metadata_coverage'} = $store_metadata_coverage || "";
298}
299
300sub get_doc_list {
301 my $self = shift(@_);
302
303 return @{$self->{'doclist'}};
304}
305
306# the standard database level is section, but you may want to change it to document
307sub set_db_level {
308 my $self= shift (@_);
309 my ($db_level) = @_;
310
311 $self->{'db_level'} = $db_level;
312}
313
314sub set_sections_index_document_metadata {
315 my $self= shift (@_);
316 my ($index_type) = @_;
317
318 $self->{'sections_index_document_metadata'} = $index_type;
319}
320
321sub set_separate_cjk {
322 my $self = shift (@_);
323 my ($sep_cjk) = @_;
324
325 $self->{'separate_cjk'} = $sep_cjk;
326}
327
328sub process {
329 my $self = shift (@_);
330 my $method = $self->{'mode'};
331
332 $self->$method(@_);
333}
334
335# post process text depending on field. Currently don't do anything here
336# except cjk separation, and only for indexing
337# should only do this for indexed text (if $self->{'indexing_text'}),
338# but currently search term highlighting doesn't work if you do that.
339# once thats fixed up, then fix this.
340sub filter_text {
341 my $self = shift (@_);
342 my ($field, $text) = @_;
343
344 # lets do cjk seg here
345 my $new_text =$text;
346 if ($self->{'separate_cjk'}) {
347 $new_text = &cnseg::segment($text);
348 }
349 return $new_text;
350}
351
352
353sub infodb_metadata_stats
354{
355 my $self = shift (@_);
356 my ($field,$edit_mode) = @_;
357
358 # Keep some statistics relating to metadata sets used and
359 # frequency of particular metadata fields within each set
360
361 # Union of metadata prefixes and frequency of fields
362 # (both scoped for this document alone, and across whole collection)
363
364 if ($field =~ m/^(.+)\.(.*)$/) {
365 my $prefix = $1;
366 my $core_field = $2;
367
368 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
369 $self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field}++;
370 $self->{'mdprefix_fields'}->{$prefix}->{$core_field}++;
371 }
372 else {
373 # delete
374 $self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field}--;
375 $self->{'mdprefix_fields'}->{$prefix}->{$core_field}--;
376 }
377
378 }
379 elsif ($field =~ m/^[[:upper:]]/) {
380 # implicit 'ex' metadata set
381
382 if (($edit_mode eq "add") || ($edit_mode eq "update")) {
383
384 $self->{'doc_mdprefix_fields'}->{'ex'}->{$field}++;
385 $self->{'mdprefix_fields'}->{'ex'}->{$field}++;
386 }
387 else {
388 # delete
389 $self->{'doc_mdprefix_fields'}->{'ex'}->{$field}--;
390 $self->{'mdprefix_fields'}->{'ex'}->{$field}--;
391 }
392 }
393
394}
395
396
397sub infodbedit {
398 my $self = shift (@_);
399 my ($doc_obj, $filename, $edit_mode) = @_;
400
401 # only output this document if it is a "indexed_doc" or "info_doc" (database only) document
402 my $doctype = $doc_obj->get_doc_type();
403 return if ($doctype ne "indexed_doc" && $doctype ne "info_doc");
404
405 my $archivedir = "";
406 if (defined $filename)
407 {
408 # doc_obj derived directly from file
409 my ($dir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
410 $dir = "" unless defined $dir;
411 $dir =~ s/\\/\//g;
412 $dir =~ s/^\/+//;
413 $dir =~ s/\/+$//;
414
415 $archivedir = $dir;
416
417 if ($edit_mode eq "delete") {
418 # record this doc so we don't process the reconstructed doc later
419 $self->{'dont_process_reconstructed'}->{$doc_obj->get_OID()} = 1;
420 # we don't need to do anything else for the info database for a deleted document. The infodb starts from scratch each time, so no deletion is necessary
421 $self->delete_assoc_files ($archivedir, "delete");
422 return;
423 }
424 if ($edit_mode eq "update") {
425 # we don't want to process the reconstructed doc later, but we will process this version now.
426 $self->{'dont_process_reconstructed'}->{$doc_obj->get_OID()} = 1;
427 # delete the old assoc files as they may have changed
428 $self->delete_assoc_files ($archivedir, "update");
429 }
430
431 # resolve the final filenames of the files associated with this document
432 # now save the new assoc files for an update/new doc.
433 $self->assoc_files ($doc_obj, $archivedir);
434 }
435 else
436 {
437 # doc_obj reconstructed from database (has metadata, doc structure but no text)
438 my $top_section = $doc_obj->get_top_section();
439 $archivedir = $doc_obj->get_metadata_element($top_section,"archivedir");
440 }
441
442 # rest of code used for add and update. In both cases, we add to the classifiers and to the info database.
443
444 #add this document to the browse structure
445 push(@{$self->{'doclist'}},$doc_obj->get_OID())
446 unless ($doctype eq "classification");
447 $self->{'num_docs'} += 1 unless ($doctype eq "classification");
448
449 if (!defined $filename) {
450 # a reconstructed doc
451 my $num_reconstructed_bytes = $doc_obj->get_metadata_element ($doc_obj->get_top_section (), "total_numbytes");
452 if (defined $num_reconstructed_bytes) {
453 $self->{'num_bytes'} += $num_reconstructed_bytes;
454 }
455 }
456 # classify the document
457 &classify::classify_doc ($self->{'classifiers'}, $doc_obj);
458
459 # now add all the sections to the infodb.
460
461 # is this a paged or a hierarchical document
462 my ($thistype, $childtype) = $self->get_document_type ($doc_obj);
463
464 my $section = $doc_obj->get_top_section ();
465 my $doc_OID = $doc_obj->get_OID();
466 my $first = 1;
467 my $infodb_handle = $self->{'output_handle'};
468
469 $self->{'doc_mdprefix_fields'} = {};
470
471 while (defined $section)
472 {
473 my $section_OID = $doc_OID;
474 if ($section ne "")
475 {
476 $section_OID = $doc_OID . "." . $section;
477 }
478 my %section_infodb = ();
479
480 # update a few statistics
481 $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
482 $self->{'num_sections'} += 1 unless ($doctype eq "classification");
483
484 # output the fact that this document is a document (unless doctype
485 # has been set to something else from within a plugin
486 my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
487 if (!defined $dtype || $dtype !~ /\w/) {
488 $section_infodb{"doctype"} = [ "doc" ];
489 }
490
491 if ($first && defined $filename) {
492 # if we are at the top level of the document, and we are not a reconstructed document, set the total_text_length - used to count bytes when we reconstruct later
493 my $length = $doc_obj->get_total_text_length();
494 $section_infodb{"total_numbytes"} = [ $length ];
495 }
496 # Output whether this node contains text
497 #
498 # If doc_obj reconstructed from database file then no need to
499 # explicitly add <hastxt> as this is preserved as metadata when
500 # the database file is loaded in
501 if (defined $filename)
502 {
503 # doc_obj derived directly from file
504 if ($doc_obj->get_text_length($section) > 0) {
505 $section_infodb{"hastxt"} = [ "1" ];
506 } else {
507 $section_infodb{"hastxt"} = [ "0" ];
508 }
509 }
510
511 # output all the section metadata
512 my $metadata = $doc_obj->get_all_metadata ($section);
513 foreach my $pair (@$metadata) {
514 my ($field, $value) = (@$pair);
515
516 if ($field ne "Identifier" && $field !~ /^gsdl/ &&
517 defined $value && $value ne "") {
518
519 # escape problematic stuff
520 $value =~ s/([^\\])\\([^\\])/$1\\\\$2/g;
521 $value =~ s/\n/\\n/g;
522 $value =~ s/\r/\\r/g;
523 # remove any ex. iff it's the only namespace prefix (will leave ex.dc.* intact)
524 $field =~ s/^ex\.([^.]+)$/$1/; # $field =~ s/^ex\.//;
525
526 # special case for UTF8URL metadata
527 if ($field =~ m/^UTF8URL$/i) {
528 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle,
529 $value, { 'section' => [ $section_OID ] });
530 }
531
532 if (!defined $self->{'dontdb'}->{$field}) {
533 push(@{$section_infodb{$field}}, $value);
534
535 if ($section eq ""
536 && (($self->{'store_metadata_coverage'} =~ /^true$/i)
537 || $self->{'store_metadata_coverage'} eq "1"))
538 {
539 $self->infodb_metadata_stats($field,$edit_mode);
540 }
541 }
542 }
543 }
544
545 if ($section eq "")
546 {
547 my $doc_mdprefix_fields = $self->{'doc_mdprefix_fields'};
548
549 foreach my $prefix (keys %$doc_mdprefix_fields)
550 {
551 push(@{$section_infodb{"metadataset"}}, $prefix);
552
553 foreach my $field (keys %{$doc_mdprefix_fields->{$prefix}})
554 {
555 push(@{$section_infodb{"metadatalist-$prefix"}}, $field);
556
557 my $val = $doc_mdprefix_fields->{$prefix}->{$field};
558 push(@{$section_infodb{"metadatafreq-$prefix-$field"}}, $val);
559 }
560 }
561 }
562
563 # If doc_obj reconstructed from database file then no need to
564 # explicitly add <archivedir> as this is preserved as metadata when
565 # the database file is loaded in
566 if (defined $filename)
567 {
568 # output archivedir if at top level
569 if ($section eq $doc_obj->get_top_section()) {
570 $section_infodb{"archivedir"} = [ $archivedir ];
571 }
572 }
573
574 # output document display type
575 if ($first) {
576 $section_infodb{"thistype"} = [ $thistype ];
577 }
578
579 if ($self->{'db_level'} eq "document") {
580 # doc num is num_docs not num_sections
581 # output the matching document number
582 $section_infodb{"docnum"} = [ $self->{'num_docs'} ];
583 }
584 else {
585 # output a list of children
586 my $children = $doc_obj->get_children ($section);
587 if (scalar(@$children) > 0) {
588 $section_infodb{"childtype"} = [ $childtype ];
589 my $contains = "";
590 foreach my $child (@$children)
591 {
592 $contains .= ";" unless ($contains eq "");
593 if ($child =~ /^.*?\.(\d+)$/)
594 {
595 $contains .= "\".$1";
596 }
597 else
598 {
599 $contains .= "\".$child";
600 }
601 }
602 $section_infodb{"contains"} = [ $contains ];
603 }
604 # output the matching doc number
605 $section_infodb{"docnum"} = [ $self->{'num_sections'} ];
606 }
607
608 if(defined $section_infodb{'assocfilepath'})
609 {
610 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_infodb{'assocfilepath'}[0], { 'contains' => [ $section_OID ]});
611 }
612 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $section_OID, \%section_infodb);
613
614 # output a database entry for the document number, unless we are incremental
615 unless ($self->is_incremental_capable())
616 {
617 if ($self->{'db_level'} eq "document") {
618 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_docs'}, { 'section' => [ $doc_OID ] });
619 }
620 else {
621 &dbutil::write_infodb_entry($self->{'infodbtype'}, $infodb_handle, $self->{'num_sections'}, { 'section' => [ $section_OID ] });
622 }
623 }
624
625 $first = 0;
626 $section = $doc_obj->get_next_section($section);
627 last if ($self->{'db_level'} eq "document"); # if no sections wanted, only add the docs
628 } # while defined section
629
630}
631
632
633
634
635sub infodb {
636 my $self = shift (@_);
637 my ($doc_obj, $filename) = @_;
638
639 $self->infodbedit($doc_obj,$filename,"add");
640}
641
642sub infodbreindex {
643 my $self = shift (@_);
644 my ($doc_obj, $filename) = @_;
645
646 $self->infodbedit($doc_obj,$filename,"update");
647}
648
649sub infodbdelete {
650 my $self = shift (@_);
651 my ($doc_obj, $filename) = @_;
652
653 $self->infodbedit($doc_obj,$filename,"delete");
654}
655
656
657sub text {
658 my $self = shift (@_);
659 my ($doc_obj) = @_;
660
661 my $handle = $self->{'outhandle'};
662 print $handle "basebuildproc::text function must be implemented in sub classes\n";
663 die "\n";
664}
665
666sub textreindex
667{
668 my $self = shift @_;
669
670 my $outhandle = $self->{'outhandle'};
671 print $outhandle "basebuildproc::textreindex function must be implemented in sub classes\n";
672 if (!$self->is_incremental_capable()) {
673
674 print $outhandle " This operation is only possible with indexing tools with that support\n";
675 print $outhandle " incremental building\n";
676 }
677 die "\n";
678}
679
680sub textdelete
681{
682 my $self = shift @_;
683
684 my $outhandle = $self->{'outhandle'};
685 print $outhandle "basebuildproc::textdelete function must be implemented in sub classes\n";
686 if (!$self->is_incremental_capable()) {
687
688 print $outhandle " This operation is only possible with indexing tools with that support\n";
689 print $outhandle " incremental building\n";
690 }
691 die "\n";
692}
693
694
695# should the document be indexed - according to the subcollection and language
696# specification.
697sub is_subcollection_doc {
698 my $self = shift (@_);
699 my ($doc_obj) = @_;
700
701 my $indexed_doc = 1;
702 foreach my $indexexp (@{$self->{'indexexparr'}}) {
703 $indexed_doc = 0;
704 my ($field, $exp, $options) = split /\//, $indexexp;
705 if (defined ($field) && defined ($exp)) {
706 my ($bool) = $field =~ /^(.)/;
707 $field =~ s/^.// if $bool eq '!';
708 my @metadata_values;
709 if ($field =~ /^filename$/i) {
710 push(@metadata_values, $doc_obj->get_source_filename());
711 }
712 else {
713 $field =~ s/^ex\.([^.]+)$/$1/; # remove any ex. iff it's the only namespace prefix (will leave ex.dc.* intact)
714 @metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
715 }
716 next unless @metadata_values;
717 foreach my $metadata_value (@metadata_values) {
718 if ($bool eq '!') {
719 if ($options =~ /^i$/i) {
720 if ($metadata_value !~ /$exp/i) {$indexed_doc = 1; last;}
721 } else {
722 if ($metadata_value !~ /$exp/) {$indexed_doc = 1; last;}
723 }
724 } else {
725 if ($options =~ /^i$/i) {
726 if ($metadata_value =~ /$exp/i) {$indexed_doc = 1; last;}
727 } else {
728 if ($metadata_value =~ /$exp/) {$indexed_doc = 1; last;}
729 }
730 }
731 }
732
733 last if ($indexed_doc == 1);
734 }
735 }
736
737 # if this doc is so far in the sub collection, and we have lang info,
738 # now we check the languages to see if it matches
739 if($indexed_doc && defined $self->{'lang_meta'}) {
740 $indexed_doc = 0;
741 my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
742 if (defined $field) {
743 foreach my $lang (@{$self->{'langarr'}}) {
744 my ($bool) = $lang =~ /^(.)/;
745 if ($bool eq '!') {
746 $lang =~ s/^.//;
747 if ($field !~ /$lang/) {
748 $indexed_doc = 1; last;
749 }
750 } else {
751 if ($field =~ /$lang/) {
752 $indexed_doc = 1; last;
753 }
754 }
755 }
756 }
757 }
758 return $indexed_doc;
759
760}
761
762# use 'Paged' if document has no more than 2 levels
763# and each section at second level has a number for
764# Title metadata
765# also use Paged if gsdlthistype metadata is set to Paged
766sub get_document_type {
767 my $self = shift (@_);
768 my ($doc_obj) = @_;
769
770 my $thistype = "VList";
771 my $childtype = "VList";
772 my $title;
773 my @tmp = ();
774
775 my $section = $doc_obj->get_top_section ();
776
777 my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
778 if (defined $gsdlthistype) {
779 if ($gsdlthistype =~ /^paged$/i) {
780 $childtype = "Paged";
781 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
782 $thistype = "Paged";
783 } else {
784 $thistype = "Invisible";
785 }
786
787 return ($thistype, $childtype);
788 }
789 # gs3 pagedhierarchy option
790 elsif ($gsdlthistype =~ /^pagedhierarchy$/i) {
791 $childtype = "PagedHierarchy";
792 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
793 $thistype = "PagedHierarchy";
794 } else {
795 $thistype = "Invisible";
796 }
797
798 return ($thistype, $childtype);
799 } elsif ($gsdlthistype =~ /^hierarchy$/i) {
800 return ($thistype, $childtype); # use VList, VList
801 }
802 }
803 my $first = 1;
804 while (defined $section) {
805 @tmp = split /\./, $section;
806 if (scalar(@tmp) > 1) {
807 return ($thistype, $childtype);
808 }
809 if (!$first) {
810 $title = $doc_obj->get_metadata_element ($section, "Title");
811 if (!defined $title || $title !~ /^\d+$/) {
812 return ($thistype, $childtype);
813 }
814 }
815 $first = 0;
816 $section = $doc_obj->get_next_section($section);
817 }
818 if ($doc_obj->get_text_length ($doc_obj->get_top_section())) {
819 $thistype = "Paged";
820 } else {
821 $thistype = "Invisible";
822 }
823 $childtype = "Paged";
824 return ($thistype, $childtype);
825}
826
827sub assoc_files
828{
829 my $self = shift (@_);
830 my ($doc_obj, $archivedir) = @_;
831 my ($afile);
832
833 foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
834 #rint STDERR "Processing associated file - copy " . $assoc_file->[0] . " to " . $assoc_file->[1] . "\n";
835 # if assoc file starts with a slash, we put it relative to the assoc
836 # dir, otherwise it is relative to the HASH... directory
837 if ($assoc_file->[1] =~ m@^[/\\]@) {
838 $afile = &FileUtils::filenameConcatenate($self->{'assocdir'}, $assoc_file->[1]);
839 } else {
840 $afile = &FileUtils::filenameConcatenate($self->{'assocdir'}, $archivedir, $assoc_file->[1]);
841 }
842
843 &FileUtils::hardLink($assoc_file->[0], $afile, $self->{'verbosity'});
844 }
845}
846
847sub delete_assoc_files
848{
849 my $self = shift (@_);
850 my ($archivedir, $edit_mode) = @_;
851
852 my $assoc_dir = &FileUtils::filenameConcatenate($self->{'assocdir'}, $archivedir);
853 if (-d $assoc_dir) {
854 &FileUtils::removeFilesRecursive($assoc_dir);
855 }
856}
Note: See TracBrowser for help on using the repository browser.