source: main/trunk/greenstone2/perllib/basebuildproc.pm@ 35401

Last change on this file since 35401 was 33302, checked in by ak19, 5 years ago
  1. Adding GPSMapOverlayLabel extracted from GPS.mapOverlay meta to text indexes for searching, as with Coordinate and CoordShort. 2. Added a shortname for this index, ML for MapLabel. 3. On testing the indexing of the GPSMapOverlayLabel text, the old problem of increasingly duplicated Coordinate/CoordShort and now also GPSMapOverlayLabel meta in the infodb reappeared. Dr Bainbridge explained why this was (documented as comments in this commit) and fixed the problem by not processing GPS.mapOverlay meta into Coordinate and Label meta during the infodb pass (and dummy pass, so specifically the non-text passes) of buildcol. A natural consequence is that to check whether Coord and Label meta have been indexed, one can no longer check the index/text/col.jdb but needs to use Luke (if a lucene collection) to check the contents of index/sidx and index/didx. 4. An important change needed for the bugfix in 3 is reordering the call to &classify::reconstruct_doc_objs_metadata() in basebuilder.pm to take place AFTER build_proc->set_mode(infodb) has taken place. 5. Changed cross-files global variables declared in doc.pm from our to my variables and tested this works.
  • Property svn:keywords set to Author Date Id Revision
File size: 23.6 KB
Line 
1##########################################################################
2#
3# basebuildproc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# This document processor outputs a document for indexing (should be
27# implemented by subclass) and storing in the database
28
29package basebuildproc;
30
31eval {require bytes};
32
33use classify;
34use dbutil;
35use doc;
36use docproc;
37use strict;
38no strict 'subs';
39no strict 'refs';
40use util;
41use FileUtils;
42
# Set up inheritance at compile time: basebuildproc derives from docproc.
BEGIN {
    @basebuildproc::ISA = qw(docproc);
}
46
# Constructor.
#
# Arguments (after the class name):
#   $collection, $source_dir, $build_dir - stored verbatim on the object
#   $keepold   - true when building incrementally; the starting counters are
#                then seeded from an existing build.cfg (if one is found)
#   $verbosity - stored verbatim on the object
#   $outhandle - where all the debugging info goes (STDERR when undefined)
#
# Returns the docproc-created object re-blessed into $class.
sub new()
{
    my ($class, $collection, $source_dir, $build_dir, $keepold, $verbosity, $outhandle) = @_;

    my $self = docproc->new();

    # outhandle is where all the debugging info goes
    # output_handle is where the output of the plugins is piped
    # to (i.e. mg, database etc.)
    $outhandle = STDERR if (!defined $outhandle);

    $self->{'collection'} = $collection;
    $self->{'source_dir'} = $source_dir;
    $self->{'build_dir'}  = $build_dir;
    $self->{'keepold'}    = $keepold;
    $self->{'verbosity'}  = $verbosity;
    $self->{'outhandle'}  = $outhandle;

    # defaults that the various set_* methods may override later
    $self->{'classifiers'}             = [];
    $self->{'mode'}                    = "text";
    $self->{'assocdir'}                = $build_dir;
    $self->{'dontdb'}                  = {};
    $self->{'store_metadata_coverage'} = "false";
    $self->{'index'}                   = "section:text";
    $self->{'indexexparr'}             = [];
    $self->{'separate_cjk'}            = 0;

    # For incremental building need to seed num_docs etc from values
    # stored in build.cfg (if present): look in the build dir first,
    # then fall back to the collection's index dir.
    my $buildconfigfile = undef;
    my $found_num_data  = 0;
    if ($keepold) {
        my @candidate_paths = (
            [ $build_dir, "build.cfg" ],
            [ $ENV{'GSDLCOLLECTDIR'}, "index", "build.cfg" ],
        );
        foreach my $path_parts (@candidate_paths) {
            $buildconfigfile = &FileUtils::filenameConcatenate(@$path_parts);
            if (-e $buildconfigfile) {
                $found_num_data = 1;
                last;
            }
        }
    }

    my ($start_docs, $start_sections, $start_bytes) = (0, 0, 0);
    if ($found_num_data) {
        # NOTE(review): colcfg is not loaded by this file; presumably a
        # caller has already loaded it - confirm.
        my $buildcfg = &colcfg::read_build_cfg($buildconfigfile);
        $start_docs     = $buildcfg->{'numdocs'};
        $start_sections = $buildcfg->{'numsections'};
        $start_bytes    = $buildcfg->{'numbytes'};
    }
    $self->{'starting_num_docs'}     = $start_docs;
    $self->{'starting_num_sections'} = $start_sections;
    $self->{'starting_num_bytes'}    = $start_bytes;

    $self->{'output_handle'} = "STDOUT";

    # the running counters start from the seeded values
    $self->{'num_docs'}     = $self->{'starting_num_docs'};
    $self->{'num_sections'} = $self->{'starting_num_sections'};
    $self->{'num_bytes'}    = $self->{'starting_num_bytes'};

    $self->{'num_processed_bytes'} = 0;
    $self->{'store_text'}          = 1;

    # what level (section/document) the database - indexer intersection is
    $self->{'db_level'} = "section";

    # used by browse interface
    $self->{'doclist'} = [];

    $self->{'indexing_text'} = 0;

    return bless $self, $class;
}
133
# Put the running document/section/byte counters back to their starting
# values and zero the processed-bytes counter.
sub reset {
    my ($self) = @_;

    foreach my $counter ('num_docs', 'num_sections', 'num_bytes') {
        $self->{$counter} = $self->{"starting_$counter"};
    }

    $self->{'num_processed_bytes'} = 0;
}
143
# Zero all the running counters, ignoring the starting values.
# num_bytes is zeroed as well (rather than restored from
# starting_num_bytes): reconstructed docs have no text, just metadata,
# so their byte counts are stored in the db instead.
sub zero_reset {
    my ($self) = @_;

    foreach my $counter ('num_docs', 'num_sections', 'num_bytes') {
        $self->{$counter} = 0;
    }
    $self->{'num_processed_bytes'} = 0;
}
155
# Whether this build processor can build incrementally.
# The base class answers 'no' (0): it is safer to assume non-incremental
# to start with; inherited classes that are capable override this.
sub is_incremental_capable
{
    return 0;
}
164
# Returns the current value of the num_docs counter.
sub get_num_docs {
    my ($self) = @_;
    return $self->{'num_docs'};
}
170
# Returns the current value of the num_sections counter.
sub get_num_sections {
    my ($self) = @_;
    return $self->{'num_sections'};
}
176
# Returns num_bytes: the actual number of bytes in the collection.
# This is normally the same as what's processed during text compression.
sub get_num_bytes {
    my ($self) = @_;
    return $self->{'num_bytes'};
}
184
# Returns num_processed_bytes: the number of bytes actually passed
# to mg for the current index.
sub get_num_processed_bytes {
    my ($self) = @_;
    return $self->{'num_processed_bytes'};
}
192
# Records where the processor's output is to be written.
#
# The output handle isn't always an actual handle. In a couple of the
# database drivers (MSSQL and GDBMServer) it's actually a reference
# to an object. Thus the type is tested before setting binmode, and
# only plain GLOB references are switched to the :utf8 layer. [jmt12]
sub set_output_handle {
    my ($self, $handle) = @_;

    $self->{'output_handle'} = $handle;

    if (ref($handle) eq "GLOB")
    {
        binmode($handle, ":utf8");
    }
}
207
208
# Sets the current processing mode (the name of the method that
# process() dispatches to) and mirrors it into $doc::processor_mode,
# because doc.pm needs to know what buildcol pass we're at.
sub set_mode {
    my ($self, $mode) = @_;

    $self->{'mode'} = $mode;
    $doc::processor_mode = $mode;
}
216
# Returns the current processing mode.
sub get_mode {
    my ($self) = @_;
    return $self->{'mode'};
}
222
# Sets the directory under which associated files are placed
# (see assoc_files / delete_assoc_files).
sub set_assocdir {
    my ($self, $assocdir) = @_;
    $self->{'assocdir'} = $assocdir;
}
229
# Sets the hash (by reference) of metadata field names that must not be
# written to the info database; infodbedit skips any field that has a
# defined entry in it.
sub set_dontdb {
    my ($self, $dontdb) = @_;
    $self->{'dontdb'} = $dontdb;
}
236
# Sets the info database type that is passed to
# dbutil::write_infodb_entry when entries are written.
sub set_infodbtype
{
    my ($self, $infodbtype) = @_;
    $self->{'infodbtype'} = $infodbtype;
}
243
# Sets the current index specification and, optionally, the array of
# subcollection index expressions. The expression array is left
# untouched when $indexexparr is not defined.
sub set_index {
    my ($self, $index, $indexexparr) = @_;

    $self->{'index'} = $index;
    $self->{'indexexparr'} = $indexexparr if (defined $indexexparr);
}
251
# Sets the metadata field that holds a document's language and the array
# of language expressions that is_subcollection_doc matches it against.
sub set_index_languages {
    my ($self, $lang_meta, $langarr) = @_;

    # strip any ex. namespace iff it's the only namespace prefix
    # (will leave ex.dc.* intact)
    $lang_meta =~ s/^ex\.([^.]+)$/$1/;

    $self->{'lang_meta'} = $lang_meta;
    $self->{'langarr'} = $langarr;
}
260
# Returns the current index specification.
sub get_index {
    my ($self) = @_;
    return $self->{'index'};
}
266
# Sets the classifiers that each document is handed to (via
# classify::classify_doc) during the infodb pass.
sub set_classifiers {
    my ($self, $classifiers) = @_;
    $self->{'classifiers'} = $classifiers;
}
273
# Sets the indexing_text flag.
sub set_indexing_text {
    my ($self, $indexing_text) = @_;
    $self->{'indexing_text'} = $indexing_text;
}
280
# Returns the indexing_text flag.
sub get_indexing_text {
    my ($self) = @_;
    return $self->{'indexing_text'};
}
286
# Sets the store_text flag.
sub set_store_text {
    my ($self, $store_text) = @_;
    $self->{'store_text'} = $store_text;
}
293
# Sets the store_metadata_coverage option. Any false value (including
# undef) is stored as the empty string; infodbedit gathers per-field
# metadata statistics when the option is "true" (any case) or "1".
sub set_store_metadata_coverage {
    my ($self, $store_metadata_coverage) = @_;

    $self->{'store_metadata_coverage'} = $store_metadata_coverage ? $store_metadata_coverage : "";
}
300
# Returns (as a list) the OIDs of the documents that infodbedit has
# added to the browse structure so far.
sub get_doc_list {
    my ($self) = @_;

    my $doclist = $self->{'doclist'};
    return @$doclist;
}
306
# Sets the level (section/document) at which entries are written to the
# database. The standard database level is section, but you may want
# to change it to document.
sub set_db_level {
    my ($self, $db_level) = @_;
    $self->{'db_level'} = $db_level;
}
314
# Stores the sections_index_document_metadata setting on the object.
# (This setting is not read anywhere in this file.)
sub set_sections_index_document_metadata {
    my ($self, $index_type) = @_;
    $self->{'sections_index_document_metadata'} = $index_type;
}
321
# Sets the flag that makes filter_text run CJK segmentation.
sub set_separate_cjk {
    my ($self, $sep_cjk) = @_;
    $self->{'separate_cjk'} = $sep_cjk;
}
328
# Dispatches to the method named by the current mode (see set_mode),
# handing it the remaining arguments unchanged.
sub process {
    my $self = shift (@_);

    my $handler = $self->{'mode'};
    return $self->$handler(@_);
}
335
# Post process text depending on field. Currently nothing is done here
# except cjk separation ($field is not consulted).
# This should only happen for indexed text (if $self->{'indexing_text'}),
# but currently search term highlighting doesn't work if you do that.
# Once that's fixed up, then fix this.
#
# Returns $text unchanged unless the separate_cjk flag is set, in which
# case the result of cnseg::segment($text) is returned.
sub filter_text {
    my ($self, $field, $text) = @_;

    return $text unless ($self->{'separate_cjk'});

    # lets do cjk seg here
    my $segmented = &cnseg::segment($text);
    return $segmented;
}
352
353
# Keep some statistics relating to metadata sets used and the
# frequency of particular metadata fields within each set.
#
# Two tallies are maintained, both keyed {prefix}->{field}:
#   doc_mdprefix_fields - scoped to the current document
#                         (infodbedit resets it for each document)
#   mdprefix_fields     - across every document processed
#
# A field of the form "prefix.name" is counted under that prefix (the
# split is at the last '.'); a field with no '.' that begins with an
# uppercase letter is counted under the implicit 'ex' metadata set;
# anything else is ignored. The count goes up for $edit_mode "add" or
# "update" and down for any other mode (i.e. delete).
sub infodb_metadata_stats
{
    my ($self, $field, $edit_mode) = @_;

    my ($prefix, $core_field);
    if ($field =~ m/^(.+)\.(.*)$/) {
        ($prefix, $core_field) = ($1, $2);
    }
    elsif ($field =~ m/^[[:upper:]]/) {
        # implicit 'ex' metadata set
        ($prefix, $core_field) = ('ex', $field);
    }
    else {
        return;
    }

    my $is_addition = (($edit_mode eq "add") || ($edit_mode eq "update"));
    my $delta = $is_addition ? 1 : -1;

    $self->{'doc_mdprefix_fields'}->{$prefix}->{$core_field} += $delta;
    $self->{'mdprefix_fields'}->{$prefix}->{$core_field} += $delta;
}
396
397
# Adds, updates or deletes one document in the info database.
#
# Arguments:
#   $doc_obj   - the document object
#   $filename  - the file the document was read from, or undef when the
#                doc_obj was reconstructed from the database (it then has
#                metadata and doc structure but no text)
#   $edit_mode - "add", "update" or "delete"
#
# Only "indexed_doc" and "info_doc" (database only) documents are
# processed; every other doc type returns immediately.
#
# For a document that came from a file:
#   delete - the OID is recorded in dont_process_reconstructed, its
#            associated files are removed, and nothing else is done (the
#            infodb starts from scratch each time, so no deletion is
#            necessary there)
#   update - as for delete, but processing then continues as for add
#   add    - the associated files are put in place
# For add and update the document is then classified and one info database
# entry is written per section (only the top section when db_level is
# "document").
sub infodbedit {
    my ($self, $doc_obj, $filename, $edit_mode) = @_;

    # only output this document if it is a "indexed_doc" or "info_doc" (database only) document
    my $doctype = $doc_obj->get_doc_type();
    return unless ($doctype eq "indexed_doc" || $doctype eq "info_doc");

    my $from_file   = defined $filename;
    my $is_real_doc = ($doctype ne "classification");

    my $archivedir = "";
    if ($from_file)
    {
        # doc_obj derived directly from file: the archive dir is the
        # directory part of the filename, with forward slashes only and
        # no leading or trailing slashes
        my ($dir) = $filename =~ /^(.*?)(?:\/|\\)[^\/\\]*$/;
        $archivedir = (defined $dir) ? $dir : "";
        $archivedir =~ s/\\/\//g;
        $archivedir =~ s/^\/+//;
        $archivedir =~ s/\/+$//;

        if ($edit_mode eq "delete" || $edit_mode eq "update") {
            # record this doc so we don't process the reconstructed doc later
            # (for an update, this version is processed now instead)
            $self->{'dont_process_reconstructed'}->{$doc_obj->get_OID()} = 1;
            # delete the old assoc files - for an update they may have changed
            $self->delete_assoc_files ($archivedir, $edit_mode);
            # we don't need to do anything else for the info database for a
            # deleted document. The infodb starts from scratch each time, so
            # no deletion is necessary
            return if ($edit_mode eq "delete");
        }

        # resolve the final filenames of the files associated with this document
        # now save the new assoc files for an update/new doc.
        $self->assoc_files ($doc_obj, $archivedir);
    }
    else
    {
        # doc_obj reconstructed from database (has metadata, doc structure but no text)
        $archivedir = $doc_obj->get_metadata_element ($doc_obj->get_top_section(), "archivedir");
    }

    # rest of code used for add and update. In both cases, we add to the
    # classifiers and to the info database.

    # add this document to the browse structure
    if ($is_real_doc) {
        push(@{$self->{'doclist'}}, $doc_obj->get_OID());
        $self->{'num_docs'} += 1;
    }

    if (!$from_file) {
        # a reconstructed doc carries its byte count as metadata
        my $num_reconstructed_bytes = $doc_obj->get_metadata_element ($doc_obj->get_top_section (), "total_numbytes");
        $self->{'num_bytes'} += $num_reconstructed_bytes if (defined $num_reconstructed_bytes);
    }

    # classify the document
    &classify::classify_doc ($self->{'classifiers'}, $doc_obj);

    # now add all the sections to the infodb.

    # is this a paged or a hierarchical document
    my ($thistype, $childtype) = $self->get_document_type ($doc_obj);

    my $section       = $doc_obj->get_top_section ();
    my $doc_OID       = $doc_obj->get_OID();
    my $infodb_handle = $self->{'output_handle'};
    my $infodbtype    = $self->{'infodbtype'};
    my $doc_level_db  = ($self->{'db_level'} eq "document");
    my $coverage      = $self->{'store_metadata_coverage'};
    my $gather_stats  = (($coverage =~ /^true$/i) || $coverage eq "1");
    my $at_top        = 1;

    # per-document metadata statistics start afresh
    $self->{'doc_mdprefix_fields'} = {};

    while (defined $section)
    {
        my $section_OID = ($section ne "") ? $doc_OID . "." . $section : $doc_OID;
        my %section_infodb = ();

        # update a few statistics
        $self->{'num_bytes'} += $doc_obj->get_text_length ($section);
        $self->{'num_sections'} += 1 if ($is_real_doc);

        # output the fact that this document is a document (unless doctype
        # has been set to something else from within a plugin
        my $dtype = $doc_obj->get_metadata_element ($section, "doctype");
        unless (defined $dtype && $dtype =~ /\w/) {
            $section_infodb{"doctype"} = [ "doc" ];
        }

        if ($from_file)
        {
            if ($at_top) {
                # if we are at the top level of the document, and we are not a
                # reconstructed document, set the total_text_length - used to
                # count bytes when we reconstruct later
                $section_infodb{"total_numbytes"} = [ $doc_obj->get_total_text_length() ];
            }

            # Output whether this node contains text
            #
            # If doc_obj reconstructed from database file then no need to
            # explicitly add <hastxt> as this is preserved as metadata when
            # the database file is loaded in
            my $has_text = ($doc_obj->get_text_length($section) > 0);
            $section_infodb{"hastxt"} = [ $has_text ? "1" : "0" ];
        }

        # output all the section metadata
        foreach my $pair (@{$doc_obj->get_all_metadata ($section)}) {
            my ($field, $value) = @$pair;

            next if ($field eq "Identifier" || $field =~ /^gsdl/);
            next unless (defined $value && $value ne "");

            # escape problematic stuff
            $value =~ s/([^\\])\\([^\\])/$1\\\\$2/g;
            $value =~ s/\n/\\n/g;
            $value =~ s/\r/\\r/g;
            # remove any ex. iff it's the only namespace prefix (will leave ex.dc.* intact)
            $field =~ s/^ex\.([^.]+)$/$1/;

            # special case for UTF8URL metadata: also write an entry keyed by
            # the value itself that points back at this section
            if ($field =~ m/^UTF8URL$/i) {
                &dbutil::write_infodb_entry($infodbtype, $infodb_handle,
                                            $value, { 'section' => [ $section_OID ] });
            }

            next if (defined $self->{'dontdb'}->{$field});

            push(@{$section_infodb{$field}}, $value);

            # metadata coverage statistics are gathered for the top section only
            if ($section eq "" && $gather_stats) {
                $self->infodb_metadata_stats($field, $edit_mode);
            }
        }

        if ($section eq "")
        {
            # record which metadata sets and fields this document used,
            # and how often each field occurred
            my $doc_mdprefix_fields = $self->{'doc_mdprefix_fields'};

            foreach my $prefix (keys %$doc_mdprefix_fields)
            {
                push(@{$section_infodb{"metadataset"}}, $prefix);

                my $freq_of = $doc_mdprefix_fields->{$prefix};
                foreach my $md_field (keys %$freq_of)
                {
                    push(@{$section_infodb{"metadatalist-$prefix"}}, $md_field);
                    push(@{$section_infodb{"metadatafreq-$prefix-$md_field"}}, $freq_of->{$md_field});
                }
            }
        }

        # If doc_obj reconstructed from database file then no need to
        # explicitly add <archivedir> as this is preserved as metadata when
        # the database file is loaded in.
        # Otherwise output archivedir if at top level
        if ($from_file && $section eq $doc_obj->get_top_section()) {
            $section_infodb{"archivedir"} = [ $archivedir ];
        }

        # output document display type
        $section_infodb{"thistype"} = [ $thistype ] if ($at_top);

        if ($doc_level_db) {
            # doc num is num_docs not num_sections
            # output the matching document number
            $section_infodb{"docnum"} = [ $self->{'num_docs'} ];
        }
        else {
            # output a list of children
            my $children = $doc_obj->get_children ($section);
            if (scalar(@$children) > 0) {
                $section_infodb{"childtype"} = [ $childtype ];
                # each child is written as ". followed by the final numeric
                # component of its section name (or the whole name when it
                # has no such component); entries are separated by ;
                my @entries = map {
                    '".' . (($_ =~ /^.*?\.(\d+)$/) ? $1 : $_)
                } @$children;
                $section_infodb{"contains"} = [ join(";", @entries) ];
            }
            # output the matching doc number
            $section_infodb{"docnum"} = [ $self->{'num_sections'} ];
        }

        if (defined $section_infodb{'assocfilepath'})
        {
            &dbutil::write_infodb_entry($infodbtype, $infodb_handle, $section_infodb{'assocfilepath'}[0], { 'contains' => [ $section_OID ]});
        }
        &dbutil::write_infodb_entry($infodbtype, $infodb_handle, $section_OID, \%section_infodb);

        # output a database entry for the document number, unless we are incremental
        if (!$self->is_incremental_capable())
        {
            my ($docnum_key, $target_OID) = $doc_level_db
                ? ($self->{'num_docs'}, $doc_OID)
                : ($self->{'num_sections'}, $section_OID);
            &dbutil::write_infodb_entry($infodbtype, $infodb_handle, $docnum_key, { 'section' => [ $target_OID ] });
        }

        $at_top = 0;
        $section = $doc_obj->get_next_section($section);
        last if ($doc_level_db); # if no sections wanted, only add the docs
    } # while defined section

}
632
633
634
635
# The "infodb" mode handler: adds a document to the info database
# (infodbedit in "add" mode).
sub infodb {
    my ($self, $doc_obj, $filename) = @_;
    return $self->infodbedit($doc_obj, $filename, "add");
}
642
# The "infodbreindex" mode handler: re-processes a changed document
# (infodbedit in "update" mode).
sub infodbreindex {
    my ($self, $doc_obj, $filename) = @_;
    return $self->infodbedit($doc_obj, $filename, "update");
}
649
# The "infodbdelete" mode handler: handles a removed document
# (infodbedit in "delete" mode).
sub infodbdelete {
    my ($self, $doc_obj, $filename) = @_;
    return $self->infodbedit($doc_obj, $filename, "delete");
}
656
657
# The "text" mode handler. The base class has no implementation: it
# reports that on the debugging handle and dies, so sub classes must
# supply their own.
sub text {
    my ($self, $doc_obj) = @_;

    my $outhandle = $self->{'outhandle'};
    print $outhandle "basebuildproc::text function must be implemented in sub classes\n";
    die "\n";
}
666
# The "textreindex" mode handler. The base class has no implementation:
# it reports that on the debugging handle (adding a hint when this
# processor is not incremental capable) and dies, so sub classes must
# supply their own.
sub textreindex
{
    my $self = shift @_;

    my $outhandle = $self->{'outhandle'};
    print $outhandle "basebuildproc::textreindex function must be implemented in sub classes\n";
    if (!$self->is_incremental_capable()) {
        # wording corrected: this used to read "indexing tools with that support"
        print $outhandle " This operation is only possible with indexing tools that support\n";
        print $outhandle " incremental building\n";
    }
    die "\n";
}
680
# The "textdelete" mode handler. The base class has no implementation:
# it reports that on the debugging handle (adding a hint when this
# processor is not incremental capable) and dies, so sub classes must
# supply their own.
sub textdelete
{
    my $self = shift @_;

    my $outhandle = $self->{'outhandle'};
    print $outhandle "basebuildproc::textdelete function must be implemented in sub classes\n";
    if (!$self->is_incremental_capable()) {
        # wording corrected: this used to read "indexing tools with that support"
        print $outhandle " This operation is only possible with indexing tools that support\n";
        print $outhandle " incremental building\n";
    }
    die "\n";
}
694
695
# Should the document be indexed - according to the subcollection and
# language specification.
#
# Subcollection expressions ($self->{'indexexparr'}) have the form
# "field/regexp/options". A leading '!' on the field negates the match,
# the field "filename" (any case) tests the document's source filename
# rather than metadata, and the option "i" makes the match case
# insensitive. With no expressions every document qualifies; otherwise
# the document qualifies as soon as one expression accepts one of its
# top-section values.
#
# If the document qualifies so far and a language metadata field has been
# set (see set_index_languages), the top-section value of that field must
# additionally satisfy at least one entry of $self->{'langarr'}; an entry
# with a leading '!' is satisfied when the value does NOT match it.
#
# Returns 1 when the document should be indexed, 0 otherwise.
sub is_subcollection_doc {
    my $self = shift (@_);
    my ($doc_obj) = @_;

    my $indexed_doc = 1;
    foreach my $indexexp (@{$self->{'indexexparr'}}) {
        $indexed_doc = 0;
        my ($field, $exp, $options) = split /\//, $indexexp;
        if (defined ($field) && defined ($exp)) {
            my ($bool) = $field =~ /^(.)/;
            $field =~ s/^.// if $bool eq '!';
            my @metadata_values;
            if ($field =~ /^filename$/i) {
                push(@metadata_values, $doc_obj->get_source_filename());
            }
            else {
                $field =~ s/^ex\.([^.]+)$/$1/; # remove any ex. iff it's the only namespace prefix (will leave ex.dc.* intact)
                @metadata_values = @{$doc_obj->get_metadata($doc_obj->get_top_section(), $field)};
            }
            next unless @metadata_values;
            foreach my $metadata_value (@metadata_values) {
                if ($bool eq '!') {
                    if (defined $options && $options =~ /^i$/i) {
                        if ($metadata_value !~ /$exp/i) {$indexed_doc = 1; last;}
                    } else {
                        if ($metadata_value !~ /$exp/) {$indexed_doc = 1; last;}
                    }
                } else {
                    if (defined $options && $options =~ /^i$/i) {
                        if ($metadata_value =~ /$exp/i) {$indexed_doc = 1; last;}
                    } else {
                        if ($metadata_value =~ /$exp/) {$indexed_doc = 1; last;}
                    }
                }
            }

            last if ($indexed_doc == 1);
        }
    }

    # if this doc is so far in the sub collection, and we have lang info,
    # now we check the languages to see if it matches
    if ($indexed_doc && defined $self->{'lang_meta'}) {
        $indexed_doc = 0;
        my $field = $doc_obj->get_metadata_element($doc_obj->get_top_section(), $self->{'lang_meta'});
        if (defined $field) {
            foreach my $lang_spec (@{$self->{'langarr'}}) {
                # Work on a copy: the foreach variable aliases the element
                # of langarr, so stripping the '!' in place would turn a
                # negated entry into a positive one for every document
                # checked after this one.
                my $lang = $lang_spec;
                my ($bool) = $lang =~ /^(.)/;
                if ($bool eq '!') {
                    $lang =~ s/^.//;
                    if ($field !~ /$lang/) {
                        $indexed_doc = 1; last;
                    }
                } else {
                    if ($field =~ /$lang/) {
                        $indexed_doc = 1; last;
                    }
                }
            }
        }
    }
    return $indexed_doc;

}
762
# Works out how a document and its children are to be displayed.
# Returns the pair ($thistype, $childtype).
#
# - gsdlthistype metadata on the top section decides when it is "paged",
#   "pagedhierarchy" (the gs3 pagedhierarchy option) or "hierarchy"
#   (any case)
# - otherwise use 'Paged' if the document has no more than 2 levels and
#   each section at the second level has a number for Title metadata
# - otherwise ("VList", "VList")
#
# For the paged variants the document itself is "Invisible" when its top
# section has no text.
sub get_document_type {
    my ($self, $doc_obj) = @_;

    my $section = $doc_obj->get_top_section ();

    # display type of the document itself for a paged-style document
    my $top_level_type = sub {
        my ($type_with_text) = @_;
        return $doc_obj->get_text_length ($doc_obj->get_top_section())
            ? $type_with_text
            : "Invisible";
    };

    my $gsdlthistype = $doc_obj->get_metadata_element ($section, "gsdlthistype");
    if (defined $gsdlthistype) {
        if ($gsdlthistype =~ /^paged$/i) {
            return ($top_level_type->("Paged"), "Paged");
        }
        # gs3 pagedhierarchy option
        if ($gsdlthistype =~ /^pagedhierarchy$/i) {
            return ($top_level_type->("PagedHierarchy"), "PagedHierarchy");
        }
        if ($gsdlthistype =~ /^hierarchy$/i) {
            return ("VList", "VList");
        }
    }

    # Walk every section: any section deeper than the second level, or a
    # second-level section whose Title is not a number, means hierarchical.
    my $at_top = 1;
    for (; defined $section; $section = $doc_obj->get_next_section($section)) {
        my @levels = split /\./, $section;
        return ("VList", "VList") if (scalar(@levels) > 1);

        unless ($at_top) {
            my $title = $doc_obj->get_metadata_element ($section, "Title");
            return ("VList", "VList") unless (defined $title && $title =~ /^\d+$/);
        }
        $at_top = 0;
    }

    return ($top_level_type->("Paged"), "Paged");
}
827
# Hard links each of the document's associated files into the association
# directory. Each entry of get_assoc_files() is used as a pair:
# element 0 is the source file, element 1 the name it is to have.
#
# If the assoc file name starts with a slash, we put it relative to the
# assoc dir, otherwise it is relative to the HASH... directory
# ($archivedir).
sub assoc_files
{
    my ($self, $doc_obj, $archivedir) = @_;

    foreach my $assoc_file (@{$doc_obj->get_assoc_files()}) {
        my ($source, $target_name) = ($assoc_file->[0], $assoc_file->[1]);

        my @path_parts = ($self->{'assocdir'});
        push(@path_parts, $archivedir) unless ($target_name =~ m@^[/\\]@);

        my $afile = &FileUtils::filenameConcatenate(@path_parts, $target_name);
        &FileUtils::hardLink($source, $afile, $self->{'verbosity'});
    }
}
847
# Removes the directory holding a document's associated files, i.e.
# $archivedir underneath the association directory, if it exists.
#
# Arguments:
#   $archivedir - the document's archive directory, relative to the
#                 association directory
#   $edit_mode  - "delete" or "update" (accepted but not used here)
#
# Nothing is done when $archivedir is undefined or empty.
sub delete_assoc_files
{
    my $self = shift (@_);
    my ($archivedir, $edit_mode) = @_;

    # Safety guard: with no archive dir the path below would presumably
    # resolve to the association directory itself (depends on
    # FileUtils::filenameConcatenate - TODO confirm), and the recursive
    # delete would then take every document's associated files with it.
    return unless (defined $archivedir && $archivedir ne "");

    my $assoc_dir = &FileUtils::filenameConcatenate($self->{'assocdir'}, $archivedir);
    if (-d $assoc_dir) {
        &FileUtils::removeFilesRecursive($assoc_dir);
    }
}
Note: See TracBrowser for help on using the repository browser.