source: main/trunk/greenstone2/perllib/doc.pm@ 28560

Last change on this file since 28560 was 28560, checked in by ak19, 10 years ago
  1. New subroutine util::set_gnomelib_env that sets the environment for gnomelib needed for running hashfile, suffix and wget which are dependent on the libiconv dll in ext/gnome-lib(-minimal). It's particularly the Mac Lions that need libiconv.2.dylib. 2. Updated the call to hashfile in doc.pm, the call to suffix in Phind.pm and the calls to wget in several perl scripts and modules to call util::set_gnomelib_env, though this will only set the environment once for each subshell.
  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 38.5 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use FileUtils;
40use ghtml;
41use File::stat;
42##use hashdoc;
43use docprint;
44
45# the document type may be indexed_doc, nonindexed_doc, or
46# classification
47
48our $OIDcount = 0;
49
50# rename_method can be 'url', 'none', 'base64'
51sub new {
52 my $class = shift (@_);
53 my ($source_filename, $doc_type, $rename_method) = @_;
54
55
56 my $self = bless {'associated_files'=>[],
57 'subsection_order'=>[],
58 'next_subsection'=>1,
59 'subsections'=>{},
60 'metadata'=>[],
61 'text'=>"",
62 'OIDtype'=>"hash"}, $class;
63
64 # used to set lastmodified here, but this can screw up the HASH ids, so
65 # the docsave processor now calls set_lastmodified
66
67 $self->set_source_path($source_filename);
68
69 if (defined $source_filename) {
70 $source_filename = &util::filename_within_collection($source_filename);
71 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
72 $self->set_source_filename ($source_filename, $rename_method);
73 }
74
75 $self->set_doc_type ($doc_type) if defined $doc_type;
76
77 return $self;
78}
79
80
81sub set_source_path
82{
83 my $self = shift @_;
84 my ($source_filename) = @_;
85
86 if (defined $source_filename) {
87 # On Windows the source_filename can be in terse DOS format
88 # e.g. test~1.txt
89
90 $self->{'terse_source_path'} = $source_filename;
91
92 # Use the FileUtil library methods as they are aware of more special
93 # cases such as HDFS [jmt12]
94 if (&FileUtils::fileExists($source_filename))
95 {
96 # See if we can do better for Windows with a filename
97 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
98 require Win32;
99 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
100 }
101 else {
102 # For Unix-based systems, there is no difference between the two
103 $self->{'source_path'} = $source_filename;
104 }
105 }
106 else {
107 print STDERR "Warning: In doc::set_source_path(), file\n";
108 print STDERR " $source_filename\n";
109 print STDERR " does not exist\n";
110
111 # (default) Set it to whatever we were given
112 $self->{'source_path'} = $source_filename;
113 }
114 }
115 else {
116 # Previous code for setting source_path allowed for
117 # it to be undefined, so continue this practice
118 $self->{'terse_source_path'} = undef;
119 $self->{'source_path'} = undef;
120 }
121}
122
123
124sub get_source_path
125{
126 my $self = shift @_;
127
128 return $self->{'terse_source_path'};
129}
130
131# set lastmodified for OAI purposes, added by GRB, moved by kjdon
132sub set_oailastmodified {
133 my $self = shift (@_);
134
135 my $source_path = $self->{'terse_source_path'};
136
137 if (defined $source_path && (-e $source_path)) {
138 my $current_time = time;
139
140 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
141 $wday, $yday, $isdst) = localtime($current_time);
142
143 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
144
145 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
146 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
147 }
148}
149
150# no longer used for OAI purposes, since lastmodified is not what we want as the
151# Datestamp of a document. This doc metadata may be useful for general purposes.
152sub set_lastmodified {
153 my $self = shift (@_);
154
155 my $source_path = $self->{'terse_source_path'};
156
157 if (defined $source_path && (-e $source_path)) {
158
159 my $file_stat = stat($source_path);
160 my $mtime = $file_stat->mtime;
161 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
162 $wday, $yday, $isdst) = localtime($mtime);
163
164 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
165
166 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
167 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
168 }
169}
170
171# clone the $self object
172sub duplicate {
173 my $self = shift (@_);
174
175 my $newobj = {};
176
177 foreach my $k (keys %$self) {
178 $newobj->{$k} = &clone ($self->{$k});
179 }
180
181 bless $newobj, ref($self);
182 return $newobj;
183}
184
185sub clone {
186 my ($from) = @_;
187 my $type = ref ($from);
188
189 if ($type eq "HASH") {
190 my $to = {};
191 foreach my $key (keys %$from) {
192 $to->{$key} = &clone ($from->{$key});
193 }
194 return $to;
195 } elsif ($type eq "ARRAY") {
196 my $to = [];
197 foreach my $v (@$from) {
198 push (@$to, &clone ($v));
199 }
200 return $to;
201 } else {
202 return $from;
203 }
204}
205
206sub set_OIDtype {
207 my $self = shift (@_);
208 my ($type, $metadata) = @_;
209
210 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
211 $self->{'OIDtype'} = $type;
212 } else {
213 $self->{'OIDtype'} = "hash";
214 }
215
216 if ($type =~ /^assigned$/) {
217 if (defined $metadata) {
218 $self->{'OIDmetadata'} = $metadata;
219 } else {
220 $self->{'OIDmetadata'} = "dc.Identifier";
221 }
222 }
223}
224
225# rename_method can be 'url', 'none', 'base64'
226sub set_source_filename {
227 my $self = shift (@_);
228 my ($source_filename, $rename_method) = @_;
229
230 # Since the gsdlsourcefilename element goes into the doc.xml it has
231 # to be utf8. However, it should also *represent* the source filename
232 # (in the import directory) which may not be utf8 at all.
233 # For instance, if this meta element (gsdlsourcefilename) will be used
234 # by other applications that parse doc.xml in order to locate
235 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
236 # encode the real filename as this is a binary-to-text encoding meaning
237 # that the resulting string is ASCII (utf8). Decoding will give the original.
238
239# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
240
241 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
242 $source_filename = $self->encode_filename($source_filename, $rename_method);
243# my ($srcfilename,$dirname,$suffix)
244# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
245# print STDERR "-> $srcfilename -> ";
246# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
247# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
248# print STDERR "$source_filename\n";
249
250 $self->set_utf8_metadata_element ($self->get_top_section(),
251 "gsdlsourcefilename",
252 $source_filename);
253}
254
255sub encode_filename {
256 my $self = shift (@_);
257 my ($source_filename, $rename_method) = @_;
258
259 my ($srcfilename,$dirname,$suffix)
260 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
261# print STDERR "-> $srcfilename -> ";
262 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
263 $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
264
265 return $source_filename;
266}
267
268sub set_converted_filename {
269 my $self = shift (@_);
270 my ($converted_filename) = @_;
271
272 # we know the converted filename is utf8
273 $self->set_utf8_metadata_element ($self->get_top_section(),
274 "gsdlconvertedfilename",
275 $converted_filename);
276}
277
278# returns the source_filename as it was provided
279sub get_unmodified_source_filename {
280 my $self = shift (@_);
281
282 return $self->{'terse_source_path'};
283}
284
285# returns the source_filename with whatever rename_method was given
286sub get_source_filename {
287 my $self = shift (@_);
288
289 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
290}
291
292
293
294# returns converted filename if available else returns source filename
295sub get_filename_for_hashing {
296 my $self = shift (@_);
297
298 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
299
300 if (!defined $filename) {
301 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
302 # if NULPlug processed file, then don't give a filename
303 if (defined $plugin_name && $plugin_name eq "NULPlug") {
304 $filename = undef;
305 } else { # returns the URL encoded source filename!
306 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
307 }
308 }
309
310 if (!&FileUtils::isFilenameAbsolute($filename)) {
311 $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
312 }
313
314 return $filename;
315}
316
317sub set_doc_type {
318 my $self = shift (@_);
319 my ($doc_type) = @_;
320
321 $self->set_metadata_element ($self->get_top_section(),
322 "gsdldoctype",
323 $doc_type);
324}
325
326# returns the gsdldoctype as it was provided
327# the default of "indexed_doc" is used if no document
328# type was provided
329sub get_doc_type {
330 my $self = shift (@_);
331
332 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
333 return $doc_type if (defined $doc_type);
334 return "indexed_doc";
335}
336
337
338# look up the reference to the a particular section
339sub _lookup_section {
340 my $self = shift (@_);
341 my ($section) = @_;
342
343 my ($num);
344 my $sectionref = $self;
345
346 while (defined $section && $section ne "") {
347
348 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
349
350 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
351
352 $section = "" unless defined $section;
353
354
355 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
356 $sectionref = $sectionref->{'subsections'}->{$num};
357 } else {
358 return undef;
359 }
360 }
361
362 return $sectionref;
363}
364
365# calculate OID by hashing the contents of the document
366sub _calc_OID {
367 my $self = shift (@_);
368 my ($filename) = @_;
369
370
371 my $osexe = &util::get_os_exe();
372
373 my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
374 $ENV{'GSDLOS'},"hashfile$osexe");
375
376 &util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
377 # The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
378
379 my $result = "NULL";
380
381
382 if (-e "$hashfile_exe") {
383# $result = `\"$hashfile_exe\" \"$filename\"`;
384# $result = `hashfile$osexe \"$filename\" 2>&1`;
385 $result = `hashfile$osexe \"$filename\"`;
386
387 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
388 } else {
389 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
390 }
391 return "HASH$result";
392}
393
394# methods dealing with OID, not groups of them.
395
396# if $OID is not provided one is calculated
397sub set_OID {
398 my $self = shift (@_);
399 my ($OID) = @_;
400
401 my $use_hash_oid = 0;
402 # if an OID wasn't provided calculate one
403 if (!defined $OID) {
404 $OID = "NULL";
405 if ($self->{'OIDtype'} =~ /^hash/) {
406 $use_hash_oid = 1;
407 } elsif ($self->{'OIDtype'} eq "incremental") {
408 $OID = "D" . $OIDcount;
409 $OIDcount ++;
410 } elsif ($self->{'OIDtype'} eq "filename") {
411 my $filename = $self->get_source_filename();
412 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
413 $OID = &util::tidy_up_oid($OID);
414 } elsif ($self->{'OIDtype'} eq "full_filename") {
415 my $source_filename = $self->get_source_filename();
416 my $dirsep = &util::get_os_dirsep();
417
418 $source_filename =~ s/^import$dirsep//;
419 $source_filename =~ s/$dirsep/-/g;
420 $source_filename =~ s/\./_/g;
421
422 $OID = $source_filename;
423 $OID = &util::tidy_up_oid($OID);
424 } elsif ($self->{'OIDtype'} eq "dirname") {
425 my $filename = $self->get_source_filename();
426 if (defined($filename)) { # && -e $filename) {
427 # get the immediate parent directory
428 $OID = &File::Basename::dirname($filename);
429 if (defined $OID) {
430 $OID = &File::Basename::basename($OID);
431 $OID = &util::tidy_up_oid($OID);
432 } else {
433 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
434 $use_hash_oid = 1;
435 }
436 } else {
437 print STDERR "Failed to find a filename, generating hash id\n";
438 $use_hash_oid = 1;
439 }
440
441 } elsif ($self->{'OIDtype'} eq "assigned") {
442 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
443 if (defined $identifier && $identifier ne "") {
444 $OID = $identifier;
445 $OID = &util::tidy_up_oid($OID);
446 } else {
447 # need a hash id
448 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
449 $use_hash_oid = 1;
450 }
451
452 } else {
453 $use_hash_oid = 1;
454 }
455
456 if ($use_hash_oid) {
457 my $hash_on_file = 1;
458 my $hash_on_ga_xml = 0;
459
460 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
461 $hash_on_file = 0;
462 $hash_on_ga_xml = 1;
463 }
464
465 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
466 $hash_on_file = 0;
467 $hash_on_ga_xml = 0;
468
469 my $source_filename = $self->get_source_filename();
470 my $dirsep = &util::get_os_dirsep();
471
472 $source_filename =~ s/^import$dirsep//;
473 $source_filename =~ s/$dirsep/-/g;
474 $source_filename =~ s/\./_/g;
475
476 # If the filename is very short then (handled naively)
477 # this can cause conjestion in the hash-values
478 # computed, leading documents sharing the same leading
479 # Hex values in the computed has.
480 #
481 # The solution taken here is to replace the name of
482 # the file name a sufficient number of times (up to
483 # the character limit defined in 'rep_limit' and
484 # make that the content that is hashed on
485
486 # *** Think twice before changing the following value
487 # as it will break backward compatability of computed
488 # document HASH values
489
490 my $rep_limit = 256;
491 my $hash_content = undef;
492
493 if (length($source_filename)<$rep_limit) {
494 my $rep_string = "$source_filename|";
495 my $rs_len = length($rep_string);
496
497 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
498
499 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
500 }
501 else {
502 $hash_content = $source_filename;
503 }
504
505 my $filename = &util::get_tmp_filename();
506 if (!open (OUTFILE, ">:utf8", $filename)) {
507 print STDERR "doc::set_OID could not write to $filename\n";
508 } else {
509 print OUTFILE $hash_content;
510 close (OUTFILE);
511 }
512 $OID = $self->_calc_OID ($filename);
513
514 &FileUtils::removeFiles ($filename);
515 }
516
517 if ($hash_on_file) {
518 # "hash" OID - feed file to hashfile.exe
519 my $filename = $self->get_filename_for_hashing();
520
521 # -z: don't want to hash on the file if it is zero size
522 if (defined($filename) && -e $filename && !-z $filename) {
523 $OID = $self->_calc_OID ($filename);
524 } else {
525 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
526 }
527 }
528
529 if ($hash_on_ga_xml) {
530 # In addition being asked to explicity calculate the has based on the GA file,
531 # can also end up coming into this block is doing 'hash_on_file' but the file
532 # itself is of zero bytes (as could be the case with 'doc.nul' file
533
534 my $filename = &util::get_tmp_filename();
535 if (!open (OUTFILE, ">:utf8", $filename)) {
536 print STDERR "doc::set_OID could not write to $filename\n";
537 } else {
538 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
539 print OUTFILE $doc_text;
540 close (OUTFILE);
541 }
542 $OID = $self->_calc_OID ($filename);
543 &FileUtils::removeFiles($filename);
544 }
545 }
546 }
547 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
548}
549
550# this uses hashdoc (embedded c thingy) which is faster but still
551# needs a little work to be suffiently stable
552sub ___set_OID {
553 my $self = shift (@_);
554 my ($OID) = @_;
555
556 # if an OID wasn't provided then calculate hash value based on document
557 if (!defined $OID)
558 {
559 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
560 my $hash_len = length($hash_text);
561
562 $OID = &hashdoc::buffer($hash_text,$hash_len);
563 }
564
565 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
566}
567
568# returns the OID for this document
569sub get_OID {
570 my $self = shift (@_);
571 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
572 return $OID if (defined $OID);
573 return "NULL";
574}
575
576sub delete_OID {
577 my $self = shift (@_);
578
579 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
580}
581
582
583# methods for manipulating section names
584
585# returns the name of the top-most section (the top
586# level of the document
587sub get_top_section {
588 my $self = shift (@_);
589
590 return "";
591}
592
593# returns a section
594sub get_parent_section {
595 my $self = shift (@_);
596 my ($section) = @_;
597
598 $section =~ s/(^|\.)\d+$//;
599
600 return $section;
601}
602
603# returns the first child section (or the end child
604# if there isn't any)
605sub get_begin_child {
606 my $self = shift (@_);
607 my ($section) = @_;
608
609 my $section_ptr = $self->_lookup_section($section);
610 return "" unless defined $section_ptr;
611
612 if (defined $section_ptr->{'subsection_order'}->[0]) {
613 return "$section.$section_ptr->{'subsection_order'}->[0]";
614 }
615
616 return $self->get_end_child ($section);
617}
618
619# returns the next child of a parent section
620sub get_next_child {
621 my $self = shift (@_);
622 my ($section) = @_;
623
624 my $parent_section = $self->get_parent_section($section);
625 my $parent_section_ptr = $self->_lookup_section($parent_section);
626 return undef unless defined $parent_section_ptr;
627
628 my ($section_num) = $section =~ /(\d+)$/;
629 return undef unless defined $section_num;
630
631 my $i = 0;
632 my $section_order = $parent_section_ptr->{'subsection_order'};
633 while ($i < scalar(@$section_order)) {
634 last if $section_order->[$i] eq $section_num;
635 $i++;
636 }
637
638 $i++; # the next child
639 if ($i < scalar(@$section_order)) {
640 return $section_order->[$i] if $parent_section eq "";
641 return "$parent_section.$section_order->[$i]";
642 }
643
644 # no more sections in this level
645 return undef;
646}
647
648# returns a reference to a list of children
649sub get_children {
650 my $self = shift (@_);
651 my ($section) = @_;
652
653 my $section_ptr = $self->_lookup_section($section);
654 return [] unless defined $section_ptr;
655
656 my @children = @{$section_ptr->{'subsection_order'}};
657
658 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
659 return \@children;
660}
661
662# returns the child section one past the last one (which
663# is coded as "0")
664sub get_end_child {
665 my $self = shift (@_);
666 my ($section) = @_;
667
668 return $section . ".0" unless $section eq "";
669 return "0";
670}
671
672# returns the next section in book order
673sub get_next_section {
674 my $self = shift (@_);
675 my ($section) = @_;
676
677 return undef unless defined $section;
678
679 my $section_ptr = $self->_lookup_section($section);
680 return undef unless defined $section_ptr;
681
682 # first try to find first child
683 if (defined $section_ptr->{'subsection_order'}->[0]) {
684 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
685 return "$section.$section_ptr->{'subsection_order'}->[0]";
686 }
687
688 do {
689 # try to find sibling
690 my $next_child = $self->get_next_child ($section);
691 return $next_child if (defined $next_child);
692
693 # move up one level
694 $section = $self->get_parent_section ($section);
695 } while $section =~ /\d/;
696
697 return undef;
698}
699
700sub is_leaf_section {
701 my $self = shift (@_);
702 my ($section) = @_;
703
704 my $section_ptr = $self->_lookup_section($section);
705 return 1 unless defined $section_ptr;
706
707 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
708}
709
710# methods for dealing with sections
711
712# returns the name of the inserted section
713sub insert_section {
714 my $self = shift (@_);
715 my ($before_section) = @_;
716
717 # get the child to insert before and its parent section
718 my $parent_section = "";
719 my $before_child = "0";
720 my @before_section = split (/\./, $before_section);
721 if (scalar(@before_section) > 0) {
722 $before_child = pop (@before_section);
723 $parent_section = join (".", @before_section);
724 }
725
726 my $parent_section_ptr = $self->_lookup_section($parent_section);
727 if (!defined $parent_section_ptr) {
728 print STDERR "doc::insert_section couldn't find parent section " .
729 "$parent_section\n";
730 return;
731 }
732
733 # get the next section number
734 my $section_num = $parent_section_ptr->{'next_subsection'}++;
735
736 my $i = 0;
737 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
738 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
739 $i++;
740 }
741
742 # insert the section number into the order list
743 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
744
745 # add this section to the parent section
746 my $section_ptr = {'subsection_order'=>[],
747 'next_subsection'=>1,
748 'subsections'=>{},
749 'metadata'=>[],
750 'text'=>""};
751 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
752
753 # work out the full section number
754 my $section = $parent_section;
755 $section .= "." unless $section eq "";
756 $section .= $section_num;
757
758 return $section;
759}
760
761# creates a pre-named section
762sub create_named_section {
763 my $self = shift (@_);
764 my ($mastersection) = @_;
765
766 my ($num);
767 my $section = $mastersection;
768 my $sectionref = $self;
769
770 while ($section ne "") {
771 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
772 $num =~ s/^0+(\d)/$1/; # remove leading 0s
773 $section = "" unless defined $section;
774
775 if (defined $num) {
776 if (!defined $sectionref->{'subsections'}->{$num}) {
777 push (@{$sectionref->{'subsection_order'}}, $num);
778 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
779 'next_subsection'=>1,
780 'subsections'=>{},
781 'metadata'=>[],
782 'text'=>""};
783 if ($num >= $sectionref->{'next_subsection'}) {
784 $sectionref->{'next_subsection'} = $num + 1;
785 }
786 }
787 $sectionref = $sectionref->{'subsections'}->{$num};
788
789 } else {
790 print STDERR "doc::create_named_section couldn't create section ";
791 print STDERR "$mastersection\n";
792 last;
793 }
794 }
795}
796
797# returns a reference to a list of subsections
798sub list_subsections {
799 my $self = shift (@_);
800 my ($section) = @_;
801
802 my $section_ptr = $self->_lookup_section ($section);
803 if (!defined $section_ptr) {
804 print STDERR "doc::list_subsections couldn't find section $section\n";
805 return [];
806 }
807
808 return [@{$section_ptr->{'subsection_order'}}];
809}
810
811sub delete_section {
812 my $self = shift (@_);
813 my ($section) = @_;
814
815# my $section_ptr = {'subsection_order'=>[],
816# 'next_subsection'=>1,
817# 'subsections'=>{},
818# 'metadata'=>[],
819# 'text'=>""};
820
821 # if this is the top section reset everything
822 if ($section eq "") {
823 $self->{'subsection_order'} = [];
824 $self->{'subsections'} = {};
825 $self->{'metadata'} = [];
826 $self->{'text'} = "";
827 return;
828 }
829
830 # find the parent of the section to delete
831 my $parent_section = "";
832 my $child = "0";
833 my @section = split (/\./, $section);
834 if (scalar(@section) > 0) {
835 $child = pop (@section);
836 $parent_section = join (".", @section);
837 }
838
839 my $parent_section_ptr = $self->_lookup_section($parent_section);
840 if (!defined $parent_section_ptr) {
841 print STDERR "doc::delete_section couldn't find parent section " .
842 "$parent_section\n";
843 return;
844 }
845
846 # remove this section from the subsection_order list
847 my $i = 0;
848 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
849 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
850 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
851 last;
852 }
853 $i++;
854 }
855
856 # remove this section from the subsection hash
857 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
858 undef $parent_section_ptr->{'subsections'}->{$child};
859 }
860}
861
862#--
863# methods for dealing with metadata
864
865# set_metadata_element and get_metadata_element are for metadata
866# which should only have one value. add_meta_data and get_metadata
867# are for metadata which can have more than one value.
868
869# returns the first metadata value which matches field
870
871# This version of get metadata element works much like the one above,
872# except it allows for the namespace portion of a metadata element to
873# be ignored, thus if you are searching for dc.Title, the first piece
874# of matching metadata ending with the name Title (once any namespace
875# is removed) would be returned.
876# 28-11-2003 John Thompson
877sub get_metadata_element {
878 my $self = shift (@_);
879 my ($section, $field, $ignore_namespace) = @_;
880 my ($data);
881
882 $ignore_namespace = 0 unless defined $ignore_namespace;
883
884 my $section_ptr = $self->_lookup_section($section);
885 if (!defined $section_ptr) {
886 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
887 return;
888 }
889
890 # Remove any namespace if we are being told to ignore them
891 if($ignore_namespace) {
892 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
893 }
894
895 foreach $data (@{$section_ptr->{'metadata'}}) {
896
897 my $data_name = $data->[0];
898
899 # Remove any namespace if we are being told to ignore them
900 if($ignore_namespace) {
901 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
902 }
903 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
904 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
905
906 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
907 }
908
909 return undef; # was not found
910}
911
912# returns a list of the form [value1, value2, ...]
913sub get_metadata {
914 my $self = shift (@_);
915 my ($section, $field, $ignore_namespace) = @_;
916 my ($data);
917
918 $ignore_namespace = 0 unless defined $ignore_namespace;
919
920 my $section_ptr = $self->_lookup_section($section);
921 if (!defined $section_ptr) {
922 print STDERR "doc::get_metadata couldn't find section ",
923 $section, "\n";
924 return;
925 }
926
927 # Remove any namespace if we are being told to ignore them
928 if($ignore_namespace) {
929 $field =~ s/^.*\.//;
930 }
931
932 my @metadata = ();
933 foreach $data (@{$section_ptr->{'metadata'}}) {
934
935 my $data_name = $data->[0];
936
937 # Remove any namespace if we are being told to ignore them
938 if($ignore_namespace) {
939 $data_name =~ s/^.*\.//;
940 }
941 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
942 $data_name =~ s/^ex\.([^.]+)$/$1/;
943
944 push (@metadata, $data->[1]) if ($data_name eq $field);
945 }
946
947 return \@metadata;
948}
949
950sub get_metadata_hashmap {
951 my $self = shift (@_);
952 my ($section, $opt_namespace) = @_;
953
954 my $section_ptr = $self->_lookup_section($section);
955 if (!defined $section_ptr) {
956 print STDERR "doc::get_metadata couldn't find section ",
957 $section, "\n";
958 return;
959 }
960
961 my $metadata_hashmap = {};
962 foreach my $data (@{$section_ptr->{'metadata'}}) {
963 my $metaname = $data->[0];
964
965 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
966 if (!defined $metadata_hashmap->{$metaname}) {
967 $metadata_hashmap->{$metaname} = [];
968 }
969 my $metaval_list = $metadata_hashmap->{$metaname};
970 push(@$metaval_list, $data->[1]);
971 }
972 }
973
974 return $metadata_hashmap;
975}
976
977# returns a list of the form [[field,value],[field,value],...]
978sub get_all_metadata {
979 my $self = shift (@_);
980 my ($section) = @_;
981
982 my $section_ptr = $self->_lookup_section($section);
983 if (!defined $section_ptr) {
984 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
985 return;
986 }
987
988 return $section_ptr->{'metadata'};
989}
990
991# $value is optional
992sub delete_metadata {
993 my $self = shift (@_);
994 my ($section, $field, $value) = @_;
995
996 my $section_ptr = $self->_lookup_section($section);
997 if (!defined $section_ptr) {
998 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
999 return;
1000 }
1001
1002 my $i = 0;
1003 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1004 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1005 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1006 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1007 } else {
1008 $i++;
1009 }
1010 }
1011}
1012
1013sub delete_all_metadata {
1014 my $self = shift (@_);
1015 my ($section) = @_;
1016
1017 my $section_ptr = $self->_lookup_section($section);
1018 if (!defined $section_ptr) {
1019 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1020 return;
1021 }
1022
1023 $section_ptr->{'metadata'} = [];
1024}
1025
1026sub set_metadata_element {
1027 my $self = shift (@_);
1028 my ($section, $field, $value) = @_;
1029
1030 $self->set_utf8_metadata_element ($section, $field,
1031 &unicode::ascii2utf8(\$value));
1032}
1033
1034# set_utf8_metadata_element assumes the text has already been
1035# converted to the UTF-8 encoding.
1036sub set_utf8_metadata_element {
1037 my $self = shift (@_);
1038 my ($section, $field, $value) = @_;
1039
1040 $self->delete_metadata ($section, $field);
1041 $self->add_utf8_metadata ($section, $field, $value);
1042}
1043
1044
1045# add_metadata assumes the text is in (extended) ascii form. For
1046# text which has already been converted to the UTF-8 format use
1047# add_utf8_metadata.
1048sub add_metadata {
1049 my $self = shift (@_);
1050 my ($section, $field, $value) = @_;
1051
1052 $self->add_utf8_metadata ($section, $field,
1053 &unicode::ascii2utf8(\$value));
1054}
1055
1056sub add_utf8_metadata {
1057 my $self = shift (@_);
1058 my ($section, $field, $value) = @_;
1059
1060 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1061 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1062 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1063
1064 my $section_ptr = $self->_lookup_section($section);
1065 if (!defined $section_ptr) {
1066 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1067 return;
1068 }
1069 if (!defined $value) {
1070 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1071 return;
1072 }
1073 if (!defined $field) {
1074 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1075 return;
1076 }
1077
1078 #print STDERR "###$field=$value\n";
1079
1080 # For now, supress this check. Given that text data read in is now
1081 # Unicode aware, then the following block of code can (ironically enough)
1082 # cause our unicode compliant string to be re-encoded (leading to
1083 # a double-encoded UTF-8 string, which we definitely don't want!).
1084
1085
1086 # double check that the value is utf-8
1087 # if (!&unicode::check_is_utf8($value)) {
1088 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1089 # &unicode::ensure_utf8(\$value);
1090 # print STDERR " Tried converting to utf8: $value\n";
1091 # }
1092
1093 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1094 if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1095 {
1096 my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1097 if (defined $mdprefix) {
1098 # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1099 push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1100 }
1101
1102 my $direction;
1103 if($value =~ m/^-/)
1104 {
1105 $direction = ($field eq "Latitude") ? "S" : "W";
1106 }
1107 else
1108 {
1109 $direction = ($field eq "Latitude") ? "N" : "E";
1110 }
1111
1112 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1113 if(defined $beforeDec && defined $afterDec)
1114 {
1115 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1116 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1117
1118 for(my $i = 2; $i <= 4; $i++)
1119 {
1120 if(length($afterDec) >= $i)
1121 {
1122 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1123 }
1124 }
1125
1126 #Only add the metadata if it has not already been added
1127 my $metaMap = $self->get_metadata_hashmap($section);
1128 }
1129 }
1130
1131 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1132}
1133
1134
1135# methods for dealing with text
1136
1137# returns the text for a section
1138sub get_text {
1139 my $self = shift (@_);
1140 my ($section) = @_;
1141
1142 my $section_ptr = $self->_lookup_section($section);
1143 if (!defined $section_ptr) {
1144 print STDERR "doc::get_text couldn't find section " .
1145 "$section\n";
1146 return "";
1147 }
1148
1149 return $section_ptr->{'text'};
1150}
1151
1152# returns the (utf-8 encoded) length of the text for a section
1153sub get_text_length {
1154 my $self = shift (@_);
1155 my ($section) = @_;
1156
1157 my $section_ptr = $self->_lookup_section($section);
1158 if (!defined $section_ptr) {
1159 print STDERR "doc::get_text_length couldn't find section " .
1160 "$section\n";
1161 return 0;
1162 }
1163
1164 return length ($section_ptr->{'text'});
1165}
1166
1167# returns the total length for all the sections
1168sub get_total_text_length {
1169 my $self = shift (@_);
1170
1171 my $section = $self->get_top_section();
1172 my $length = 0;
1173 while (defined $section) {
1174 $length += $self->get_text_length($section);
1175 $section = $self->get_next_section($section);
1176 }
1177 return $length;
1178}
1179
1180sub delete_text {
1181 my $self = shift (@_);
1182 my ($section) = @_;
1183
1184 my $section_ptr = $self->_lookup_section($section);
1185 if (!defined $section_ptr) {
1186 print STDERR "doc::delete_text couldn't find section " .
1187 "$section\n";
1188 return;
1189 }
1190
1191 $section_ptr->{'text'} = "";
1192}
1193
1194# add_text assumes the text is in (extended) ascii form. For
1195# text which has been already converted to the UTF-8 format
1196# use add_utf8_text.
1197sub add_text {
1198 my $self = shift (@_);
1199 my ($section, $text) = @_;
1200
1201 # convert the text to UTF-8 encoded unicode characters
1202 # and add the text
1203 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1204}
1205
1206
1207# add_utf8_text assumes the text to be added has already
1208# been converted to the UTF-8 encoding. For ascii text use
1209# add_text
1210sub add_utf8_text {
1211 my $self = shift (@_);
1212 my ($section, $text) = @_;
1213
1214 my $section_ptr = $self->_lookup_section($section);
1215 if (!defined $section_ptr) {
1216 print STDERR "doc::add_utf8_text couldn't find section " .
1217 "$section\n";
1218 return;
1219 }
1220
1221 $section_ptr->{'text'} .= $text;
1222}
1223
1224# returns the Source meta, which is the utf8 filename generated.
1225# Added a separate method here for convenience
1226sub get_source {
1227 my $self = shift (@_);
1228 return $self->get_metadata_element ($self->get_top_section(), "Source");
1229}
1230
1231# returns the SourceFile meta, which is the url reference to the URL-encoded
1232# version of Source (the utf8 filename). Added a separate method here for convenience
1233sub get_sourcefile {
1234 my $self = shift (@_);
1235 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1236}
1237
1238# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1239# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1240sub get_assocfile_from_sourcefile {
1241 my $self = shift (@_);
1242
1243 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1244 my $top_section = $self->get_top_section();
1245 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1246
1247 # get the actual filename as it exists on the filesystem which this url refers to
1248 $source_file = &unicode::url_to_filename($source_file);
1249 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1250 return $assocfilename;
1251}
1252
1253# methods for dealing with associated files
1254
1255# a file is associated with a document, NOT a section.
1256# if section is defined it is noted in the data structure
1257# only so that files associated from a particular section
1258# may be removed later (using delete_section_assoc_files)
1259sub associate_file {
1260 my $self = shift (@_);
1261 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1262 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1263
1264 # remove all associated files with the same name
1265 $self->delete_assoc_file ($assoc_filename);
1266
1267 # Too harsh a requirement
1268 # Definitely get HTML docs, for example, with some missing
1269 # support files
1270# if (!&util::fd_exists($real_filename)) {
1271# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1272# exit -1;
1273# }
1274
1275# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1276# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1277## my $utf8_filename = Encode::encode("utf8",$filename);
1278
1279 push (@{$self->{'associated_files'}},
1280 [$real_filename, $assoc_filename, $mime_type, $section]);
1281}
1282
1283# returns a list of associated files in the form
1284# [[real_filename, assoc_filename, mimetype], ...]
1285sub get_assoc_files {
1286 my $self = shift (@_);
1287
1288 return $self->{'associated_files'};
1289}
1290
1291# the following two methods used to keep track of original associated files
1292# for incremental building. eg a txt file used by an item file does not end
1293# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1294# file for incremental build
1295sub associate_source_file {
1296 my $self = shift (@_);
1297 my ($full_filename) = @_;
1298
1299 push (@{$self->{'source_assoc_files'}}, $full_filename);
1300
1301}
1302
1303sub get_source_assoc_files {
1304 my $self = shift (@_);
1305
1306 return $self->{'source_assoc_files'};
1307
1308
1309}
1310sub metadata_file {
1311 my $self = shift (@_);
1312 my ($real_filename, $filename) = @_;
1313
1314 push (@{$self->{'metadata_files'}},
1315 [$real_filename, $filename]);
1316}
1317
1318# used for writing out the archiveinf-doc info database, to list all the metadata files
1319sub get_meta_files {
1320 my $self = shift (@_);
1321
1322 return $self->{'metadata_files'};
1323}
1324
1325sub delete_section_assoc_files {
1326 my $self = shift (@_);
1327 my ($section) = @_;
1328
1329 my $i=0;
1330 while ($i < scalar (@{$self->{'associated_files'}})) {
1331 if (defined $self->{'associated_files'}->[$i]->[3] &&
1332 $self->{'associated_files'}->[$i]->[3] eq $section) {
1333 splice (@{$self->{'associated_files'}}, $i, 1);
1334 } else {
1335 $i++;
1336 }
1337 }
1338}
1339
1340sub delete_assoc_file {
1341 my $self = shift (@_);
1342 my ($assoc_filename) = @_;
1343
1344 my $i=0;
1345 while ($i < scalar (@{$self->{'associated_files'}})) {
1346 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1347 splice (@{$self->{'associated_files'}}, $i, 1);
1348 } else {
1349 $i++;
1350 }
1351 }
1352}
1353
1354sub reset_nextsection_ptr {
1355 my $self = shift (@_);
1356 my ($section) = @_;
1357
1358 my $section_ptr = $self->_lookup_section($section);
1359 $section_ptr->{'next_subsection'} = 1;
1360}
1361
13621;
Note: See TracBrowser for help on using the repository browser.