source: main/trunk/greenstone2/perllib/doc.pm@ 32541

Last change on this file since 32541 was 32535, checked in by ak19, 5 years ago

Fixing couple of typos before major commit.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 38.7 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use FileUtils;
40use ghtml;
41use File::stat;
42##use hashdoc;
43use docprint;
44
45# the document type may be indexed_doc, nonindexed_doc, or
46# classification
47
48our $OIDcount = 0;
49
50# rename_method can be 'url', 'none', 'base64'
51sub new {
52 my $class = shift (@_);
53 my ($source_filename, $doc_type, $rename_method) = @_;
54
55
56 my $self = bless {'associated_files'=>[],
57 'subsection_order'=>[],
58 'next_subsection'=>1,
59 'subsections'=>{},
60 'metadata'=>[],
61 'text'=>"",
62 'OIDtype'=>"hash"}, $class;
63
64 # used to set lastmodified here, but this can screw up the HASH ids, so
65 # the docsave processor now calls set_lastmodified
66
67 $self->set_source_path($source_filename);
68
69 if (defined $source_filename) {
70 $source_filename = &util::filename_within_collection($source_filename);
71 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
72 $self->set_source_filename ($source_filename, $rename_method);
73 }
74
75 $self->set_doc_type ($doc_type) if defined $doc_type;
76
77 return $self;
78}
79
80
81sub set_source_path
82{
83 my $self = shift @_;
84 my ($source_filename) = @_;
85
86 if (defined $source_filename) {
87 # On Windows the source_filename can be in terse DOS format
88 # e.g. test~1.txt
89
90 $self->{'terse_source_path'} = $source_filename;
91
92 # Use the FileUtil library methods as they are aware of more special
93 # cases such as HDFS [jmt12]
94 if (&FileUtils::fileExists($source_filename))
95 {
96 # See if we can do better for Windows with a filename
97 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
98 require Win32;
99 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
100 }
101 else {
102 # For Unix-based systems, there is no difference between the two
103 $self->{'source_path'} = $source_filename;
104 }
105 }
106 else {
107 print STDERR "Warning: In doc::set_source_path(), file\n";
108 print STDERR " $source_filename\n";
109 print STDERR " does not exist\n";
110
111 # (default) Set it to whatever we were given
112 $self->{'source_path'} = $source_filename;
113 }
114 }
115 else {
116 # Previous code for setting source_path allowed for
117 # it to be undefined, so continue this practice
118 $self->{'terse_source_path'} = undef;
119 $self->{'source_path'} = undef;
120 }
121}
122
123
124sub get_source_path
125{
126 my $self = shift @_;
127
128 return $self->{'terse_source_path'};
129}
130
131# set lastmodified for OAI purposes, added by GRB, moved by kjdon
132sub set_oailastmodified {
133 my $self = shift (@_);
134
135 my $source_path = $self->{'terse_source_path'};
136
137 if (defined $source_path && (-e $source_path)) {
138 my $current_time = time;
139
140 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
141 $wday, $yday, $isdst) = localtime($current_time);
142
143 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
144
145 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
146 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
147 }
148}
149
150# no longer used for OAI purposes, since lastmodified is not what we want as the
151# Datestamp of a document. This doc metadata may be useful for general purposes.
152sub set_lastmodified {
153 my $self = shift (@_);
154
155 my $source_path = $self->{'terse_source_path'};
156
157 if (defined $source_path && (-e $source_path)) {
158
159 my $file_stat = stat($source_path);
160 my $mtime = $file_stat->mtime;
161 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
162 $wday, $yday, $isdst) = localtime($mtime);
163
164 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
165
166 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
167 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
168 }
169}
170
171# clone the $self object
172sub duplicate {
173 my $self = shift (@_);
174
175 my $newobj = {};
176
177 foreach my $k (keys %$self) {
178 $newobj->{$k} = &clone ($self->{$k});
179 }
180
181 bless $newobj, ref($self);
182 return $newobj;
183}
184
185sub clone {
186 my ($from) = @_;
187 my $type = ref ($from);
188
189 if ($type eq "HASH") {
190 my $to = {};
191 foreach my $key (keys %$from) {
192 $to->{$key} = &clone ($from->{$key});
193 }
194 return $to;
195 } elsif ($type eq "ARRAY") {
196 my $to = [];
197 foreach my $v (@$from) {
198 push (@$to, &clone ($v));
199 }
200 return $to;
201 } else {
202 return $from;
203 }
204}
205
206sub set_OIDtype {
207 my $self = shift (@_);
208 my ($type, $metadata) = @_;
209
210 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
211 $self->{'OIDtype'} = $type;
212 } else {
213 $self->{'OIDtype'} = "hash";
214 }
215
216 if ($type =~ /^assigned$/) {
217 if (defined $metadata) {
218 $self->{'OIDmetadata'} = $metadata;
219 } else {
220 $self->{'OIDmetadata'} = "dc.Identifier";
221 }
222 }
223}
224
225# rename_method can be 'url', 'none', 'base64'
226sub set_source_filename {
227 my $self = shift (@_);
228 my ($source_filename, $rename_method) = @_;
229
230 # Since the gsdlsourcefilename element goes into the doc.xml it has
231 # to be utf8. However, it should also *represent* the source filename
232 # (in the import directory) which may not be utf8 at all.
233 # For instance, if this meta element (gsdlsourcefilename) will be used
234 # by other applications that parse doc.xml in order to locate
235 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
236 # encode the real filename as this is a binary-to-text encoding meaning
237 # that the resulting string is ASCII (utf8). Decoding will give the original.
238
239# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
240
241 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
242 $source_filename = $self->encode_filename($source_filename, $rename_method);
243# my ($srcfilename,$dirname,$suffix)
244# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
245# print STDERR "-> $srcfilename -> ";
246# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
247# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
248# print STDERR "$source_filename\n";
249
250 $self->set_utf8_metadata_element ($self->get_top_section(),
251 "gsdlsourcefilename",
252 $source_filename);
253}
254
255sub encode_filename {
256 my $self = shift (@_);
257 my ($source_filename, $rename_method) = @_;
258
259 my ($srcfilename,$dirname,$suffix)
260 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
261# print STDERR "-> $srcfilename -> ";
262 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
263 $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
264
265 return $source_filename;
266}
267
268sub set_converted_filename {
269 my $self = shift (@_);
270 my ($converted_filename) = @_;
271
272 # we know the converted filename is utf8
273 $self->set_utf8_metadata_element ($self->get_top_section(),
274 "gsdlconvertedfilename",
275 $converted_filename);
276}
277
278# returns the source_filename as it was provided
279sub get_unmodified_source_filename {
280 my $self = shift (@_);
281
282 return $self->{'terse_source_path'};
283}
284
285# returns the source_filename with whatever rename_method was given
286sub get_source_filename {
287 my $self = shift (@_);
288
289 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
290}
291
292
293
294# returns converted filename if available else returns source filename
295sub get_filename_for_hashing {
296 my $self = shift (@_);
297
298 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
299
300 if (!defined $filename) {
301 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
302 # if NULPlug processed file, then don't give a filename
303 if (defined $plugin_name && $plugin_name eq "NULPlug") {
304 $filename = undef;
305 } else { # returns the URL encoded source filename!
306 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
307 }
308 }
309
310 if (!&FileUtils::isFilenameAbsolute($filename)) {
311 $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
312 }
313
314 return $filename;
315}
316
317sub set_doc_type {
318 my $self = shift (@_);
319 my ($doc_type) = @_;
320
321 $self->set_metadata_element ($self->get_top_section(),
322 "gsdldoctype",
323 $doc_type);
324}
325
326# returns the gsdldoctype as it was provided
327# the default of "indexed_doc" is used if no document
328# type was provided
329sub get_doc_type {
330 my $self = shift (@_);
331
332 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
333 return $doc_type if (defined $doc_type);
334 return "indexed_doc";
335}
336
337
338# look up the reference to the a particular section
339sub _lookup_section {
340 my $self = shift (@_);
341 my ($section) = @_;
342
343 my ($num);
344 my $sectionref = $self;
345
346 while (defined $section && $section ne "") {
347
348 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
349
350 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
351
352 $section = "" unless defined $section;
353
354
355 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
356 $sectionref = $sectionref->{'subsections'}->{$num};
357 } else {
358 return undef;
359 }
360 }
361
362 return $sectionref;
363}
364
365# calculate OID by hashing the contents of the document
366sub _calc_OID {
367 my $self = shift (@_);
368 my ($filename) = @_;
369
370
371 my $osexe = &util::get_os_exe();
372
373 my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
374 $ENV{'GSDLOS'},"hashfile$osexe");
375
376 &util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
377 # The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
378
379 # A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
380 # (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
381 # E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
382
383 my $result = "NULL";
384
385
386 if (-e "$hashfile_exe") {
387# $result = `\"$hashfile_exe\" \"$filename\"`;
388# $result = `hashfile$osexe \"$filename\" 2>&1`;
389 $result = `hashfile$osexe \"$filename\"`;
390
391 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
392 } else {
393 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
394 }
395 return "HASH$result";
396}
397
398# methods dealing with OID, not groups of them.
399
400# if $OID is not provided one is calculated
401sub set_OID {
402 my $self = shift (@_);
403 my ($OID) = @_;
404
405 my $use_hash_oid = 0;
406 # if an OID wasn't provided calculate one
407 if (!defined $OID) {
408 $OID = "NULL";
409 if ($self->{'OIDtype'} =~ /^hash/) {
410 $use_hash_oid = 1;
411 } elsif ($self->{'OIDtype'} eq "incremental") {
412 $OID = "D" . $OIDcount;
413 $OIDcount ++;
414 } elsif ($self->{'OIDtype'} eq "filename") {
415 my $filename = $self->get_source_filename();
416 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
417 $OID = &util::tidy_up_oid($OID);
418 } elsif ($self->{'OIDtype'} eq "full_filename") {
419 my $source_filename = $self->get_source_filename();
420 my $dirsep = &util::get_os_dirsep();
421
422 $source_filename =~ s/^import$dirsep//;
423 $source_filename =~ s/$dirsep/-/g;
424 $source_filename =~ s/\./_/g;
425
426 $OID = $source_filename;
427 $OID = &util::tidy_up_oid($OID);
428 } elsif ($self->{'OIDtype'} eq "dirname") {
429 my $filename = $self->get_source_filename();
430 if (defined($filename)) { # && -e $filename) {
431 # get the immediate parent directory
432 $OID = &File::Basename::dirname($filename);
433 if (defined $OID) {
434 $OID = &File::Basename::basename($OID);
435 $OID = &util::tidy_up_oid($OID);
436 } else {
437 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
438 $use_hash_oid = 1;
439 }
440 } else {
441 print STDERR "Failed to find a filename, generating hash id\n";
442 $use_hash_oid = 1;
443 }
444
445 } elsif ($self->{'OIDtype'} eq "assigned") {
446 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
447 if (defined $identifier && $identifier ne "") {
448 $OID = $identifier;
449 $OID = &util::tidy_up_oid($OID);
450 } else {
451 # need a hash id
452 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
453 $use_hash_oid = 1;
454 }
455
456 } else {
457 $use_hash_oid = 1;
458 }
459
460 if ($use_hash_oid) {
461 my $hash_on_file = 1;
462 my $hash_on_ga_xml = 0;
463
464 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
465 $hash_on_file = 0;
466 $hash_on_ga_xml = 1;
467 }
468
469 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
470 $hash_on_file = 0;
471 $hash_on_ga_xml = 0;
472
473 my $source_filename = $self->get_source_filename();
474 my $dirsep = &util::get_os_dirsep();
475
476 $source_filename =~ s/^import$dirsep//;
477 $source_filename =~ s/$dirsep/-/g;
478 $source_filename =~ s/\./_/g;
479
480 # If the filename is very short then (handled naively)
481 # this can cause conjestion in the hash-values
482 # computed, leading documents sharing the same leading
483 # Hex values in the computed has.
484 #
485 # The solution taken here is to replace the name of
486 # the file name a sufficient number of times (up to
487 # the character limit defined in 'rep_limit' and
488 # make that the content that is hashed on
489
490 # *** Think twice before changing the following value
491 # as it will break backward compatability of computed
492 # document HASH values
493
494 my $rep_limit = 256;
495 my $hash_content = undef;
496
497 if (length($source_filename)<$rep_limit) {
498 my $rep_string = "$source_filename|";
499 my $rs_len = length($rep_string);
500
501 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
502
503 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
504 }
505 else {
506 $hash_content = $source_filename;
507 }
508
509 my $filename = &util::get_tmp_filename();
510 if (!open (OUTFILE, ">:utf8", $filename)) {
511 print STDERR "doc::set_OID could not write to $filename\n";
512 } else {
513 print OUTFILE $hash_content;
514 close (OUTFILE);
515 }
516 $OID = $self->_calc_OID ($filename);
517
518 &FileUtils::removeFiles ($filename);
519 }
520
521 if ($hash_on_file) {
522 # "hash" OID - feed file to hashfile.exe
523 my $filename = $self->get_filename_for_hashing();
524
525 # -z: don't want to hash on the file if it is zero size
526 if (defined($filename) && -e $filename && !-z $filename) {
527 $OID = $self->_calc_OID ($filename);
528 } else {
529 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
530 }
531 }
532
533 if ($hash_on_ga_xml) {
534 # In addition being asked to explicity calculate the has based on the GA file,
535 # can also end up coming into this block is doing 'hash_on_file' but the file
536 # itself is of zero bytes (as could be the case with 'doc.nul' file
537
538 my $filename = &util::get_tmp_filename();
539 if (!open (OUTFILE, ">:utf8", $filename)) {
540 print STDERR "doc::set_OID could not write to $filename\n";
541 } else {
542 my $doc_text = &docprint::get_section_xml($self);
543 print OUTFILE $doc_text;
544 close (OUTFILE);
545 }
546 $OID = $self->_calc_OID ($filename);
547 &FileUtils::removeFiles($filename);
548 }
549 }
550 }
551 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
552}
553
554# this uses hashdoc (embedded c thingy) which is faster but still
555# needs a little work to be sufficiently stable
556sub ___set_OID {
557 my $self = shift (@_);
558 my ($OID) = @_;
559
560 # if an OID wasn't provided then calculate hash value based on document
561 if (!defined $OID)
562 {
563 my $hash_text = &docprint::get_section_xml($self);
564 my $hash_len = length($hash_text);
565
566 $OID = &hashdoc::buffer($hash_text,$hash_len);
567 }
568
569 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
570}
571
572# returns the OID for this document
573sub get_OID {
574 my $self = shift (@_);
575 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
576 return $OID if (defined $OID);
577 return "NULL";
578}
579
580sub delete_OID {
581 my $self = shift (@_);
582
583 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
584}
585
586
587# methods for manipulating section names
588
589# returns the name of the top-most section (the top
590# level of the document
591sub get_top_section {
592 my $self = shift (@_);
593
594 return "";
595}
596
597# returns a section
598sub get_parent_section {
599 my $self = shift (@_);
600 my ($section) = @_;
601
602 $section =~ s/(^|\.)\d+$//;
603
604 return $section;
605}
606
607# returns the first child section (or the end child
608# if there isn't any)
609sub get_begin_child {
610 my $self = shift (@_);
611 my ($section) = @_;
612
613 my $section_ptr = $self->_lookup_section($section);
614 return "" unless defined $section_ptr;
615
616 if (defined $section_ptr->{'subsection_order'}->[0]) {
617 return "$section.$section_ptr->{'subsection_order'}->[0]";
618 }
619
620 return $self->get_end_child ($section);
621}
622
623# returns the next child of a parent section
624sub get_next_child {
625 my $self = shift (@_);
626 my ($section) = @_;
627
628 my $parent_section = $self->get_parent_section($section);
629 my $parent_section_ptr = $self->_lookup_section($parent_section);
630 return undef unless defined $parent_section_ptr;
631
632 my ($section_num) = $section =~ /(\d+)$/;
633 return undef unless defined $section_num;
634
635 my $i = 0;
636 my $section_order = $parent_section_ptr->{'subsection_order'};
637 while ($i < scalar(@$section_order)) {
638 last if $section_order->[$i] eq $section_num;
639 $i++;
640 }
641
642 $i++; # the next child
643 if ($i < scalar(@$section_order)) {
644 return $section_order->[$i] if $parent_section eq "";
645 return "$parent_section.$section_order->[$i]";
646 }
647
648 # no more sections in this level
649 return undef;
650}
651
652# returns a reference to a list of children
653sub get_children {
654 my $self = shift (@_);
655 my ($section) = @_;
656
657 my $section_ptr = $self->_lookup_section($section);
658 return [] unless defined $section_ptr;
659
660 my @children = @{$section_ptr->{'subsection_order'}};
661
662 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
663 return \@children;
664}
665
666# returns the child section one past the last one (which
667# is coded as "0")
668sub get_end_child {
669 my $self = shift (@_);
670 my ($section) = @_;
671
672 return $section . ".0" unless $section eq "";
673 return "0";
674}
675
676# returns the next section in book order
677sub get_next_section {
678 my $self = shift (@_);
679 my ($section) = @_;
680
681 return undef unless defined $section;
682
683 my $section_ptr = $self->_lookup_section($section);
684 return undef unless defined $section_ptr;
685
686 # first try to find first child
687 if (defined $section_ptr->{'subsection_order'}->[0]) {
688 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
689 return "$section.$section_ptr->{'subsection_order'}->[0]";
690 }
691
692 do {
693 # try to find sibling
694 my $next_child = $self->get_next_child ($section);
695 return $next_child if (defined $next_child);
696
697 # move up one level
698 $section = $self->get_parent_section ($section);
699 } while $section =~ /\d/;
700
701 return undef;
702}
703
704sub is_leaf_section {
705 my $self = shift (@_);
706 my ($section) = @_;
707
708 my $section_ptr = $self->_lookup_section($section);
709 return 1 unless defined $section_ptr;
710
711 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
712}
713
714# methods for dealing with sections
715
716# returns the name of the inserted section
717sub insert_section {
718 my $self = shift (@_);
719 my ($before_section) = @_;
720
721 # get the child to insert before and its parent section
722 my $parent_section = "";
723 my $before_child = "0";
724 my @before_section = split (/\./, $before_section);
725 if (scalar(@before_section) > 0) {
726 $before_child = pop (@before_section);
727 $parent_section = join (".", @before_section);
728 }
729
730 my $parent_section_ptr = $self->_lookup_section($parent_section);
731 if (!defined $parent_section_ptr) {
732 print STDERR "doc::insert_section couldn't find parent section " .
733 "$parent_section\n";
734 return;
735 }
736
737 # get the next section number
738 my $section_num = $parent_section_ptr->{'next_subsection'}++;
739
740 my $i = 0;
741 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
742 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
743 $i++;
744 }
745
746 # insert the section number into the order list
747 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
748
749 # add this section to the parent section
750 my $section_ptr = {'subsection_order'=>[],
751 'next_subsection'=>1,
752 'subsections'=>{},
753 'metadata'=>[],
754 'text'=>""};
755 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
756
757 # work out the full section number
758 my $section = $parent_section;
759 $section .= "." unless $section eq "";
760 $section .= $section_num;
761
762 return $section;
763}
764
765# creates a pre-named section
766sub create_named_section {
767 my $self = shift (@_);
768 my ($mastersection) = @_;
769
770 my ($num);
771 my $section = $mastersection;
772 my $sectionref = $self;
773
774 while ($section ne "") {
775 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
776 $num =~ s/^0+(\d)/$1/; # remove leading 0s
777 $section = "" unless defined $section;
778
779 if (defined $num) {
780 if (!defined $sectionref->{'subsections'}->{$num}) {
781 push (@{$sectionref->{'subsection_order'}}, $num);
782 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
783 'next_subsection'=>1,
784 'subsections'=>{},
785 'metadata'=>[],
786 'text'=>""};
787 if ($num >= $sectionref->{'next_subsection'}) {
788 $sectionref->{'next_subsection'} = $num + 1;
789 }
790 }
791 $sectionref = $sectionref->{'subsections'}->{$num};
792
793 } else {
794 print STDERR "doc::create_named_section couldn't create section ";
795 print STDERR "$mastersection\n";
796 last;
797 }
798 }
799}
800
801# returns a reference to a list of subsections
802sub list_subsections {
803 my $self = shift (@_);
804 my ($section) = @_;
805
806 my $section_ptr = $self->_lookup_section ($section);
807 if (!defined $section_ptr) {
808 print STDERR "doc::list_subsections couldn't find section $section\n";
809 return [];
810 }
811
812 return [@{$section_ptr->{'subsection_order'}}];
813}
814
815sub delete_section {
816 my $self = shift (@_);
817 my ($section) = @_;
818
819# my $section_ptr = {'subsection_order'=>[],
820# 'next_subsection'=>1,
821# 'subsections'=>{},
822# 'metadata'=>[],
823# 'text'=>""};
824
825 # if this is the top section reset everything
826 if ($section eq "") {
827 $self->{'subsection_order'} = [];
828 $self->{'subsections'} = {};
829 $self->{'metadata'} = [];
830 $self->{'text'} = "";
831 return;
832 }
833
834 # find the parent of the section to delete
835 my $parent_section = "";
836 my $child = "0";
837 my @section = split (/\./, $section);
838 if (scalar(@section) > 0) {
839 $child = pop (@section);
840 $parent_section = join (".", @section);
841 }
842
843 my $parent_section_ptr = $self->_lookup_section($parent_section);
844 if (!defined $parent_section_ptr) {
845 print STDERR "doc::delete_section couldn't find parent section " .
846 "$parent_section\n";
847 return;
848 }
849
850 # remove this section from the subsection_order list
851 my $i = 0;
852 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
853 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
854 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
855 last;
856 }
857 $i++;
858 }
859
860 # remove this section from the subsection hash
861 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
862 undef $parent_section_ptr->{'subsections'}->{$child};
863 }
864}
865
866#--
867# methods for dealing with metadata
868
869# set_metadata_element and get_metadata_element are for metadata
870# which should only have one value. add_meta_data and get_metadata
871# are for metadata which can have more than one value.
872
873# returns the first metadata value which matches field
874
875# This version of get metadata element works much like the one above,
876# except it allows for the namespace portion of a metadata element to
877# be ignored, thus if you are searching for dc.Title, the first piece
878# of matching metadata ending with the name Title (once any namespace
879# is removed) would be returned.
880# 28-11-2003 John Thompson
881sub get_metadata_element {
882 my $self = shift (@_);
883 my ($section, $field, $ignore_namespace) = @_;
884 my ($data);
885
886 $ignore_namespace = 0 unless defined $ignore_namespace;
887
888 my $section_ptr = $self->_lookup_section($section);
889 if (!defined $section_ptr) {
890 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
891 return;
892 }
893
894 # Remove any namespace if we are being told to ignore them
895 if($ignore_namespace) {
896 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
897 }
898
899 foreach $data (@{$section_ptr->{'metadata'}}) {
900
901 my $data_name = $data->[0];
902
903 # Remove any namespace if we are being told to ignore them
904 if($ignore_namespace) {
905 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
906 }
907 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
908 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
909
910 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
911 }
912
913 return undef; # was not found
914}
915
916# returns a list of the form [value1, value2, ...]
917sub get_metadata {
918 my $self = shift (@_);
919 my ($section, $field, $ignore_namespace) = @_;
920 my ($data);
921
922 $ignore_namespace = 0 unless defined $ignore_namespace;
923
924 my $section_ptr = $self->_lookup_section($section);
925 if (!defined $section_ptr) {
926 print STDERR "doc::get_metadata couldn't find section ",
927 $section, "\n";
928 return;
929 }
930
931 # Remove any namespace if we are being told to ignore them
932 if($ignore_namespace) {
933 $field =~ s/^.*\.//;
934 }
935
936 my @metadata = ();
937 foreach $data (@{$section_ptr->{'metadata'}}) {
938
939 my $data_name = $data->[0];
940
941 # Remove any namespace if we are being told to ignore them
942 if($ignore_namespace) {
943 $data_name =~ s/^.*\.//;
944 }
945 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
946 $data_name =~ s/^ex\.([^.]+)$/$1/;
947
948 push (@metadata, $data->[1]) if ($data_name eq $field);
949 }
950
951 return \@metadata;
952}
953
954sub get_metadata_hashmap {
955 my $self = shift (@_);
956 my ($section, $opt_namespace) = @_;
957
958 my $section_ptr = $self->_lookup_section($section);
959 if (!defined $section_ptr) {
960 print STDERR "doc::get_metadata couldn't find section ",
961 $section, "\n";
962 return;
963 }
964
965 my $metadata_hashmap = {};
966 foreach my $data (@{$section_ptr->{'metadata'}}) {
967 my $metaname = $data->[0];
968
969 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
970 if (!defined $metadata_hashmap->{$metaname}) {
971 $metadata_hashmap->{$metaname} = [];
972 }
973 my $metaval_list = $metadata_hashmap->{$metaname};
974 push(@$metaval_list, $data->[1]);
975 }
976 }
977
978 return $metadata_hashmap;
979}
980
981# returns a list of the form [[field,value],[field,value],...]
982sub get_all_metadata {
983 my $self = shift (@_);
984 my ($section) = @_;
985
986 my $section_ptr = $self->_lookup_section($section);
987 if (!defined $section_ptr) {
988 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
989 return;
990 }
991
992 return $section_ptr->{'metadata'};
993}
994
995# $value is optional
996sub delete_metadata {
997 my $self = shift (@_);
998 my ($section, $field, $value) = @_;
999
1000 my $section_ptr = $self->_lookup_section($section);
1001 if (!defined $section_ptr) {
1002 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
1003 return;
1004 }
1005
1006 my $i = 0;
1007 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1008 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1009 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1010 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1011 } else {
1012 $i++;
1013 }
1014 }
1015}
1016
1017sub delete_all_metadata {
1018 my $self = shift (@_);
1019 my ($section) = @_;
1020
1021 my $section_ptr = $self->_lookup_section($section);
1022 if (!defined $section_ptr) {
1023 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1024 return;
1025 }
1026
1027 $section_ptr->{'metadata'} = [];
1028}
1029
1030sub set_metadata_element {
1031 my $self = shift (@_);
1032 my ($section, $field, $value) = @_;
1033
1034 $self->set_utf8_metadata_element ($section, $field,
1035 &unicode::ascii2utf8(\$value));
1036}
1037
1038# set_utf8_metadata_element assumes the text has already been
1039# converted to the UTF-8 encoding.
1040sub set_utf8_metadata_element {
1041 my $self = shift (@_);
1042 my ($section, $field, $value) = @_;
1043
1044 $self->delete_metadata ($section, $field);
1045 $self->add_utf8_metadata ($section, $field, $value);
1046}
1047
1048
1049# add_metadata assumes the text is in (extended) ascii form. For
1050# text which has already been converted to the UTF-8 format use
1051# add_utf8_metadata.
1052sub add_metadata {
1053 my $self = shift (@_);
1054 my ($section, $field, $value) = @_;
1055
1056 $self->add_utf8_metadata ($section, $field,
1057 &unicode::ascii2utf8(\$value));
1058}
1059
1060sub add_utf8_metadata {
1061 my $self = shift (@_);
1062 my ($section, $field, $value) = @_;
1063
1064 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1065 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1066 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1067
1068 my $section_ptr = $self->_lookup_section($section);
1069 if (!defined $section_ptr) {
1070 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1071 return;
1072 }
1073 if (!defined $value) {
1074 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1075 return;
1076 }
1077 if (!defined $field) {
1078 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1079 return;
1080 }
1081
1082 #print STDERR "###$field=$value\n";
1083
1084 # For now, supress this check. Given that text data read in is now
1085 # Unicode aware, then the following block of code can (ironically enough)
1086 # cause our unicode compliant string to be re-encoded (leading to
1087 # a double-encoded UTF-8 string, which we definitely don't want!).
1088
1089
1090 # double check that the value is utf-8
1091 # if (!&unicode::check_is_utf8($value)) {
1092 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1093 # &unicode::ensure_utf8(\$value);
1094 # print STDERR " Tried converting to utf8: $value\n";
1095 # }
1096
1097 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1098 if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1099 {
1100 my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1101 if (defined $mdprefix) {
1102 # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1103 push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1104 }
1105
1106 my $direction;
1107 if($value =~ m/^-/)
1108 {
1109 $direction = ($field eq "Latitude") ? "S" : "W";
1110 }
1111 else
1112 {
1113 $direction = ($field eq "Latitude") ? "N" : "E";
1114 }
1115
1116 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1117 if(defined $beforeDec && defined $afterDec)
1118 {
1119 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1120 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1121
1122 for(my $i = 2; $i <= 4; $i++)
1123 {
1124 if(length($afterDec) >= $i)
1125 {
1126 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1127 }
1128 }
1129
1130 #Only add the metadata if it has not already been added
1131 my $metaMap = $self->get_metadata_hashmap($section);
1132 }
1133 }
1134
1135 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1136}
1137
1138
1139# methods for dealing with text
1140
1141# returns the text for a section
1142sub get_text {
1143 my $self = shift (@_);
1144 my ($section) = @_;
1145
1146 my $section_ptr = $self->_lookup_section($section);
1147 if (!defined $section_ptr) {
1148 print STDERR "doc::get_text couldn't find section " .
1149 "$section\n";
1150 return "";
1151 }
1152
1153 return $section_ptr->{'text'};
1154}
1155
1156# returns the (utf-8 encoded) length of the text for a section
1157sub get_text_length {
1158 my $self = shift (@_);
1159 my ($section) = @_;
1160
1161 my $section_ptr = $self->_lookup_section($section);
1162 if (!defined $section_ptr) {
1163 print STDERR "doc::get_text_length couldn't find section " .
1164 "$section\n";
1165 return 0;
1166 }
1167
1168 return length ($section_ptr->{'text'});
1169}
1170
1171# returns the total length for all the sections
1172sub get_total_text_length {
1173 my $self = shift (@_);
1174
1175 my $section = $self->get_top_section();
1176 my $length = 0;
1177 while (defined $section) {
1178 $length += $self->get_text_length($section);
1179 $section = $self->get_next_section($section);
1180 }
1181 return $length;
1182}
1183
1184sub delete_text {
1185 my $self = shift (@_);
1186 my ($section) = @_;
1187
1188 my $section_ptr = $self->_lookup_section($section);
1189 if (!defined $section_ptr) {
1190 print STDERR "doc::delete_text couldn't find section " .
1191 "$section\n";
1192 return;
1193 }
1194
1195 $section_ptr->{'text'} = "";
1196}
1197
1198# add_text assumes the text is in (extended) ascii form. For
1199# text which has been already converted to the UTF-8 format
1200# use add_utf8_text.
1201sub add_text {
1202 my $self = shift (@_);
1203 my ($section, $text) = @_;
1204
1205 # convert the text to UTF-8 encoded unicode characters
1206 # and add the text
1207 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1208}
1209
1210
1211# add_utf8_text assumes the text to be added has already
1212# been converted to the UTF-8 encoding. For ascii text use
1213# add_text
1214sub add_utf8_text {
1215 my $self = shift (@_);
1216 my ($section, $text) = @_;
1217
1218 my $section_ptr = $self->_lookup_section($section);
1219 if (!defined $section_ptr) {
1220 print STDERR "doc::add_utf8_text couldn't find section " .
1221 "$section\n";
1222 return;
1223 }
1224
1225 $section_ptr->{'text'} .= $text;
1226}
1227
1228# returns the Source meta, which is the utf8 filename generated.
1229# Added a separate method here for convenience
1230sub get_source {
1231 my $self = shift (@_);
1232 return $self->get_metadata_element ($self->get_top_section(), "Source");
1233}
1234
1235# returns the SourceFile meta, which is the url reference to the URL-encoded
1236# version of Source (the utf8 filename). Added a separate method here for convenience
1237sub get_sourcefile {
1238 my $self = shift (@_);
1239 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1240}
1241
1242# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1243# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1244sub get_assocfile_from_sourcefile {
1245 my $self = shift (@_);
1246
1247 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1248 my $top_section = $self->get_top_section();
1249 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1250
1251 # get the actual filename as it exists on the filesystem which this url refers to
1252 $source_file = &unicode::url_to_filename($source_file);
1253 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1254 return $assocfilename;
1255}
1256
1257# methods for dealing with associated files
1258
1259# a file is associated with a document, NOT a section.
1260# if section is defined it is noted in the data structure
1261# only so that files associated from a particular section
1262# may be removed later (using delete_section_assoc_files)
1263sub associate_file {
1264 my $self = shift (@_);
1265 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1266 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1267
1268 # remove all associated files with the same name
1269 $self->delete_assoc_file ($assoc_filename);
1270
1271 # Too harsh a requirement
1272 # Definitely get HTML docs, for example, with some missing
1273 # support files
1274# if (!&util::fd_exists($real_filename)) {
1275# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1276# exit -1;
1277# }
1278
1279# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1280# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1281## my $utf8_filename = Encode::encode("utf8",$filename);
1282
1283 push (@{$self->{'associated_files'}},
1284 [$real_filename, $assoc_filename, $mime_type, $section]);
1285}
1286
1287# returns a list of associated files in the form
1288# [[real_filename, assoc_filename, mimetype], ...]
1289sub get_assoc_files {
1290 my $self = shift (@_);
1291
1292 return $self->{'associated_files'};
1293}
1294
1295# the following two methods used to keep track of original associated files
1296# for incremental building. eg a txt file used by an item file does not end
1297# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1298# file for incremental build
1299sub associate_source_file {
1300 my $self = shift (@_);
1301 my ($full_filename) = @_;
1302
1303 push (@{$self->{'source_assoc_files'}}, $full_filename);
1304
1305}
1306
1307sub get_source_assoc_files {
1308 my $self = shift (@_);
1309
1310 return $self->{'source_assoc_files'};
1311
1312
1313}
1314sub metadata_file {
1315 my $self = shift (@_);
1316 my ($real_filename, $filename) = @_;
1317
1318 push (@{$self->{'metadata_files'}},
1319 [$real_filename, $filename]);
1320}
1321
1322# used for writing out the archiveinf-doc info database, to list all the metadata files
1323sub get_meta_files {
1324 my $self = shift (@_);
1325
1326 return $self->{'metadata_files'};
1327}
1328
1329sub delete_section_assoc_files {
1330 my $self = shift (@_);
1331 my ($section) = @_;
1332
1333 my $i=0;
1334 while ($i < scalar (@{$self->{'associated_files'}})) {
1335 if (defined $self->{'associated_files'}->[$i]->[3] &&
1336 $self->{'associated_files'}->[$i]->[3] eq $section) {
1337 splice (@{$self->{'associated_files'}}, $i, 1);
1338 } else {
1339 $i++;
1340 }
1341 }
1342}
1343
1344sub delete_assoc_file {
1345 my $self = shift (@_);
1346 my ($assoc_filename) = @_;
1347
1348 my $i=0;
1349 while ($i < scalar (@{$self->{'associated_files'}})) {
1350 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1351 splice (@{$self->{'associated_files'}}, $i, 1);
1352 } else {
1353 $i++;
1354 }
1355 }
1356}
1357
1358sub reset_nextsection_ptr {
1359 my $self = shift (@_);
1360 my ($section) = @_;
1361
1362 my $section_ptr = $self->_lookup_section($section);
1363 $section_ptr->{'next_subsection'} = 1;
1364}
1365
13661;
Note: See TracBrowser for help on using the repository browser.