source: main/trunk/greenstone2/perllib/doc.pm@ 27393

Last change on this file since 27393 was 27393, checked in by jmt12, 11 years ago

Replace hardcoded -e with FileUtils::fileExists() call and util::rm() with FileUtils::removeFiles() call

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 38.2 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use FileUtils;
40use ghtml;
41use File::stat;
42##use hashdoc;
43use docprint;
44
45# the document type may be indexed_doc, nonindexed_doc, or
46# classification
47
48our $OIDcount = 0;
49
50# rename_method can be 'url', 'none', 'base64'
51sub new {
52 my $class = shift (@_);
53 my ($source_filename, $doc_type, $rename_method) = @_;
54
55
56 my $self = bless {'associated_files'=>[],
57 'subsection_order'=>[],
58 'next_subsection'=>1,
59 'subsections'=>{},
60 'metadata'=>[],
61 'text'=>"",
62 'OIDtype'=>"hash"}, $class;
63
64 # used to set lastmodified here, but this can screw up the HASH ids, so
65 # the docsave processor now calls set_lastmodified
66
67 $self->set_source_path($source_filename);
68
69 if (defined $source_filename) {
70 $source_filename = &util::filename_within_collection($source_filename);
71 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
72 $self->set_source_filename ($source_filename, $rename_method);
73 }
74
75 $self->set_doc_type ($doc_type) if defined $doc_type;
76
77 return $self;
78}
79
80
81sub set_source_path
82{
83 my $self = shift @_;
84 my ($source_filename) = @_;
85
86 if (defined $source_filename) {
87 # On Windows the source_filename can be in terse DOS format
88 # e.g. test~1.txt
89
90 $self->{'terse_source_path'} = $source_filename;
91
92 # Use the FileUtil library methods as they are aware of more special
93 # cases such as HDFS [jmt12]
94 if (&FileUtils::fileExists($source_filename))
95 {
96 # See if we can do better for Windows with a filename
97 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
98 require Win32;
99 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
100 }
101 else {
102 # For Unix-based systems, there is no difference between the two
103 $self->{'source_path'} = $source_filename;
104 }
105 }
106 else {
107 print STDERR "Warning: In doc::set_source_path(), file\n";
108 print STDERR " $source_filename\n";
109 print STDERR " does not exist\n";
110
111 # (default) Set it to whatever we were given
112 $self->{'source_path'} = $source_filename;
113 }
114 }
115 else {
116 # Previous code for setting source_path allowed for
117 # it to be undefined, so continue this practice
118 $self->{'terse_source_path'} = undef;
119 $self->{'source_path'} = undef;
120 }
121}
122
123
124sub get_source_path
125{
126 my $self = shift @_;
127
128 return $self->{'terse_source_path'};
129}
130
131# set lastmodified for OAI purposes, added by GRB, moved by kjdon
132sub set_oailastmodified {
133 my $self = shift (@_);
134
135 my $source_path = $self->{'terse_source_path'};
136
137 if (defined $source_path && (-e $source_path)) {
138 my $current_time = time;
139
140 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
141 $wday, $yday, $isdst) = localtime($current_time);
142
143 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
144
145 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
146 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
147 }
148}
149
150# no longer used for OAI purposes, since lastmodified is not what we want as the
151# Datestamp of a document. This doc metadata may be useful for general purposes.
152sub set_lastmodified {
153 my $self = shift (@_);
154
155 my $source_path = $self->{'terse_source_path'};
156
157 if (defined $source_path && (-e $source_path)) {
158
159 my $file_stat = stat($source_path);
160 my $mtime = $file_stat->mtime;
161 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
162 $wday, $yday, $isdst) = localtime($mtime);
163
164 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
165
166 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
167 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
168 }
169}
170
171# clone the $self object
172sub duplicate {
173 my $self = shift (@_);
174
175 my $newobj = {};
176
177 foreach my $k (keys %$self) {
178 $newobj->{$k} = &clone ($self->{$k});
179 }
180
181 bless $newobj, ref($self);
182 return $newobj;
183}
184
185sub clone {
186 my ($from) = @_;
187 my $type = ref ($from);
188
189 if ($type eq "HASH") {
190 my $to = {};
191 foreach my $key (keys %$from) {
192 $to->{$key} = &clone ($from->{$key});
193 }
194 return $to;
195 } elsif ($type eq "ARRAY") {
196 my $to = [];
197 foreach my $v (@$from) {
198 push (@$to, &clone ($v));
199 }
200 return $to;
201 } else {
202 return $from;
203 }
204}
205
206sub set_OIDtype {
207 my $self = shift (@_);
208 my ($type, $metadata) = @_;
209
210 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
211 $self->{'OIDtype'} = $type;
212 } else {
213 $self->{'OIDtype'} = "hash";
214 }
215
216 if ($type =~ /^assigned$/) {
217 if (defined $metadata) {
218 $self->{'OIDmetadata'} = $metadata;
219 } else {
220 $self->{'OIDmetadata'} = "dc.Identifier";
221 }
222 }
223}
224
225# rename_method can be 'url', 'none', 'base64'
226sub set_source_filename {
227 my $self = shift (@_);
228 my ($source_filename, $rename_method) = @_;
229
230 # Since the gsdlsourcefilename element goes into the doc.xml it has
231 # to be utf8. However, it should also *represent* the source filename
232 # (in the import directory) which may not be utf8 at all.
233 # For instance, if this meta element (gsdlsourcefilename) will be used
234 # by other applications that parse doc.xml in order to locate
235 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
236 # encode the real filename as this is a binary-to-text encoding meaning
237 # that the resulting string is ASCII (utf8). Decoding will give the original.
238
239# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
240
241 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
242 $source_filename = $self->encode_filename($source_filename, $rename_method);
243# my ($srcfilename,$dirname,$suffix)
244# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
245# print STDERR "-> $srcfilename -> ";
246# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
247# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
248# print STDERR "$source_filename\n";
249
250 $self->set_utf8_metadata_element ($self->get_top_section(),
251 "gsdlsourcefilename",
252 $source_filename);
253}
254
255sub encode_filename {
256 my $self = shift (@_);
257 my ($source_filename, $rename_method) = @_;
258
259 my ($srcfilename,$dirname,$suffix)
260 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
261# print STDERR "-> $srcfilename -> ";
262 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
263 $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
264
265 return $source_filename;
266}
267
268sub set_converted_filename {
269 my $self = shift (@_);
270 my ($converted_filename) = @_;
271
272 # we know the converted filename is utf8
273 $self->set_utf8_metadata_element ($self->get_top_section(),
274 "gsdlconvertedfilename",
275 $converted_filename);
276}
277
278# returns the source_filename as it was provided
279sub get_unmodified_source_filename {
280 my $self = shift (@_);
281
282 return $self->{'terse_source_path'};
283}
284
285# returns the source_filename with whatever rename_method was given
286sub get_source_filename {
287 my $self = shift (@_);
288
289 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
290}
291
292
293
294# returns converted filename if available else returns source filename
295sub get_filename_for_hashing {
296 my $self = shift (@_);
297
298 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
299
300 if (!defined $filename) {
301 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
302 # if NULPlug processed file, then don't give a filename
303 if (defined $plugin_name && $plugin_name eq "NULPlug") {
304 $filename = undef;
305 } else { # returns the URL encoded source filename!
306 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
307 }
308 }
309
310 if (!&FileUtils::isFilenameAbsolute($filename)) {
311 $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
312 }
313
314 return $filename;
315}
316
317sub set_doc_type {
318 my $self = shift (@_);
319 my ($doc_type) = @_;
320
321 $self->set_metadata_element ($self->get_top_section(),
322 "gsdldoctype",
323 $doc_type);
324}
325
326# returns the gsdldoctype as it was provided
327# the default of "indexed_doc" is used if no document
328# type was provided
329sub get_doc_type {
330 my $self = shift (@_);
331
332 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
333 return $doc_type if (defined $doc_type);
334 return "indexed_doc";
335}
336
337
338# look up the reference to the a particular section
339sub _lookup_section {
340 my $self = shift (@_);
341 my ($section) = @_;
342
343 my ($num);
344 my $sectionref = $self;
345
346 while (defined $section && $section ne "") {
347
348 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
349
350 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
351
352 $section = "" unless defined $section;
353
354
355 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
356 $sectionref = $sectionref->{'subsections'}->{$num};
357 } else {
358 return undef;
359 }
360 }
361
362 return $sectionref;
363}
364
365# calculate OID by hashing the contents of the document
366sub _calc_OID {
367 my $self = shift (@_);
368 my ($filename) = @_;
369
370
371 my $osexe = &util::get_os_exe();
372
373 my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
374 $ENV{'GSDLOS'},"hashfile$osexe");
375
376 my $result = "NULL";
377
378
379 if (-e "$hashfile_exe") {
380# $result = `\"$hashfile_exe\" \"$filename\"`;
381# $result = `hashfile$osexe \"$filename\" 2>&1`;
382 $result = `hashfile$osexe \"$filename\"`;
383
384 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
385 } else {
386 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
387 }
388 return "HASH$result";
389}
390
391# methods dealing with OID, not groups of them.
392
393# if $OID is not provided one is calculated
394sub set_OID {
395 my $self = shift (@_);
396 my ($OID) = @_;
397
398 my $use_hash_oid = 0;
399 # if an OID wasn't provided calculate one
400 if (!defined $OID) {
401 $OID = "NULL";
402 if ($self->{'OIDtype'} =~ /^hash/) {
403 $use_hash_oid = 1;
404 } elsif ($self->{'OIDtype'} eq "incremental") {
405 $OID = "D" . $OIDcount;
406 $OIDcount ++;
407 } elsif ($self->{'OIDtype'} eq "filename") {
408 my $filename = $self->get_source_filename();
409 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
410 $OID = &util::tidy_up_oid($OID);
411 } elsif ($self->{'OIDtype'} eq "full_filename") {
412 my $source_filename = $self->get_source_filename();
413 my $dirsep = &util::get_os_dirsep();
414
415 $source_filename =~ s/^import$dirsep//;
416 $source_filename =~ s/$dirsep/-/g;
417 $source_filename =~ s/\./_/g;
418
419 $OID = $source_filename;
420 $OID = &util::tidy_up_oid($OID);
421 } elsif ($self->{'OIDtype'} eq "dirname") {
422 my $filename = $self->get_source_filename();
423 if (defined($filename)) { # && -e $filename) {
424 # get the immediate parent directory
425 $OID = &File::Basename::dirname($filename);
426 if (defined $OID) {
427 $OID = &File::Basename::basename($OID);
428 $OID = &util::tidy_up_oid($OID);
429 } else {
430 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
431 $use_hash_oid = 1;
432 }
433 } else {
434 print STDERR "Failed to find a filename, generating hash id\n";
435 $use_hash_oid = 1;
436 }
437
438 } elsif ($self->{'OIDtype'} eq "assigned") {
439 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
440 if (defined $identifier && $identifier ne "") {
441 $OID = $identifier;
442 $OID = &util::tidy_up_oid($OID);
443 } else {
444 # need a hash id
445 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
446 $use_hash_oid = 1;
447 }
448
449 } else {
450 $use_hash_oid = 1;
451 }
452
453 if ($use_hash_oid) {
454 my $hash_on_file = 1;
455 my $hash_on_ga_xml = 0;
456
457 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
458 $hash_on_file = 0;
459 $hash_on_ga_xml = 1;
460 }
461
462 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
463 $hash_on_file = 0;
464 $hash_on_ga_xml = 0;
465
466 my $source_filename = $self->get_source_filename();
467 my $dirsep = &util::get_os_dirsep();
468
469 $source_filename =~ s/^import$dirsep//;
470 $source_filename =~ s/$dirsep/-/g;
471 $source_filename =~ s/\./_/g;
472
473 # If the filename is very short then (handled naively)
474 # this can cause conjestion in the hash-values
475 # computed, leading documents sharing the same leading
476 # Hex values in the computed has.
477 #
478 # The solution taken here is to replace the name of
479 # the file name a sufficient number of times (up to
480 # the character limit defined in 'rep_limit' and
481 # make that the content that is hashed on
482
483 # *** Think twice before changing the following value
484 # as it will break backward compatability of computed
485 # document HASH values
486
487 my $rep_limit = 256;
488 my $hash_content = undef;
489
490 if (length($source_filename)<$rep_limit) {
491 my $rep_string = "$source_filename|";
492 my $rs_len = length($rep_string);
493
494 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
495
496 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
497 }
498 else {
499 $hash_content = $source_filename;
500 }
501
502 my $filename = &util::get_tmp_filename();
503 if (!open (OUTFILE, ">:utf8", $filename)) {
504 print STDERR "doc::set_OID could not write to $filename\n";
505 } else {
506 print OUTFILE $hash_content;
507 close (OUTFILE);
508 }
509 $OID = $self->_calc_OID ($filename);
510
511 &util::rm ($filename);
512 }
513
514 if ($hash_on_file) {
515 # "hash" OID - feed file to hashfile.exe
516 my $filename = $self->get_filename_for_hashing();
517
518 # -z: don't want to hash on the file if it is zero size
519 if (defined($filename) && -e $filename && !-z $filename) {
520 $OID = $self->_calc_OID ($filename);
521 } else {
522 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
523 }
524 }
525
526 if ($hash_on_ga_xml) {
527 # In addition being asked to explicity calculate the has based on the GA file,
528 # can also end up coming into this block is doing 'hash_on_file' but the file
529 # itself is of zero bytes (as could be the case with 'doc.nul' file
530
531 my $filename = &util::get_tmp_filename();
532 if (!open (OUTFILE, ">:utf8", $filename)) {
533 print STDERR "doc::set_OID could not write to $filename\n";
534 } else {
535 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
536 print OUTFILE $doc_text;
537 close (OUTFILE);
538 }
539 $OID = $self->_calc_OID ($filename);
540 &FileUtils::removeFiles($filename);
541 }
542 }
543 }
544 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
545}
546
547# this uses hashdoc (embedded c thingy) which is faster but still
548# needs a little work to be suffiently stable
549sub ___set_OID {
550 my $self = shift (@_);
551 my ($OID) = @_;
552
553 # if an OID wasn't provided then calculate hash value based on document
554 if (!defined $OID)
555 {
556 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
557 my $hash_len = length($hash_text);
558
559 $OID = &hashdoc::buffer($hash_text,$hash_len);
560 }
561
562 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
563}
564
565# returns the OID for this document
566sub get_OID {
567 my $self = shift (@_);
568 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
569 return $OID if (defined $OID);
570 return "NULL";
571}
572
573sub delete_OID {
574 my $self = shift (@_);
575
576 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
577}
578
579
580# methods for manipulating section names
581
582# returns the name of the top-most section (the top
583# level of the document
584sub get_top_section {
585 my $self = shift (@_);
586
587 return "";
588}
589
590# returns a section
591sub get_parent_section {
592 my $self = shift (@_);
593 my ($section) = @_;
594
595 $section =~ s/(^|\.)\d+$//;
596
597 return $section;
598}
599
600# returns the first child section (or the end child
601# if there isn't any)
602sub get_begin_child {
603 my $self = shift (@_);
604 my ($section) = @_;
605
606 my $section_ptr = $self->_lookup_section($section);
607 return "" unless defined $section_ptr;
608
609 if (defined $section_ptr->{'subsection_order'}->[0]) {
610 return "$section.$section_ptr->{'subsection_order'}->[0]";
611 }
612
613 return $self->get_end_child ($section);
614}
615
616# returns the next child of a parent section
617sub get_next_child {
618 my $self = shift (@_);
619 my ($section) = @_;
620
621 my $parent_section = $self->get_parent_section($section);
622 my $parent_section_ptr = $self->_lookup_section($parent_section);
623 return undef unless defined $parent_section_ptr;
624
625 my ($section_num) = $section =~ /(\d+)$/;
626 return undef unless defined $section_num;
627
628 my $i = 0;
629 my $section_order = $parent_section_ptr->{'subsection_order'};
630 while ($i < scalar(@$section_order)) {
631 last if $section_order->[$i] eq $section_num;
632 $i++;
633 }
634
635 $i++; # the next child
636 if ($i < scalar(@$section_order)) {
637 return $section_order->[$i] if $parent_section eq "";
638 return "$parent_section.$section_order->[$i]";
639 }
640
641 # no more sections in this level
642 return undef;
643}
644
645# returns a reference to a list of children
646sub get_children {
647 my $self = shift (@_);
648 my ($section) = @_;
649
650 my $section_ptr = $self->_lookup_section($section);
651 return [] unless defined $section_ptr;
652
653 my @children = @{$section_ptr->{'subsection_order'}};
654
655 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
656 return \@children;
657}
658
659# returns the child section one past the last one (which
660# is coded as "0")
661sub get_end_child {
662 my $self = shift (@_);
663 my ($section) = @_;
664
665 return $section . ".0" unless $section eq "";
666 return "0";
667}
668
669# returns the next section in book order
670sub get_next_section {
671 my $self = shift (@_);
672 my ($section) = @_;
673
674 return undef unless defined $section;
675
676 my $section_ptr = $self->_lookup_section($section);
677 return undef unless defined $section_ptr;
678
679 # first try to find first child
680 if (defined $section_ptr->{'subsection_order'}->[0]) {
681 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
682 return "$section.$section_ptr->{'subsection_order'}->[0]";
683 }
684
685 do {
686 # try to find sibling
687 my $next_child = $self->get_next_child ($section);
688 return $next_child if (defined $next_child);
689
690 # move up one level
691 $section = $self->get_parent_section ($section);
692 } while $section =~ /\d/;
693
694 return undef;
695}
696
697sub is_leaf_section {
698 my $self = shift (@_);
699 my ($section) = @_;
700
701 my $section_ptr = $self->_lookup_section($section);
702 return 1 unless defined $section_ptr;
703
704 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
705}
706
707# methods for dealing with sections
708
709# returns the name of the inserted section
710sub insert_section {
711 my $self = shift (@_);
712 my ($before_section) = @_;
713
714 # get the child to insert before and its parent section
715 my $parent_section = "";
716 my $before_child = "0";
717 my @before_section = split (/\./, $before_section);
718 if (scalar(@before_section) > 0) {
719 $before_child = pop (@before_section);
720 $parent_section = join (".", @before_section);
721 }
722
723 my $parent_section_ptr = $self->_lookup_section($parent_section);
724 if (!defined $parent_section_ptr) {
725 print STDERR "doc::insert_section couldn't find parent section " .
726 "$parent_section\n";
727 return;
728 }
729
730 # get the next section number
731 my $section_num = $parent_section_ptr->{'next_subsection'}++;
732
733 my $i = 0;
734 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
735 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
736 $i++;
737 }
738
739 # insert the section number into the order list
740 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
741
742 # add this section to the parent section
743 my $section_ptr = {'subsection_order'=>[],
744 'next_subsection'=>1,
745 'subsections'=>{},
746 'metadata'=>[],
747 'text'=>""};
748 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
749
750 # work out the full section number
751 my $section = $parent_section;
752 $section .= "." unless $section eq "";
753 $section .= $section_num;
754
755 return $section;
756}
757
758# creates a pre-named section
759sub create_named_section {
760 my $self = shift (@_);
761 my ($mastersection) = @_;
762
763 my ($num);
764 my $section = $mastersection;
765 my $sectionref = $self;
766
767 while ($section ne "") {
768 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
769 $num =~ s/^0+(\d)/$1/; # remove leading 0s
770 $section = "" unless defined $section;
771
772 if (defined $num) {
773 if (!defined $sectionref->{'subsections'}->{$num}) {
774 push (@{$sectionref->{'subsection_order'}}, $num);
775 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
776 'next_subsection'=>1,
777 'subsections'=>{},
778 'metadata'=>[],
779 'text'=>""};
780 if ($num >= $sectionref->{'next_subsection'}) {
781 $sectionref->{'next_subsection'} = $num + 1;
782 }
783 }
784 $sectionref = $sectionref->{'subsections'}->{$num};
785
786 } else {
787 print STDERR "doc::create_named_section couldn't create section ";
788 print STDERR "$mastersection\n";
789 last;
790 }
791 }
792}
793
794# returns a reference to a list of subsections
795sub list_subsections {
796 my $self = shift (@_);
797 my ($section) = @_;
798
799 my $section_ptr = $self->_lookup_section ($section);
800 if (!defined $section_ptr) {
801 print STDERR "doc::list_subsections couldn't find section $section\n";
802 return [];
803 }
804
805 return [@{$section_ptr->{'subsection_order'}}];
806}
807
808sub delete_section {
809 my $self = shift (@_);
810 my ($section) = @_;
811
812# my $section_ptr = {'subsection_order'=>[],
813# 'next_subsection'=>1,
814# 'subsections'=>{},
815# 'metadata'=>[],
816# 'text'=>""};
817
818 # if this is the top section reset everything
819 if ($section eq "") {
820 $self->{'subsection_order'} = [];
821 $self->{'subsections'} = {};
822 $self->{'metadata'} = [];
823 $self->{'text'} = "";
824 return;
825 }
826
827 # find the parent of the section to delete
828 my $parent_section = "";
829 my $child = "0";
830 my @section = split (/\./, $section);
831 if (scalar(@section) > 0) {
832 $child = pop (@section);
833 $parent_section = join (".", @section);
834 }
835
836 my $parent_section_ptr = $self->_lookup_section($parent_section);
837 if (!defined $parent_section_ptr) {
838 print STDERR "doc::delete_section couldn't find parent section " .
839 "$parent_section\n";
840 return;
841 }
842
843 # remove this section from the subsection_order list
844 my $i = 0;
845 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
846 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
847 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
848 last;
849 }
850 $i++;
851 }
852
853 # remove this section from the subsection hash
854 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
855 undef $parent_section_ptr->{'subsections'}->{$child};
856 }
857}
858
859#--
860# methods for dealing with metadata
861
862# set_metadata_element and get_metadata_element are for metadata
863# which should only have one value. add_meta_data and get_metadata
864# are for metadata which can have more than one value.
865
866# returns the first metadata value which matches field
867
868# This version of get metadata element works much like the one above,
869# except it allows for the namespace portion of a metadata element to
870# be ignored, thus if you are searching for dc.Title, the first piece
871# of matching metadata ending with the name Title (once any namespace
872# is removed) would be returned.
873# 28-11-2003 John Thompson
874sub get_metadata_element {
875 my $self = shift (@_);
876 my ($section, $field, $ignore_namespace) = @_;
877 my ($data);
878
879 $ignore_namespace = 0 unless defined $ignore_namespace;
880
881 my $section_ptr = $self->_lookup_section($section);
882 if (!defined $section_ptr) {
883 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
884 return;
885 }
886
887 # Remove any namespace if we are being told to ignore them
888 if($ignore_namespace) {
889 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
890 }
891
892 foreach $data (@{$section_ptr->{'metadata'}}) {
893
894 my $data_name = $data->[0];
895
896 # Remove any namespace if we are being told to ignore them
897 if($ignore_namespace) {
898 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
899 }
900 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
901 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
902
903 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
904 }
905
906 return undef; # was not found
907}
908
909# returns a list of the form [value1, value2, ...]
910sub get_metadata {
911 my $self = shift (@_);
912 my ($section, $field, $ignore_namespace) = @_;
913 my ($data);
914
915 $ignore_namespace = 0 unless defined $ignore_namespace;
916
917 my $section_ptr = $self->_lookup_section($section);
918 if (!defined $section_ptr) {
919 print STDERR "doc::get_metadata couldn't find section ",
920 $section, "\n";
921 return;
922 }
923
924 # Remove any namespace if we are being told to ignore them
925 if($ignore_namespace) {
926 $field =~ s/^.*\.//;
927 }
928
929 my @metadata = ();
930 foreach $data (@{$section_ptr->{'metadata'}}) {
931
932 my $data_name = $data->[0];
933
934 # Remove any namespace if we are being told to ignore them
935 if($ignore_namespace) {
936 $data_name =~ s/^.*\.//;
937 }
938 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
939 $data_name =~ s/^ex\.([^.]+)$/$1/;
940
941 push (@metadata, $data->[1]) if ($data_name eq $field);
942 }
943
944 return \@metadata;
945}
946
947sub get_metadata_hashmap {
948 my $self = shift (@_);
949 my ($section, $opt_namespace) = @_;
950
951 my $section_ptr = $self->_lookup_section($section);
952 if (!defined $section_ptr) {
953 print STDERR "doc::get_metadata couldn't find section ",
954 $section, "\n";
955 return;
956 }
957
958 my $metadata_hashmap = {};
959 foreach my $data (@{$section_ptr->{'metadata'}}) {
960 my $metaname = $data->[0];
961
962 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
963 if (!defined $metadata_hashmap->{$metaname}) {
964 $metadata_hashmap->{$metaname} = [];
965 }
966 my $metaval_list = $metadata_hashmap->{$metaname};
967 push(@$metaval_list, $data->[1]);
968 }
969 }
970
971 return $metadata_hashmap;
972}
973
974# returns a list of the form [[field,value],[field,value],...]
975sub get_all_metadata {
976 my $self = shift (@_);
977 my ($section) = @_;
978
979 my $section_ptr = $self->_lookup_section($section);
980 if (!defined $section_ptr) {
981 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
982 return;
983 }
984
985 return $section_ptr->{'metadata'};
986}
987
988# $value is optional
989sub delete_metadata {
990 my $self = shift (@_);
991 my ($section, $field, $value) = @_;
992
993 my $section_ptr = $self->_lookup_section($section);
994 if (!defined $section_ptr) {
995 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
996 return;
997 }
998
999 my $i = 0;
1000 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1001 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1002 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1003 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1004 } else {
1005 $i++;
1006 }
1007 }
1008}
1009
1010sub delete_all_metadata {
1011 my $self = shift (@_);
1012 my ($section) = @_;
1013
1014 my $section_ptr = $self->_lookup_section($section);
1015 if (!defined $section_ptr) {
1016 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1017 return;
1018 }
1019
1020 $section_ptr->{'metadata'} = [];
1021}
1022
1023sub set_metadata_element {
1024 my $self = shift (@_);
1025 my ($section, $field, $value) = @_;
1026
1027 $self->set_utf8_metadata_element ($section, $field,
1028 &unicode::ascii2utf8(\$value));
1029}
1030
1031# set_utf8_metadata_element assumes the text has already been
1032# converted to the UTF-8 encoding.
1033sub set_utf8_metadata_element {
1034 my $self = shift (@_);
1035 my ($section, $field, $value) = @_;
1036
1037 $self->delete_metadata ($section, $field);
1038 $self->add_utf8_metadata ($section, $field, $value);
1039}
1040
1041
1042# add_metadata assumes the text is in (extended) ascii form. For
1043# text which has already been converted to the UTF-8 format use
1044# add_utf8_metadata.
1045sub add_metadata {
1046 my $self = shift (@_);
1047 my ($section, $field, $value) = @_;
1048
1049 $self->add_utf8_metadata ($section, $field,
1050 &unicode::ascii2utf8(\$value));
1051}
1052
1053sub add_utf8_metadata {
1054 my $self = shift (@_);
1055 my ($section, $field, $value) = @_;
1056
1057 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1058 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1059 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1060
1061 my $section_ptr = $self->_lookup_section($section);
1062 if (!defined $section_ptr) {
1063 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1064 return;
1065 }
1066 if (!defined $value) {
1067 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1068 return;
1069 }
1070 if (!defined $field) {
1071 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1072 return;
1073 }
1074
1075 #print STDERR "###$field=$value\n";
1076
1077 # For now, supress this check. Given that text data read in is now
1078 # Unicode aware, then the following block of code can (ironically enough)
1079 # cause our unicode compliant string to be re-encoded (leading to
1080 # a double-encoded UTF-8 string, which we definitely don't want!).
1081
1082
1083 # double check that the value is utf-8
1084 # if (!&unicode::check_is_utf8($value)) {
1085 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1086 # &unicode::ensure_utf8(\$value);
1087 # print STDERR " Tried converting to utf8: $value\n";
1088 # }
1089
1090 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1091 if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1092 {
1093 my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1094 if (defined $mdprefix) {
1095 # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1096 push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1097 }
1098
1099 my $direction;
1100 if($value =~ m/^-/)
1101 {
1102 $direction = ($field eq "Latitude") ? "S" : "W";
1103 }
1104 else
1105 {
1106 $direction = ($field eq "Latitude") ? "N" : "E";
1107 }
1108
1109 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1110 if(defined $beforeDec && defined $afterDec)
1111 {
1112 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1113 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1114
1115 for(my $i = 2; $i <= 4; $i++)
1116 {
1117 if(length($afterDec) >= $i)
1118 {
1119 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1120 }
1121 }
1122
1123 #Only add the metadata if it has not already been added
1124 my $metaMap = $self->get_metadata_hashmap($section);
1125 }
1126 }
1127
1128 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1129}
1130
1131
1132# methods for dealing with text
1133
1134# returns the text for a section
1135sub get_text {
1136 my $self = shift (@_);
1137 my ($section) = @_;
1138
1139 my $section_ptr = $self->_lookup_section($section);
1140 if (!defined $section_ptr) {
1141 print STDERR "doc::get_text couldn't find section " .
1142 "$section\n";
1143 return "";
1144 }
1145
1146 return $section_ptr->{'text'};
1147}
1148
1149# returns the (utf-8 encoded) length of the text for a section
1150sub get_text_length {
1151 my $self = shift (@_);
1152 my ($section) = @_;
1153
1154 my $section_ptr = $self->_lookup_section($section);
1155 if (!defined $section_ptr) {
1156 print STDERR "doc::get_text_length couldn't find section " .
1157 "$section\n";
1158 return 0;
1159 }
1160
1161 return length ($section_ptr->{'text'});
1162}
1163
1164# returns the total length for all the sections
1165sub get_total_text_length {
1166 my $self = shift (@_);
1167
1168 my $section = $self->get_top_section();
1169 my $length = 0;
1170 while (defined $section) {
1171 $length += $self->get_text_length($section);
1172 $section = $self->get_next_section($section);
1173 }
1174 return $length;
1175}
1176
1177sub delete_text {
1178 my $self = shift (@_);
1179 my ($section) = @_;
1180
1181 my $section_ptr = $self->_lookup_section($section);
1182 if (!defined $section_ptr) {
1183 print STDERR "doc::delete_text couldn't find section " .
1184 "$section\n";
1185 return;
1186 }
1187
1188 $section_ptr->{'text'} = "";
1189}
1190
1191# add_text assumes the text is in (extended) ascii form. For
1192# text which has been already converted to the UTF-8 format
1193# use add_utf8_text.
1194sub add_text {
1195 my $self = shift (@_);
1196 my ($section, $text) = @_;
1197
1198 # convert the text to UTF-8 encoded unicode characters
1199 # and add the text
1200 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1201}
1202
1203
1204# add_utf8_text assumes the text to be added has already
1205# been converted to the UTF-8 encoding. For ascii text use
1206# add_text
1207sub add_utf8_text {
1208 my $self = shift (@_);
1209 my ($section, $text) = @_;
1210
1211 my $section_ptr = $self->_lookup_section($section);
1212 if (!defined $section_ptr) {
1213 print STDERR "doc::add_utf8_text couldn't find section " .
1214 "$section\n";
1215 return;
1216 }
1217
1218 $section_ptr->{'text'} .= $text;
1219}
1220
1221# returns the Source meta, which is the utf8 filename generated.
1222# Added a separate method here for convenience
1223sub get_source {
1224 my $self = shift (@_);
1225 return $self->get_metadata_element ($self->get_top_section(), "Source");
1226}
1227
1228# returns the SourceFile meta, which is the url reference to the URL-encoded
1229# version of Source (the utf8 filename). Added a separate method here for convenience
1230sub get_sourcefile {
1231 my $self = shift (@_);
1232 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1233}
1234
1235# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1236# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1237sub get_assocfile_from_sourcefile {
1238 my $self = shift (@_);
1239
1240 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1241 my $top_section = $self->get_top_section();
1242 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1243
1244 # get the actual filename as it exists on the filesystem which this url refers to
1245 $source_file = &unicode::url_to_filename($source_file);
1246 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1247 return $assocfilename;
1248}
1249
1250# methods for dealing with associated files
1251
1252# a file is associated with a document, NOT a section.
1253# if section is defined it is noted in the data structure
1254# only so that files associated from a particular section
1255# may be removed later (using delete_section_assoc_files)
1256sub associate_file {
1257 my $self = shift (@_);
1258 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1259 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1260
1261 # remove all associated files with the same name
1262 $self->delete_assoc_file ($assoc_filename);
1263
1264 # Too harsh a requirement
1265 # Definitely get HTML docs, for example, with some missing
1266 # support files
1267# if (!&util::fd_exists($real_filename)) {
1268# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1269# exit -1;
1270# }
1271
1272# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1273# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1274## my $utf8_filename = Encode::encode("utf8",$filename);
1275
1276 push (@{$self->{'associated_files'}},
1277 [$real_filename, $assoc_filename, $mime_type, $section]);
1278}
1279
1280# returns a list of associated files in the form
1281# [[real_filename, assoc_filename, mimetype], ...]
1282sub get_assoc_files {
1283 my $self = shift (@_);
1284
1285 return $self->{'associated_files'};
1286}
1287
1288# the following two methods used to keep track of original associated files
1289# for incremental building. eg a txt file used by an item file does not end
1290# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1291# file for incremental build
1292sub associate_source_file {
1293 my $self = shift (@_);
1294 my ($full_filename) = @_;
1295
1296 push (@{$self->{'source_assoc_files'}}, $full_filename);
1297
1298}
1299
1300sub get_source_assoc_files {
1301 my $self = shift (@_);
1302
1303 return $self->{'source_assoc_files'};
1304
1305
1306}
1307sub metadata_file {
1308 my $self = shift (@_);
1309 my ($real_filename, $filename) = @_;
1310
1311 push (@{$self->{'metadata_files'}},
1312 [$real_filename, $filename]);
1313}
1314
1315# used for writing out the archiveinf-doc info database, to list all the metadata files
1316sub get_meta_files {
1317 my $self = shift (@_);
1318
1319 return $self->{'metadata_files'};
1320}
1321
1322sub delete_section_assoc_files {
1323 my $self = shift (@_);
1324 my ($section) = @_;
1325
1326 my $i=0;
1327 while ($i < scalar (@{$self->{'associated_files'}})) {
1328 if (defined $self->{'associated_files'}->[$i]->[3] &&
1329 $self->{'associated_files'}->[$i]->[3] eq $section) {
1330 splice (@{$self->{'associated_files'}}, $i, 1);
1331 } else {
1332 $i++;
1333 }
1334 }
1335}
1336
1337sub delete_assoc_file {
1338 my $self = shift (@_);
1339 my ($assoc_filename) = @_;
1340
1341 my $i=0;
1342 while ($i < scalar (@{$self->{'associated_files'}})) {
1343 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1344 splice (@{$self->{'associated_files'}}, $i, 1);
1345 } else {
1346 $i++;
1347 }
1348 }
1349}
1350
1351sub reset_nextsection_ptr {
1352 my $self = shift (@_);
1353 my ($section) = @_;
1354
1355 my $section_ptr = $self->_lookup_section($section);
1356 $section_ptr->{'next_subsection'} = 1;
1357}
1358
13591;
Note: See TracBrowser for help on using the repository browser.