source: main/trunk/greenstone2/perllib/doc.pm@ 26540

Last change on this file since 26540 was 26540, checked in by davidb, 11 years ago

Removal of rogue debugging statement

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 37.7 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47our $OIDcount = 0;
48
49# rename_method can be 'url', 'none', 'base64'
50sub new {
51 my $class = shift (@_);
52 my ($source_filename, $doc_type, $rename_method) = @_;
53
54
55 my $self = bless {'associated_files'=>[],
56 'subsection_order'=>[],
57 'next_subsection'=>1,
58 'subsections'=>{},
59 'metadata'=>[],
60 'text'=>"",
61 'OIDtype'=>"hash"}, $class;
62
63 # used to set lastmodified here, but this can screw up the HASH ids, so
64 # the docsave processor now calls set_lastmodified
65
66 $self->set_source_path($source_filename);
67
68 if (defined $source_filename) {
69 $source_filename = &util::filename_within_collection($source_filename);
70 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
71 $self->set_source_filename ($source_filename, $rename_method);
72 }
73
74 $self->set_doc_type ($doc_type) if defined $doc_type;
75
76 return $self;
77}
78
79
80sub set_source_path
81{
82 my $self = shift @_;
83 my ($source_filename) = @_;
84
85 if (defined $source_filename) {
86 # On Windows the source_filename can be in terse DOS format
87 # e.g. test~1.txt
88
89 $self->{'terse_source_path'} = $source_filename;
90
91 if (-e $source_filename) {
92 # See if we can do better for Windows with a filename
93 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
94 require Win32;
95 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
96 }
97 else {
98 # For Unix-based systems, there is no difference between the two
99 $self->{'source_path'} = $source_filename;
100 }
101 }
102 else {
103 print STDERR "Warning: In doc::set_source_path(), file\n";
104 print STDERR " $source_filename\n";
105 print STDERR " does not exist\n";
106
107 # (default) Set it to whatever we were given
108 $self->{'source_path'} = $source_filename;
109 }
110 }
111 else {
112 # Previous code for setting source_path allowed for
113 # it to be undefined, so continue this practice
114 $self->{'terse_source_path'} = undef;
115 $self->{'source_path'} = undef;
116 }
117}
118
119
120sub get_source_path
121{
122 my $self = shift @_;
123
124 return $self->{'terse_source_path'};
125}
126
127# set lastmodified for OAI purposes, added by GRB, moved by kjdon
128sub set_oailastmodified {
129 my $self = shift (@_);
130
131 my $source_path = $self->{'terse_source_path'};
132
133 if (defined $source_path && (-e $source_path)) {
134 my $current_time = time;
135
136 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
137 $wday, $yday, $isdst) = localtime($current_time);
138
139 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
140
141 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
142 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
143 }
144}
145
146# no longer used for OAI purposes, since lastmodified is not what we want as the
147# Datestamp of a document. This doc metadata may be useful for general purposes.
148sub set_lastmodified {
149 my $self = shift (@_);
150
151 my $source_path = $self->{'terse_source_path'};
152
153 if (defined $source_path && (-e $source_path)) {
154
155 my $file_stat = stat($source_path);
156 my $mtime = $file_stat->mtime;
157 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
158 $wday, $yday, $isdst) = localtime($mtime);
159
160 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
161
162 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
163 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
164 }
165}
166
167# clone the $self object
168sub duplicate {
169 my $self = shift (@_);
170
171 my $newobj = {};
172
173 foreach my $k (keys %$self) {
174 $newobj->{$k} = &clone ($self->{$k});
175 }
176
177 bless $newobj, ref($self);
178 return $newobj;
179}
180
181sub clone {
182 my ($from) = @_;
183 my $type = ref ($from);
184
185 if ($type eq "HASH") {
186 my $to = {};
187 foreach my $key (keys %$from) {
188 $to->{$key} = &clone ($from->{$key});
189 }
190 return $to;
191 } elsif ($type eq "ARRAY") {
192 my $to = [];
193 foreach my $v (@$from) {
194 push (@$to, &clone ($v));
195 }
196 return $to;
197 } else {
198 return $from;
199 }
200}
201
202sub set_OIDtype {
203 my $self = shift (@_);
204 my ($type, $metadata) = @_;
205
206 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
207 $self->{'OIDtype'} = $type;
208 } else {
209 $self->{'OIDtype'} = "hash";
210 }
211
212 if ($type =~ /^assigned$/) {
213 if (defined $metadata) {
214 $self->{'OIDmetadata'} = $metadata;
215 } else {
216 $self->{'OIDmetadata'} = "dc.Identifier";
217 }
218 }
219}
220
221# rename_method can be 'url', 'none', 'base64'
222sub set_source_filename {
223 my $self = shift (@_);
224 my ($source_filename, $rename_method) = @_;
225
226 # Since the gsdlsourcefilename element goes into the doc.xml it has
227 # to be utf8. However, it should also *represent* the source filename
228 # (in the import directory) which may not be utf8 at all.
229 # For instance, if this meta element (gsdlsourcefilename) will be used
230 # by other applications that parse doc.xml in order to locate
231 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
232 # encode the real filename as this is a binary-to-text encoding meaning
233 # that the resulting string is ASCII (utf8). Decoding will give the original.
234
235# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
236
237 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
238 $source_filename = $self->encode_filename($source_filename, $rename_method);
239# my ($srcfilename,$dirname,$suffix)
240# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
241# print STDERR "-> $srcfilename -> ";
242# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
243# $source_filename = &util::filename_cat($dirname, $srcfilename);
244# print STDERR "$source_filename\n";
245
246 $self->set_utf8_metadata_element ($self->get_top_section(),
247 "gsdlsourcefilename",
248 $source_filename);
249}
250
251sub encode_filename {
252 my $self = shift (@_);
253 my ($source_filename, $rename_method) = @_;
254
255 my ($srcfilename,$dirname,$suffix)
256 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
257# print STDERR "-> $srcfilename -> ";
258 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
259 $source_filename = &util::filename_cat($dirname, $srcfilename);
260
261 return $source_filename;
262}
263
264sub set_converted_filename {
265 my $self = shift (@_);
266 my ($converted_filename) = @_;
267
268 # we know the converted filename is utf8
269 $self->set_utf8_metadata_element ($self->get_top_section(),
270 "gsdlconvertedfilename",
271 $converted_filename);
272}
273
274# returns the source_filename as it was provided
275sub get_unmodified_source_filename {
276 my $self = shift (@_);
277
278 return $self->{'terse_source_path'};
279}
280
281# returns the source_filename with whatever rename_method was given
282sub get_source_filename {
283 my $self = shift (@_);
284
285 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
286}
287
288
289
290# returns converted filename if available else returns source filename
291sub get_filename_for_hashing {
292 my $self = shift (@_);
293
294 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
295
296 if (!defined $filename) {
297 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
298 # if NULPlug processed file, then don't give a filename
299 if (defined $plugin_name && $plugin_name eq "NULPlug") {
300 $filename = undef;
301 } else { # returns the URL encoded source filename!
302 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
303 }
304 }
305
306 if (!&util::filename_is_absolute($filename)) {
307 $filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename);
308 }
309
310 return $filename;
311}
312
313sub set_doc_type {
314 my $self = shift (@_);
315 my ($doc_type) = @_;
316
317 $self->set_metadata_element ($self->get_top_section(),
318 "gsdldoctype",
319 $doc_type);
320}
321
322# returns the gsdldoctype as it was provided
323# the default of "indexed_doc" is used if no document
324# type was provided
325sub get_doc_type {
326 my $self = shift (@_);
327
328 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
329 return $doc_type if (defined $doc_type);
330 return "indexed_doc";
331}
332
333
334# look up the reference to the a particular section
335sub _lookup_section {
336 my $self = shift (@_);
337 my ($section) = @_;
338
339 my ($num);
340 my $sectionref = $self;
341
342 while (defined $section && $section ne "") {
343
344 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
345
346 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
347
348 $section = "" unless defined $section;
349
350
351 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
352 $sectionref = $sectionref->{'subsections'}->{$num};
353 } else {
354 return undef;
355 }
356 }
357
358 return $sectionref;
359}
360
361# calculate OID by hashing the contents of the document
362sub _calc_OID {
363 my $self = shift (@_);
364 my ($filename) = @_;
365
366
367 my $osexe = &util::get_os_exe();
368
369 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
370 $ENV{'GSDLOS'},"hashfile$osexe");
371
372 my $result = "NULL";
373
374
375 if (-e "$hashfile_exe") {
376# $result = `\"$hashfile_exe\" \"$filename\"`;
377# $result = `hashfile$osexe \"$filename\" 2>&1`;
378 $result = `hashfile$osexe \"$filename\"`;
379
380 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
381 } else {
382 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
383 }
384 return "HASH$result";
385}
386
387# methods dealing with OID, not groups of them.
388
389# if $OID is not provided one is calculated
390sub set_OID {
391 my $self = shift (@_);
392 my ($OID) = @_;
393
394 my $use_hash_oid = 0;
395 # if an OID wasn't provided calculate one
396 if (!defined $OID) {
397 $OID = "NULL";
398 if ($self->{'OIDtype'} =~ /^hash/) {
399 $use_hash_oid = 1;
400 } elsif ($self->{'OIDtype'} eq "incremental") {
401 $OID = "D" . $OIDcount;
402 $OIDcount ++;
403 } elsif ($self->{'OIDtype'} eq "filename") {
404 my $filename = $self->get_source_filename();
405 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
406 $OID = &util::tidy_up_oid($OID);
407 } elsif ($self->{'OIDtype'} eq "full_filename") {
408 my $source_filename = $self->get_source_filename();
409 my $dirsep = &util::get_os_dirsep();
410
411 $source_filename =~ s/^import$dirsep//;
412 $source_filename =~ s/$dirsep/-/g;
413 $source_filename =~ s/\./_/g;
414
415 $OID = $source_filename;
416 $OID = &util::tidy_up_oid($OID);
417 } elsif ($self->{'OIDtype'} eq "dirname") {
418 $OID = 'J';
419 my $filename = $self->get_source_filename();
420 if (defined($filename)) { # && -e $filename) {
421 $OID = &File::Basename::dirname($filename);
422 if (defined $OID) {
423 $OID = 'J'.&File::Basename::basename($OID);
424 $OID = &util::tidy_up_oid($OID);
425 } else {
426 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
427 $use_hash_oid = 1;
428 }
429 } else {
430 print STDERR "Failed to find filename, generating hash id\n";
431 $use_hash_oid = 1;
432 }
433
434 } elsif ($self->{'OIDtype'} eq "assigned") {
435 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
436 if (defined $identifier && $identifier ne "") {
437 $OID = $identifier;
438 $OID = &util::tidy_up_oid($OID);
439 } else {
440 # need a hash id
441 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
442 $use_hash_oid = 1;
443 }
444
445 } else {
446 $use_hash_oid = 1;
447 }
448
449 if ($use_hash_oid) {
450 my $hash_on_file = 1;
451 my $hash_on_ga_xml = 0;
452
453 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
454 $hash_on_file = 0;
455 $hash_on_ga_xml = 1;
456 }
457
458 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
459 $hash_on_file = 0;
460 $hash_on_ga_xml = 0;
461
462 my $source_filename = $self->get_source_filename();
463 my $dirsep = &util::get_os_dirsep();
464
465 $source_filename =~ s/^import$dirsep//;
466 $source_filename =~ s/$dirsep/-/g;
467 $source_filename =~ s/\./_/g;
468
469 # If the filename is very short then (handled naively)
470 # this can cause conjestion in the hash-values
471 # computed, leading documents sharing the same leading
472 # Hex values in the computed has.
473 #
474 # The solution taken here is to replace the name of
475 # the file name a sufficient number of times (up to
476 # the character limit defined in 'rep_limit' and
477 # make that the content that is hashed on
478
479 # *** Think twice before changing the following value
480 # as it will break backward compatability of computed
481 # document HASH values
482
483 my $rep_limit = 256;
484 my $hash_content = undef;
485
486 if (length($source_filename)<$rep_limit) {
487 my $rep_string = "$source_filename|";
488 my $rs_len = length($rep_string);
489
490 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
491
492 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
493 }
494 else {
495 $hash_content = $source_filename;
496 }
497
498 my $filename = &util::get_tmp_filename();
499 if (!open (OUTFILE, ">:utf8", $filename)) {
500 print STDERR "doc::set_OID could not write to $filename\n";
501 } else {
502 print OUTFILE $hash_content;
503 close (OUTFILE);
504 }
505 $OID = $self->_calc_OID ($filename);
506
507 &util::rm ($filename);
508 }
509
510 if ($hash_on_file) {
511 # "hash" OID - feed file to hashfile.exe
512 my $filename = $self->get_filename_for_hashing();
513
514 # -z: don't want to hash on the file if it is zero size
515 if (defined($filename) && -e $filename && !-z $filename) {
516 $OID = $self->_calc_OID ($filename);
517 } else {
518 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
519 }
520 }
521
522 if ($hash_on_ga_xml) {
523 # In addition being asked to explicity calculate the has based on the GA file,
524 # can also end up coming into this block is doing 'hash_on_file' but the file
525 # itself is of zero bytes (as could be the case with 'doc.nul' file
526
527 my $filename = &util::get_tmp_filename();
528 if (!open (OUTFILE, ">:utf8", $filename)) {
529 print STDERR "doc::set_OID could not write to $filename\n";
530 } else {
531 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
532 print OUTFILE $doc_text;
533 close (OUTFILE);
534 }
535 $OID = $self->_calc_OID ($filename);
536 &util::rm ($filename);
537 }
538 }
539 }
540 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
541}
542
543# this uses hashdoc (embedded c thingy) which is faster but still
544# needs a little work to be suffiently stable
545sub ___set_OID {
546 my $self = shift (@_);
547 my ($OID) = @_;
548
549 # if an OID wasn't provided then calculate hash value based on document
550 if (!defined $OID)
551 {
552 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
553 my $hash_len = length($hash_text);
554
555 $OID = &hashdoc::buffer($hash_text,$hash_len);
556 }
557
558 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
559}
560
561# returns the OID for this document
562sub get_OID {
563 my $self = shift (@_);
564 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
565 return $OID if (defined $OID);
566 return "NULL";
567}
568
569sub delete_OID {
570 my $self = shift (@_);
571
572 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
573}
574
575
576# methods for manipulating section names
577
578# returns the name of the top-most section (the top
579# level of the document
580sub get_top_section {
581 my $self = shift (@_);
582
583 return "";
584}
585
586# returns a section
587sub get_parent_section {
588 my $self = shift (@_);
589 my ($section) = @_;
590
591 $section =~ s/(^|\.)\d+$//;
592
593 return $section;
594}
595
596# returns the first child section (or the end child
597# if there isn't any)
598sub get_begin_child {
599 my $self = shift (@_);
600 my ($section) = @_;
601
602 my $section_ptr = $self->_lookup_section($section);
603 return "" unless defined $section_ptr;
604
605 if (defined $section_ptr->{'subsection_order'}->[0]) {
606 return "$section.$section_ptr->{'subsection_order'}->[0]";
607 }
608
609 return $self->get_end_child ($section);
610}
611
612# returns the next child of a parent section
613sub get_next_child {
614 my $self = shift (@_);
615 my ($section) = @_;
616
617 my $parent_section = $self->get_parent_section($section);
618 my $parent_section_ptr = $self->_lookup_section($parent_section);
619 return undef unless defined $parent_section_ptr;
620
621 my ($section_num) = $section =~ /(\d+)$/;
622 return undef unless defined $section_num;
623
624 my $i = 0;
625 my $section_order = $parent_section_ptr->{'subsection_order'};
626 while ($i < scalar(@$section_order)) {
627 last if $section_order->[$i] eq $section_num;
628 $i++;
629 }
630
631 $i++; # the next child
632 if ($i < scalar(@$section_order)) {
633 return $section_order->[$i] if $parent_section eq "";
634 return "$parent_section.$section_order->[$i]";
635 }
636
637 # no more sections in this level
638 return undef;
639}
640
641# returns a reference to a list of children
642sub get_children {
643 my $self = shift (@_);
644 my ($section) = @_;
645
646 my $section_ptr = $self->_lookup_section($section);
647 return [] unless defined $section_ptr;
648
649 my @children = @{$section_ptr->{'subsection_order'}};
650
651 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
652 return \@children;
653}
654
655# returns the child section one past the last one (which
656# is coded as "0")
657sub get_end_child {
658 my $self = shift (@_);
659 my ($section) = @_;
660
661 return $section . ".0" unless $section eq "";
662 return "0";
663}
664
665# returns the next section in book order
666sub get_next_section {
667 my $self = shift (@_);
668 my ($section) = @_;
669
670 return undef unless defined $section;
671
672 my $section_ptr = $self->_lookup_section($section);
673 return undef unless defined $section_ptr;
674
675 # first try to find first child
676 if (defined $section_ptr->{'subsection_order'}->[0]) {
677 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
678 return "$section.$section_ptr->{'subsection_order'}->[0]";
679 }
680
681 do {
682 # try to find sibling
683 my $next_child = $self->get_next_child ($section);
684 return $next_child if (defined $next_child);
685
686 # move up one level
687 $section = $self->get_parent_section ($section);
688 } while $section =~ /\d/;
689
690 return undef;
691}
692
693sub is_leaf_section {
694 my $self = shift (@_);
695 my ($section) = @_;
696
697 my $section_ptr = $self->_lookup_section($section);
698 return 1 unless defined $section_ptr;
699
700 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
701}
702
703# methods for dealing with sections
704
705# returns the name of the inserted section
706sub insert_section {
707 my $self = shift (@_);
708 my ($before_section) = @_;
709
710 # get the child to insert before and its parent section
711 my $parent_section = "";
712 my $before_child = "0";
713 my @before_section = split (/\./, $before_section);
714 if (scalar(@before_section) > 0) {
715 $before_child = pop (@before_section);
716 $parent_section = join (".", @before_section);
717 }
718
719 my $parent_section_ptr = $self->_lookup_section($parent_section);
720 if (!defined $parent_section_ptr) {
721 print STDERR "doc::insert_section couldn't find parent section " .
722 "$parent_section\n";
723 return;
724 }
725
726 # get the next section number
727 my $section_num = $parent_section_ptr->{'next_subsection'}++;
728
729 my $i = 0;
730 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
731 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
732 $i++;
733 }
734
735 # insert the section number into the order list
736 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
737
738 # add this section to the parent section
739 my $section_ptr = {'subsection_order'=>[],
740 'next_subsection'=>1,
741 'subsections'=>{},
742 'metadata'=>[],
743 'text'=>""};
744 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
745
746 # work out the full section number
747 my $section = $parent_section;
748 $section .= "." unless $section eq "";
749 $section .= $section_num;
750
751 return $section;
752}
753
754# creates a pre-named section
755sub create_named_section {
756 my $self = shift (@_);
757 my ($mastersection) = @_;
758
759 my ($num);
760 my $section = $mastersection;
761 my $sectionref = $self;
762
763 while ($section ne "") {
764 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
765 $num =~ s/^0+(\d)/$1/; # remove leading 0s
766 $section = "" unless defined $section;
767
768 if (defined $num) {
769 if (!defined $sectionref->{'subsections'}->{$num}) {
770 push (@{$sectionref->{'subsection_order'}}, $num);
771 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
772 'next_subsection'=>1,
773 'subsections'=>{},
774 'metadata'=>[],
775 'text'=>""};
776 if ($num >= $sectionref->{'next_subsection'}) {
777 $sectionref->{'next_subsection'} = $num + 1;
778 }
779 }
780 $sectionref = $sectionref->{'subsections'}->{$num};
781
782 } else {
783 print STDERR "doc::create_named_section couldn't create section ";
784 print STDERR "$mastersection\n";
785 last;
786 }
787 }
788}
789
790# returns a reference to a list of subsections
791sub list_subsections {
792 my $self = shift (@_);
793 my ($section) = @_;
794
795 my $section_ptr = $self->_lookup_section ($section);
796 if (!defined $section_ptr) {
797 print STDERR "doc::list_subsections couldn't find section $section\n";
798 return [];
799 }
800
801 return [@{$section_ptr->{'subsection_order'}}];
802}
803
804sub delete_section {
805 my $self = shift (@_);
806 my ($section) = @_;
807
808# my $section_ptr = {'subsection_order'=>[],
809# 'next_subsection'=>1,
810# 'subsections'=>{},
811# 'metadata'=>[],
812# 'text'=>""};
813
814 # if this is the top section reset everything
815 if ($section eq "") {
816 $self->{'subsection_order'} = [];
817 $self->{'subsections'} = {};
818 $self->{'metadata'} = [];
819 $self->{'text'} = "";
820 return;
821 }
822
823 # find the parent of the section to delete
824 my $parent_section = "";
825 my $child = "0";
826 my @section = split (/\./, $section);
827 if (scalar(@section) > 0) {
828 $child = pop (@section);
829 $parent_section = join (".", @section);
830 }
831
832 my $parent_section_ptr = $self->_lookup_section($parent_section);
833 if (!defined $parent_section_ptr) {
834 print STDERR "doc::delete_section couldn't find parent section " .
835 "$parent_section\n";
836 return;
837 }
838
839 # remove this section from the subsection_order list
840 my $i = 0;
841 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
842 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
843 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
844 last;
845 }
846 $i++;
847 }
848
849 # remove this section from the subsection hash
850 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
851 undef $parent_section_ptr->{'subsections'}->{$child};
852 }
853}
854
855#--
856# methods for dealing with metadata
857
858# set_metadata_element and get_metadata_element are for metadata
859# which should only have one value. add_meta_data and get_metadata
860# are for metadata which can have more than one value.
861
862# returns the first metadata value which matches field
863
864# This version of get metadata element works much like the one above,
865# except it allows for the namespace portion of a metadata element to
866# be ignored, thus if you are searching for dc.Title, the first piece
867# of matching metadata ending with the name Title (once any namespace
868# is removed) would be returned.
869# 28-11-2003 John Thompson
870sub get_metadata_element {
871 my $self = shift (@_);
872 my ($section, $field, $ignore_namespace) = @_;
873 my ($data);
874
875 $ignore_namespace = 0 unless defined $ignore_namespace;
876
877 my $section_ptr = $self->_lookup_section($section);
878 if (!defined $section_ptr) {
879 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
880 return;
881 }
882
883 # Remove any namespace if we are being told to ignore them
884 if($ignore_namespace) {
885 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
886 }
887
888 foreach $data (@{$section_ptr->{'metadata'}}) {
889
890 my $data_name = $data->[0];
891
892 # Remove any namespace if we are being told to ignore them
893 if($ignore_namespace) {
894 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
895 }
896 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
897 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
898
899 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
900 }
901
902 return undef; # was not found
903}
904
905# returns a list of the form [value1, value2, ...]
906sub get_metadata {
907 my $self = shift (@_);
908 my ($section, $field, $ignore_namespace) = @_;
909 my ($data);
910
911 $ignore_namespace = 0 unless defined $ignore_namespace;
912
913 my $section_ptr = $self->_lookup_section($section);
914 if (!defined $section_ptr) {
915 print STDERR "doc::get_metadata couldn't find section ",
916 $section, "\n";
917 return;
918 }
919
920 # Remove any namespace if we are being told to ignore them
921 if($ignore_namespace) {
922 $field =~ s/^.*\.//;
923 }
924
925 my @metadata = ();
926 foreach $data (@{$section_ptr->{'metadata'}}) {
927
928 my $data_name = $data->[0];
929
930 # Remove any namespace if we are being told to ignore them
931 if($ignore_namespace) {
932 $data_name =~ s/^.*\.//;
933 }
934 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
935 $data_name =~ s/^ex\.([^.]+)$/$1/;
936
937 push (@metadata, $data->[1]) if ($data_name eq $field);
938 }
939
940 return \@metadata;
941}
942
943sub get_metadata_hashmap {
944 my $self = shift (@_);
945 my ($section, $opt_namespace) = @_;
946
947 my $section_ptr = $self->_lookup_section($section);
948 if (!defined $section_ptr) {
949 print STDERR "doc::get_metadata couldn't find section ",
950 $section, "\n";
951 return;
952 }
953
954 my $metadata_hashmap = {};
955 foreach my $data (@{$section_ptr->{'metadata'}}) {
956 my $metaname = $data->[0];
957
958 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
959 if (!defined $metadata_hashmap->{$metaname}) {
960 $metadata_hashmap->{$metaname} = [];
961 }
962 my $metaval_list = $metadata_hashmap->{$metaname};
963 push(@$metaval_list, $data->[1]);
964 }
965 }
966
967 return $metadata_hashmap;
968}
969
970# returns a list of the form [[field,value],[field,value],...]
971sub get_all_metadata {
972 my $self = shift (@_);
973 my ($section) = @_;
974
975 my $section_ptr = $self->_lookup_section($section);
976 if (!defined $section_ptr) {
977 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
978 return;
979 }
980
981 return $section_ptr->{'metadata'};
982}
983
984# $value is optional
985sub delete_metadata {
986 my $self = shift (@_);
987 my ($section, $field, $value) = @_;
988
989 my $section_ptr = $self->_lookup_section($section);
990 if (!defined $section_ptr) {
991 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
992 return;
993 }
994
995 my $i = 0;
996 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
997 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
998 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
999 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1000 } else {
1001 $i++;
1002 }
1003 }
1004}
1005
1006sub delete_all_metadata {
1007 my $self = shift (@_);
1008 my ($section) = @_;
1009
1010 my $section_ptr = $self->_lookup_section($section);
1011 if (!defined $section_ptr) {
1012 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1013 return;
1014 }
1015
1016 $section_ptr->{'metadata'} = [];
1017}
1018
1019sub set_metadata_element {
1020 my $self = shift (@_);
1021 my ($section, $field, $value) = @_;
1022
1023 $self->set_utf8_metadata_element ($section, $field,
1024 &unicode::ascii2utf8(\$value));
1025}
1026
1027# set_utf8_metadata_element assumes the text has already been
1028# converted to the UTF-8 encoding.
1029sub set_utf8_metadata_element {
1030 my $self = shift (@_);
1031 my ($section, $field, $value) = @_;
1032
1033 $self->delete_metadata ($section, $field);
1034 $self->add_utf8_metadata ($section, $field, $value);
1035}
1036
1037
1038# add_metadata assumes the text is in (extended) ascii form. For
1039# text which has already been converted to the UTF-8 format use
1040# add_utf8_metadata.
1041sub add_metadata {
1042 my $self = shift (@_);
1043 my ($section, $field, $value) = @_;
1044
1045 $self->add_utf8_metadata ($section, $field,
1046 &unicode::ascii2utf8(\$value));
1047}
1048
1049sub add_utf8_metadata {
1050 my $self = shift (@_);
1051 my ($section, $field, $value) = @_;
1052
1053 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1054 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1055 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1056
1057 my $section_ptr = $self->_lookup_section($section);
1058 if (!defined $section_ptr) {
1059 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1060 return;
1061 }
1062 if (!defined $value) {
1063 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1064 return;
1065 }
1066 if (!defined $field) {
1067 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1068 return;
1069 }
1070
1071 #print STDERR "###$field=$value\n";
1072
1073 # For now, supress this check. Given that text data read in is now
1074 # Unicode aware, then the following block of code can (ironically enough)
1075 # cause our unicode compliant string to be re-encoded (leading to
1076 # a double-encoded UTF-8 string, which we definitely don't want!).
1077
1078
1079 # double check that the value is utf-8
1080 # if (!&unicode::check_is_utf8($value)) {
1081 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1082 # &unicode::ensure_utf8(\$value);
1083 # print STDERR " Tried converting to utf8: $value\n";
1084 # }
1085
1086 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1087 if($field eq "Latitude" || $field eq "Longitude")
1088 {
1089 my $direction;
1090 if($value =~ m/^-/)
1091 {
1092 $direction = ($field eq "Latitude") ? "S" : "W";
1093 }
1094 else
1095 {
1096 $direction = ($field eq "Latitude") ? "N" : "E";
1097 }
1098
1099 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1100 if(defined $beforeDec && defined $afterDec)
1101 {
1102 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1103 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1104
1105 for(my $i = 2; $i <= 4; $i++)
1106 {
1107 if(length($afterDec) >= $i)
1108 {
1109 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1110 }
1111 }
1112
1113 #Only add the metadata if it has not already been added
1114 my $metaMap = $self->get_metadata_hashmap($section);
1115 }
1116 }
1117
1118 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1119}
1120
1121
1122# methods for dealing with text
1123
1124# returns the text for a section
1125sub get_text {
1126 my $self = shift (@_);
1127 my ($section) = @_;
1128
1129 my $section_ptr = $self->_lookup_section($section);
1130 if (!defined $section_ptr) {
1131 print STDERR "doc::get_text couldn't find section " .
1132 "$section\n";
1133 return "";
1134 }
1135
1136 return $section_ptr->{'text'};
1137}
1138
1139# returns the (utf-8 encoded) length of the text for a section
1140sub get_text_length {
1141 my $self = shift (@_);
1142 my ($section) = @_;
1143
1144 my $section_ptr = $self->_lookup_section($section);
1145 if (!defined $section_ptr) {
1146 print STDERR "doc::get_text_length couldn't find section " .
1147 "$section\n";
1148 return 0;
1149 }
1150
1151 return length ($section_ptr->{'text'});
1152}
1153
1154# returns the total length for all the sections
1155sub get_total_text_length {
1156 my $self = shift (@_);
1157
1158 my $section = $self->get_top_section();
1159 my $length = 0;
1160 while (defined $section) {
1161 $length += $self->get_text_length($section);
1162 $section = $self->get_next_section($section);
1163 }
1164 return $length;
1165}
1166
1167sub delete_text {
1168 my $self = shift (@_);
1169 my ($section) = @_;
1170
1171 my $section_ptr = $self->_lookup_section($section);
1172 if (!defined $section_ptr) {
1173 print STDERR "doc::delete_text couldn't find section " .
1174 "$section\n";
1175 return;
1176 }
1177
1178 $section_ptr->{'text'} = "";
1179}
1180
1181# add_text assumes the text is in (extended) ascii form. For
1182# text which has been already converted to the UTF-8 format
1183# use add_utf8_text.
1184sub add_text {
1185 my $self = shift (@_);
1186 my ($section, $text) = @_;
1187
1188 # convert the text to UTF-8 encoded unicode characters
1189 # and add the text
1190 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1191}
1192
1193
1194# add_utf8_text assumes the text to be added has already
1195# been converted to the UTF-8 encoding. For ascii text use
1196# add_text
1197sub add_utf8_text {
1198 my $self = shift (@_);
1199 my ($section, $text) = @_;
1200
1201 my $section_ptr = $self->_lookup_section($section);
1202 if (!defined $section_ptr) {
1203 print STDERR "doc::add_utf8_text couldn't find section " .
1204 "$section\n";
1205 return;
1206 }
1207
1208 $section_ptr->{'text'} .= $text;
1209}
1210
1211# returns the Source meta, which is the utf8 filename generated.
1212# Added a separate method here for convenience
1213sub get_source {
1214 my $self = shift (@_);
1215 return $self->get_metadata_element ($self->get_top_section(), "Source");
1216}
1217
1218# returns the SourceFile meta, which is the url reference to the URL-encoded
1219# version of Source (the utf8 filename). Added a separate method here for convenience
1220sub get_sourcefile {
1221 my $self = shift (@_);
1222 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1223}
1224
1225# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1226# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1227sub get_assocfile_from_sourcefile {
1228 my $self = shift (@_);
1229
1230 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1231 my $top_section = $self->get_top_section();
1232 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1233
1234 # get the actual filename as it exists on the filesystem which this url refers to
1235 $source_file = &unicode::url_to_filename($source_file);
1236 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1237 return $assocfilename;
1238}
1239
1240# methods for dealing with associated files
1241
1242# a file is associated with a document, NOT a section.
1243# if section is defined it is noted in the data structure
1244# only so that files associated from a particular section
1245# may be removed later (using delete_section_assoc_files)
1246sub associate_file {
1247 my $self = shift (@_);
1248 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1249 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1250
1251 # remove all associated files with the same name
1252 $self->delete_assoc_file ($assoc_filename);
1253
1254 # Too harsh a requirement
1255 # Definitely get HTML docs, for example, with some missing
1256 # support files
1257# if (!&util::fd_exists($real_filename)) {
1258# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1259# exit -1;
1260# }
1261
1262# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1263# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1264## my $utf8_filename = Encode::encode("utf8",$filename);
1265
1266 push (@{$self->{'associated_files'}},
1267 [$real_filename, $assoc_filename, $mime_type, $section]);
1268}
1269
1270# returns a list of associated files in the form
1271# [[real_filename, assoc_filename, mimetype], ...]
1272sub get_assoc_files {
1273 my $self = shift (@_);
1274
1275 return $self->{'associated_files'};
1276}
1277
1278# the following two methods used to keep track of original associated files
1279# for incremental building. eg a txt file used by an item file does not end
1280# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1281# file for incremental build
1282sub associate_source_file {
1283 my $self = shift (@_);
1284 my ($full_filename) = @_;
1285
1286 push (@{$self->{'source_assoc_files'}}, $full_filename);
1287
1288}
1289
1290sub get_source_assoc_files {
1291 my $self = shift (@_);
1292
1293 return $self->{'source_assoc_files'};
1294
1295
1296}
1297sub metadata_file {
1298 my $self = shift (@_);
1299 my ($real_filename, $filename) = @_;
1300
1301 push (@{$self->{'metadata_files'}},
1302 [$real_filename, $filename]);
1303}
1304
1305# used for writing out the archiveinf-doc info database, to list all the metadata files
1306sub get_meta_files {
1307 my $self = shift (@_);
1308
1309 return $self->{'metadata_files'};
1310}
1311
1312sub delete_section_assoc_files {
1313 my $self = shift (@_);
1314 my ($section) = @_;
1315
1316 my $i=0;
1317 while ($i < scalar (@{$self->{'associated_files'}})) {
1318 if (defined $self->{'associated_files'}->[$i]->[3] &&
1319 $self->{'associated_files'}->[$i]->[3] eq $section) {
1320 splice (@{$self->{'associated_files'}}, $i, 1);
1321 } else {
1322 $i++;
1323 }
1324 }
1325}
1326
1327sub delete_assoc_file {
1328 my $self = shift (@_);
1329 my ($assoc_filename) = @_;
1330
1331 my $i=0;
1332 while ($i < scalar (@{$self->{'associated_files'}})) {
1333 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1334 splice (@{$self->{'associated_files'}}, $i, 1);
1335 } else {
1336 $i++;
1337 }
1338 }
1339}
1340
1341sub reset_nextsection_ptr {
1342 my $self = shift (@_);
1343 my ($section) = @_;
1344
1345 my $section_ptr = $self->_lookup_section($section);
1346 $section_ptr->{'next_subsection'} = 1;
1347}
1348
13491;
Note: See TracBrowser for help on using the repository browser.