source: main/trunk/greenstone2/perllib/doc.pm@ 26850

Last change on this file since 26850 was 26850, checked in by davidb, 11 years ago

Additional tweak to handling Latitude and Longitude added, so versions that use metadata set prefixes also get fed in to the 'ex' version, so the runtime system can see these values

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 38.0 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47our $OIDcount = 0;
48
49# rename_method can be 'url', 'none', 'base64'
50sub new {
51 my $class = shift (@_);
52 my ($source_filename, $doc_type, $rename_method) = @_;
53
54
55 my $self = bless {'associated_files'=>[],
56 'subsection_order'=>[],
57 'next_subsection'=>1,
58 'subsections'=>{},
59 'metadata'=>[],
60 'text'=>"",
61 'OIDtype'=>"hash"}, $class;
62
63 # used to set lastmodified here, but this can screw up the HASH ids, so
64 # the docsave processor now calls set_lastmodified
65
66 $self->set_source_path($source_filename);
67
68 if (defined $source_filename) {
69 $source_filename = &util::filename_within_collection($source_filename);
70 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
71 $self->set_source_filename ($source_filename, $rename_method);
72 }
73
74 $self->set_doc_type ($doc_type) if defined $doc_type;
75
76 return $self;
77}
78
79
80sub set_source_path
81{
82 my $self = shift @_;
83 my ($source_filename) = @_;
84
85 if (defined $source_filename) {
86 # On Windows the source_filename can be in terse DOS format
87 # e.g. test~1.txt
88
89 $self->{'terse_source_path'} = $source_filename;
90
91 if (-e $source_filename) {
92 # See if we can do better for Windows with a filename
93 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
94 require Win32;
95 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
96 }
97 else {
98 # For Unix-based systems, there is no difference between the two
99 $self->{'source_path'} = $source_filename;
100 }
101 }
102 else {
103 print STDERR "Warning: In doc::set_source_path(), file\n";
104 print STDERR " $source_filename\n";
105 print STDERR " does not exist\n";
106
107 # (default) Set it to whatever we were given
108 $self->{'source_path'} = $source_filename;
109 }
110 }
111 else {
112 # Previous code for setting source_path allowed for
113 # it to be undefined, so continue this practice
114 $self->{'terse_source_path'} = undef;
115 $self->{'source_path'} = undef;
116 }
117}
118
119
120sub get_source_path
121{
122 my $self = shift @_;
123
124 return $self->{'terse_source_path'};
125}
126
127# set lastmodified for OAI purposes, added by GRB, moved by kjdon
128sub set_oailastmodified {
129 my $self = shift (@_);
130
131 my $source_path = $self->{'terse_source_path'};
132
133 if (defined $source_path && (-e $source_path)) {
134 my $current_time = time;
135
136 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
137 $wday, $yday, $isdst) = localtime($current_time);
138
139 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
140
141 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
142 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
143 }
144}
145
146# no longer used for OAI purposes, since lastmodified is not what we want as the
147# Datestamp of a document. This doc metadata may be useful for general purposes.
148sub set_lastmodified {
149 my $self = shift (@_);
150
151 my $source_path = $self->{'terse_source_path'};
152
153 if (defined $source_path && (-e $source_path)) {
154
155 my $file_stat = stat($source_path);
156 my $mtime = $file_stat->mtime;
157 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
158 $wday, $yday, $isdst) = localtime($mtime);
159
160 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
161
162 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
163 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
164 }
165}
166
167# clone the $self object
168sub duplicate {
169 my $self = shift (@_);
170
171 my $newobj = {};
172
173 foreach my $k (keys %$self) {
174 $newobj->{$k} = &clone ($self->{$k});
175 }
176
177 bless $newobj, ref($self);
178 return $newobj;
179}
180
181sub clone {
182 my ($from) = @_;
183 my $type = ref ($from);
184
185 if ($type eq "HASH") {
186 my $to = {};
187 foreach my $key (keys %$from) {
188 $to->{$key} = &clone ($from->{$key});
189 }
190 return $to;
191 } elsif ($type eq "ARRAY") {
192 my $to = [];
193 foreach my $v (@$from) {
194 push (@$to, &clone ($v));
195 }
196 return $to;
197 } else {
198 return $from;
199 }
200}
201
202sub set_OIDtype {
203 my $self = shift (@_);
204 my ($type, $metadata) = @_;
205
206 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
207 $self->{'OIDtype'} = $type;
208 } else {
209 $self->{'OIDtype'} = "hash";
210 }
211
212 if ($type =~ /^assigned$/) {
213 if (defined $metadata) {
214 $self->{'OIDmetadata'} = $metadata;
215 } else {
216 $self->{'OIDmetadata'} = "dc.Identifier";
217 }
218 }
219}
220
221# rename_method can be 'url', 'none', 'base64'
222sub set_source_filename {
223 my $self = shift (@_);
224 my ($source_filename, $rename_method) = @_;
225
226 # Since the gsdlsourcefilename element goes into the doc.xml it has
227 # to be utf8. However, it should also *represent* the source filename
228 # (in the import directory) which may not be utf8 at all.
229 # For instance, if this meta element (gsdlsourcefilename) will be used
230 # by other applications that parse doc.xml in order to locate
231 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
232 # encode the real filename as this is a binary-to-text encoding meaning
233 # that the resulting string is ASCII (utf8). Decoding will give the original.
234
235# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
236
237 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
238 $source_filename = $self->encode_filename($source_filename, $rename_method);
239# my ($srcfilename,$dirname,$suffix)
240# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
241# print STDERR "-> $srcfilename -> ";
242# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
243# $source_filename = &util::filename_cat($dirname, $srcfilename);
244# print STDERR "$source_filename\n";
245
246 $self->set_utf8_metadata_element ($self->get_top_section(),
247 "gsdlsourcefilename",
248 $source_filename);
249}
250
251sub encode_filename {
252 my $self = shift (@_);
253 my ($source_filename, $rename_method) = @_;
254
255 my ($srcfilename,$dirname,$suffix)
256 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
257# print STDERR "-> $srcfilename -> ";
258 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
259 $source_filename = &util::filename_cat($dirname, $srcfilename);
260
261 return $source_filename;
262}
263
264sub set_converted_filename {
265 my $self = shift (@_);
266 my ($converted_filename) = @_;
267
268 # we know the converted filename is utf8
269 $self->set_utf8_metadata_element ($self->get_top_section(),
270 "gsdlconvertedfilename",
271 $converted_filename);
272}
273
274# returns the source_filename as it was provided
275sub get_unmodified_source_filename {
276 my $self = shift (@_);
277
278 return $self->{'terse_source_path'};
279}
280
281# returns the source_filename with whatever rename_method was given
282sub get_source_filename {
283 my $self = shift (@_);
284
285 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
286}
287
288
289
290# returns converted filename if available else returns source filename
291sub get_filename_for_hashing {
292 my $self = shift (@_);
293
294 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
295
296 if (!defined $filename) {
297 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
298 # if NULPlug processed file, then don't give a filename
299 if (defined $plugin_name && $plugin_name eq "NULPlug") {
300 $filename = undef;
301 } else { # returns the URL encoded source filename!
302 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
303 }
304 }
305
306 if (!&util::filename_is_absolute($filename)) {
307 $filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename);
308 }
309
310 return $filename;
311}
312
313sub set_doc_type {
314 my $self = shift (@_);
315 my ($doc_type) = @_;
316
317 $self->set_metadata_element ($self->get_top_section(),
318 "gsdldoctype",
319 $doc_type);
320}
321
322# returns the gsdldoctype as it was provided
323# the default of "indexed_doc" is used if no document
324# type was provided
325sub get_doc_type {
326 my $self = shift (@_);
327
328 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
329 return $doc_type if (defined $doc_type);
330 return "indexed_doc";
331}
332
333
334# look up the reference to the a particular section
335sub _lookup_section {
336 my $self = shift (@_);
337 my ($section) = @_;
338
339 my ($num);
340 my $sectionref = $self;
341
342 while (defined $section && $section ne "") {
343
344 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
345
346 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
347
348 $section = "" unless defined $section;
349
350
351 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
352 $sectionref = $sectionref->{'subsections'}->{$num};
353 } else {
354 return undef;
355 }
356 }
357
358 return $sectionref;
359}
360
361# calculate OID by hashing the contents of the document
362sub _calc_OID {
363 my $self = shift (@_);
364 my ($filename) = @_;
365
366
367 my $osexe = &util::get_os_exe();
368
369 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
370 $ENV{'GSDLOS'},"hashfile$osexe");
371
372 my $result = "NULL";
373
374
375 if (-e "$hashfile_exe") {
376# $result = `\"$hashfile_exe\" \"$filename\"`;
377# $result = `hashfile$osexe \"$filename\" 2>&1`;
378 $result = `hashfile$osexe \"$filename\"`;
379
380 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
381 } else {
382 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
383 }
384 return "HASH$result";
385}
386
387# methods dealing with OID, not groups of them.
388
389# if $OID is not provided one is calculated
390sub set_OID {
391 my $self = shift (@_);
392 my ($OID) = @_;
393
394 my $use_hash_oid = 0;
395 # if an OID wasn't provided calculate one
396 if (!defined $OID) {
397 $OID = "NULL";
398 if ($self->{'OIDtype'} =~ /^hash/) {
399 $use_hash_oid = 1;
400 } elsif ($self->{'OIDtype'} eq "incremental") {
401 $OID = "D" . $OIDcount;
402 $OIDcount ++;
403 } elsif ($self->{'OIDtype'} eq "filename") {
404 my $filename = $self->get_source_filename();
405 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
406 $OID = &util::tidy_up_oid($OID);
407 } elsif ($self->{'OIDtype'} eq "full_filename") {
408 my $source_filename = $self->get_source_filename();
409 my $dirsep = &util::get_os_dirsep();
410
411 $source_filename =~ s/^import$dirsep//;
412 $source_filename =~ s/$dirsep/-/g;
413 $source_filename =~ s/\./_/g;
414
415 $OID = $source_filename;
416 $OID = &util::tidy_up_oid($OID);
417 } elsif ($self->{'OIDtype'} eq "dirname") {
418 $OID = 'J';
419 my $filename = $self->get_source_filename();
420 if (defined($filename)) { # && -e $filename) {
421 $OID = &File::Basename::dirname($filename);
422 if (defined $OID) {
423 $OID = 'J'.&File::Basename::basename($OID);
424 $OID = &util::tidy_up_oid($OID);
425 } else {
426 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
427 $use_hash_oid = 1;
428 }
429 } else {
430 print STDERR "Failed to find filename, generating hash id\n";
431 $use_hash_oid = 1;
432 }
433
434 } elsif ($self->{'OIDtype'} eq "assigned") {
435 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
436 if (defined $identifier && $identifier ne "") {
437 $OID = $identifier;
438 $OID = &util::tidy_up_oid($OID);
439 } else {
440 # need a hash id
441 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
442 $use_hash_oid = 1;
443 }
444
445 } else {
446 $use_hash_oid = 1;
447 }
448
449 if ($use_hash_oid) {
450 my $hash_on_file = 1;
451 my $hash_on_ga_xml = 0;
452
453 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
454 $hash_on_file = 0;
455 $hash_on_ga_xml = 1;
456 }
457
458 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
459 $hash_on_file = 0;
460 $hash_on_ga_xml = 0;
461
462 my $source_filename = $self->get_source_filename();
463 my $dirsep = &util::get_os_dirsep();
464
465 $source_filename =~ s/^import$dirsep//;
466 $source_filename =~ s/$dirsep/-/g;
467 $source_filename =~ s/\./_/g;
468
469 # If the filename is very short then (handled naively)
470 # this can cause conjestion in the hash-values
471 # computed, leading documents sharing the same leading
472 # Hex values in the computed has.
473 #
474 # The solution taken here is to replace the name of
475 # the file name a sufficient number of times (up to
476 # the character limit defined in 'rep_limit' and
477 # make that the content that is hashed on
478
479 # *** Think twice before changing the following value
480 # as it will break backward compatability of computed
481 # document HASH values
482
483 my $rep_limit = 256;
484 my $hash_content = undef;
485
486 if (length($source_filename)<$rep_limit) {
487 my $rep_string = "$source_filename|";
488 my $rs_len = length($rep_string);
489
490 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
491
492 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
493 }
494 else {
495 $hash_content = $source_filename;
496 }
497
498 my $filename = &util::get_tmp_filename();
499 if (!open (OUTFILE, ">:utf8", $filename)) {
500 print STDERR "doc::set_OID could not write to $filename\n";
501 } else {
502 print OUTFILE $hash_content;
503 close (OUTFILE);
504 }
505 $OID = $self->_calc_OID ($filename);
506
507 &util::rm ($filename);
508 }
509
510 if ($hash_on_file) {
511 # "hash" OID - feed file to hashfile.exe
512 my $filename = $self->get_filename_for_hashing();
513
514 # -z: don't want to hash on the file if it is zero size
515 if (defined($filename) && -e $filename && !-z $filename) {
516 $OID = $self->_calc_OID ($filename);
517 } else {
518 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
519 }
520 }
521
522 if ($hash_on_ga_xml) {
523 # In addition being asked to explicity calculate the has based on the GA file,
524 # can also end up coming into this block is doing 'hash_on_file' but the file
525 # itself is of zero bytes (as could be the case with 'doc.nul' file
526
527 my $filename = &util::get_tmp_filename();
528 if (!open (OUTFILE, ">:utf8", $filename)) {
529 print STDERR "doc::set_OID could not write to $filename\n";
530 } else {
531 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
532 print OUTFILE $doc_text;
533 close (OUTFILE);
534 }
535 $OID = $self->_calc_OID ($filename);
536 &util::rm ($filename);
537 }
538 }
539 }
540 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
541}
542
543# this uses hashdoc (embedded c thingy) which is faster but still
544# needs a little work to be suffiently stable
545sub ___set_OID {
546 my $self = shift (@_);
547 my ($OID) = @_;
548
549 # if an OID wasn't provided then calculate hash value based on document
550 if (!defined $OID)
551 {
552 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
553 my $hash_len = length($hash_text);
554
555 $OID = &hashdoc::buffer($hash_text,$hash_len);
556 }
557
558 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
559}
560
561# returns the OID for this document
562sub get_OID {
563 my $self = shift (@_);
564 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
565 return $OID if (defined $OID);
566 return "NULL";
567}
568
569sub delete_OID {
570 my $self = shift (@_);
571
572 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
573}
574
575
576# methods for manipulating section names
577
578# returns the name of the top-most section (the top
579# level of the document
580sub get_top_section {
581 my $self = shift (@_);
582
583 return "";
584}
585
586# returns a section
587sub get_parent_section {
588 my $self = shift (@_);
589 my ($section) = @_;
590
591 $section =~ s/(^|\.)\d+$//;
592
593 return $section;
594}
595
596# returns the first child section (or the end child
597# if there isn't any)
598sub get_begin_child {
599 my $self = shift (@_);
600 my ($section) = @_;
601
602 my $section_ptr = $self->_lookup_section($section);
603 return "" unless defined $section_ptr;
604
605 if (defined $section_ptr->{'subsection_order'}->[0]) {
606 return "$section.$section_ptr->{'subsection_order'}->[0]";
607 }
608
609 return $self->get_end_child ($section);
610}
611
612# returns the next child of a parent section
613sub get_next_child {
614 my $self = shift (@_);
615 my ($section) = @_;
616
617 my $parent_section = $self->get_parent_section($section);
618 my $parent_section_ptr = $self->_lookup_section($parent_section);
619 return undef unless defined $parent_section_ptr;
620
621 my ($section_num) = $section =~ /(\d+)$/;
622 return undef unless defined $section_num;
623
624 my $i = 0;
625 my $section_order = $parent_section_ptr->{'subsection_order'};
626 while ($i < scalar(@$section_order)) {
627 last if $section_order->[$i] eq $section_num;
628 $i++;
629 }
630
631 $i++; # the next child
632 if ($i < scalar(@$section_order)) {
633 return $section_order->[$i] if $parent_section eq "";
634 return "$parent_section.$section_order->[$i]";
635 }
636
637 # no more sections in this level
638 return undef;
639}
640
641# returns a reference to a list of children
642sub get_children {
643 my $self = shift (@_);
644 my ($section) = @_;
645
646 my $section_ptr = $self->_lookup_section($section);
647 return [] unless defined $section_ptr;
648
649 my @children = @{$section_ptr->{'subsection_order'}};
650
651 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
652 return \@children;
653}
654
655# returns the child section one past the last one (which
656# is coded as "0")
657sub get_end_child {
658 my $self = shift (@_);
659 my ($section) = @_;
660
661 return $section . ".0" unless $section eq "";
662 return "0";
663}
664
665# returns the next section in book order
666sub get_next_section {
667 my $self = shift (@_);
668 my ($section) = @_;
669
670 return undef unless defined $section;
671
672 my $section_ptr = $self->_lookup_section($section);
673 return undef unless defined $section_ptr;
674
675 # first try to find first child
676 if (defined $section_ptr->{'subsection_order'}->[0]) {
677 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
678 return "$section.$section_ptr->{'subsection_order'}->[0]";
679 }
680
681 do {
682 # try to find sibling
683 my $next_child = $self->get_next_child ($section);
684 return $next_child if (defined $next_child);
685
686 # move up one level
687 $section = $self->get_parent_section ($section);
688 } while $section =~ /\d/;
689
690 return undef;
691}
692
693sub is_leaf_section {
694 my $self = shift (@_);
695 my ($section) = @_;
696
697 my $section_ptr = $self->_lookup_section($section);
698 return 1 unless defined $section_ptr;
699
700 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
701}
702
703# methods for dealing with sections
704
705# returns the name of the inserted section
706sub insert_section {
707 my $self = shift (@_);
708 my ($before_section) = @_;
709
710 # get the child to insert before and its parent section
711 my $parent_section = "";
712 my $before_child = "0";
713 my @before_section = split (/\./, $before_section);
714 if (scalar(@before_section) > 0) {
715 $before_child = pop (@before_section);
716 $parent_section = join (".", @before_section);
717 }
718
719 my $parent_section_ptr = $self->_lookup_section($parent_section);
720 if (!defined $parent_section_ptr) {
721 print STDERR "doc::insert_section couldn't find parent section " .
722 "$parent_section\n";
723 return;
724 }
725
726 # get the next section number
727 my $section_num = $parent_section_ptr->{'next_subsection'}++;
728
729 my $i = 0;
730 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
731 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
732 $i++;
733 }
734
735 # insert the section number into the order list
736 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
737
738 # add this section to the parent section
739 my $section_ptr = {'subsection_order'=>[],
740 'next_subsection'=>1,
741 'subsections'=>{},
742 'metadata'=>[],
743 'text'=>""};
744 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
745
746 # work out the full section number
747 my $section = $parent_section;
748 $section .= "." unless $section eq "";
749 $section .= $section_num;
750
751 return $section;
752}
753
754# creates a pre-named section
755sub create_named_section {
756 my $self = shift (@_);
757 my ($mastersection) = @_;
758
759 my ($num);
760 my $section = $mastersection;
761 my $sectionref = $self;
762
763 while ($section ne "") {
764 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
765 $num =~ s/^0+(\d)/$1/; # remove leading 0s
766 $section = "" unless defined $section;
767
768 if (defined $num) {
769 if (!defined $sectionref->{'subsections'}->{$num}) {
770 push (@{$sectionref->{'subsection_order'}}, $num);
771 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
772 'next_subsection'=>1,
773 'subsections'=>{},
774 'metadata'=>[],
775 'text'=>""};
776 if ($num >= $sectionref->{'next_subsection'}) {
777 $sectionref->{'next_subsection'} = $num + 1;
778 }
779 }
780 $sectionref = $sectionref->{'subsections'}->{$num};
781
782 } else {
783 print STDERR "doc::create_named_section couldn't create section ";
784 print STDERR "$mastersection\n";
785 last;
786 }
787 }
788}
789
790# returns a reference to a list of subsections
791sub list_subsections {
792 my $self = shift (@_);
793 my ($section) = @_;
794
795 my $section_ptr = $self->_lookup_section ($section);
796 if (!defined $section_ptr) {
797 print STDERR "doc::list_subsections couldn't find section $section\n";
798 return [];
799 }
800
801 return [@{$section_ptr->{'subsection_order'}}];
802}
803
804sub delete_section {
805 my $self = shift (@_);
806 my ($section) = @_;
807
808# my $section_ptr = {'subsection_order'=>[],
809# 'next_subsection'=>1,
810# 'subsections'=>{},
811# 'metadata'=>[],
812# 'text'=>""};
813
814 # if this is the top section reset everything
815 if ($section eq "") {
816 $self->{'subsection_order'} = [];
817 $self->{'subsections'} = {};
818 $self->{'metadata'} = [];
819 $self->{'text'} = "";
820 return;
821 }
822
823 # find the parent of the section to delete
824 my $parent_section = "";
825 my $child = "0";
826 my @section = split (/\./, $section);
827 if (scalar(@section) > 0) {
828 $child = pop (@section);
829 $parent_section = join (".", @section);
830 }
831
832 my $parent_section_ptr = $self->_lookup_section($parent_section);
833 if (!defined $parent_section_ptr) {
834 print STDERR "doc::delete_section couldn't find parent section " .
835 "$parent_section\n";
836 return;
837 }
838
839 # remove this section from the subsection_order list
840 my $i = 0;
841 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
842 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
843 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
844 last;
845 }
846 $i++;
847 }
848
849 # remove this section from the subsection hash
850 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
851 undef $parent_section_ptr->{'subsections'}->{$child};
852 }
853}
854
855#--
856# methods for dealing with metadata
857
858# set_metadata_element and get_metadata_element are for metadata
859# which should only have one value. add_meta_data and get_metadata
860# are for metadata which can have more than one value.
861
862# returns the first metadata value which matches field
863
864# This version of get metadata element works much like the one above,
865# except it allows for the namespace portion of a metadata element to
866# be ignored, thus if you are searching for dc.Title, the first piece
867# of matching metadata ending with the name Title (once any namespace
868# is removed) would be returned.
869# 28-11-2003 John Thompson
870sub get_metadata_element {
871 my $self = shift (@_);
872 my ($section, $field, $ignore_namespace) = @_;
873 my ($data);
874
875 $ignore_namespace = 0 unless defined $ignore_namespace;
876
877 my $section_ptr = $self->_lookup_section($section);
878 if (!defined $section_ptr) {
879 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
880 return;
881 }
882
883 # Remove any namespace if we are being told to ignore them
884 if($ignore_namespace) {
885 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
886 }
887
888 foreach $data (@{$section_ptr->{'metadata'}}) {
889
890 my $data_name = $data->[0];
891
892 # Remove any namespace if we are being told to ignore them
893 if($ignore_namespace) {
894 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
895 }
896 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
897 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
898
899 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
900 }
901
902 return undef; # was not found
903}
904
905# returns a list of the form [value1, value2, ...]
906sub get_metadata {
907 my $self = shift (@_);
908 my ($section, $field, $ignore_namespace) = @_;
909 my ($data);
910
911 $ignore_namespace = 0 unless defined $ignore_namespace;
912
913 my $section_ptr = $self->_lookup_section($section);
914 if (!defined $section_ptr) {
915 print STDERR "doc::get_metadata couldn't find section ",
916 $section, "\n";
917 return;
918 }
919
920 # Remove any namespace if we are being told to ignore them
921 if($ignore_namespace) {
922 $field =~ s/^.*\.//;
923 }
924
925 my @metadata = ();
926 foreach $data (@{$section_ptr->{'metadata'}}) {
927
928 my $data_name = $data->[0];
929
930 # Remove any namespace if we are being told to ignore them
931 if($ignore_namespace) {
932 $data_name =~ s/^.*\.//;
933 }
934 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
935 $data_name =~ s/^ex\.([^.]+)$/$1/;
936
937 push (@metadata, $data->[1]) if ($data_name eq $field);
938 }
939
940 return \@metadata;
941}
942
943sub get_metadata_hashmap {
944 my $self = shift (@_);
945 my ($section, $opt_namespace) = @_;
946
947 my $section_ptr = $self->_lookup_section($section);
948 if (!defined $section_ptr) {
949 print STDERR "doc::get_metadata couldn't find section ",
950 $section, "\n";
951 return;
952 }
953
954 my $metadata_hashmap = {};
955 foreach my $data (@{$section_ptr->{'metadata'}}) {
956 my $metaname = $data->[0];
957
958 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
959 if (!defined $metadata_hashmap->{$metaname}) {
960 $metadata_hashmap->{$metaname} = [];
961 }
962 my $metaval_list = $metadata_hashmap->{$metaname};
963 push(@$metaval_list, $data->[1]);
964 }
965 }
966
967 return $metadata_hashmap;
968}
969
970# returns a list of the form [[field,value],[field,value],...]
971sub get_all_metadata {
972 my $self = shift (@_);
973 my ($section) = @_;
974
975 my $section_ptr = $self->_lookup_section($section);
976 if (!defined $section_ptr) {
977 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
978 return;
979 }
980
981 return $section_ptr->{'metadata'};
982}
983
984# $value is optional
985sub delete_metadata {
986 my $self = shift (@_);
987 my ($section, $field, $value) = @_;
988
989 my $section_ptr = $self->_lookup_section($section);
990 if (!defined $section_ptr) {
991 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
992 return;
993 }
994
995 my $i = 0;
996 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
997 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
998 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
999 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1000 } else {
1001 $i++;
1002 }
1003 }
1004}
1005
1006sub delete_all_metadata {
1007 my $self = shift (@_);
1008 my ($section) = @_;
1009
1010 my $section_ptr = $self->_lookup_section($section);
1011 if (!defined $section_ptr) {
1012 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1013 return;
1014 }
1015
1016 $section_ptr->{'metadata'} = [];
1017}
1018
1019sub set_metadata_element {
1020 my $self = shift (@_);
1021 my ($section, $field, $value) = @_;
1022
1023 $self->set_utf8_metadata_element ($section, $field,
1024 &unicode::ascii2utf8(\$value));
1025}
1026
1027# set_utf8_metadata_element assumes the text has already been
1028# converted to the UTF-8 encoding.
1029sub set_utf8_metadata_element {
1030 my $self = shift (@_);
1031 my ($section, $field, $value) = @_;
1032
1033 $self->delete_metadata ($section, $field);
1034 $self->add_utf8_metadata ($section, $field, $value);
1035}
1036
1037
1038# add_metadata assumes the text is in (extended) ascii form. For
1039# text which has already been converted to the UTF-8 format use
1040# add_utf8_metadata.
1041sub add_metadata {
1042 my $self = shift (@_);
1043 my ($section, $field, $value) = @_;
1044
1045 $self->add_utf8_metadata ($section, $field,
1046 &unicode::ascii2utf8(\$value));
1047}
1048
1049sub add_utf8_metadata {
1050 my $self = shift (@_);
1051 my ($section, $field, $value) = @_;
1052
1053 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1054 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1055 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1056
1057 my $section_ptr = $self->_lookup_section($section);
1058 if (!defined $section_ptr) {
1059 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1060 return;
1061 }
1062 if (!defined $value) {
1063 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1064 return;
1065 }
1066 if (!defined $field) {
1067 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1068 return;
1069 }
1070
1071 #print STDERR "###$field=$value\n";
1072
1073 # For now, supress this check. Given that text data read in is now
1074 # Unicode aware, then the following block of code can (ironically enough)
1075 # cause our unicode compliant string to be re-encoded (leading to
1076 # a double-encoded UTF-8 string, which we definitely don't want!).
1077
1078
1079 # double check that the value is utf-8
1080 # if (!&unicode::check_is_utf8($value)) {
1081 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1082 # &unicode::ensure_utf8(\$value);
1083 # print STDERR " Tried converting to utf8: $value\n";
1084 # }
1085
1086 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1087 if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1088 {
1089 my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1090 if (defined $mdprefix) {
1091 # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1092 push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1093 }
1094
1095 my $direction;
1096 if($value =~ m/^-/)
1097 {
1098 $direction = ($field eq "Latitude") ? "S" : "W";
1099 }
1100 else
1101 {
1102 $direction = ($field eq "Latitude") ? "N" : "E";
1103 }
1104
1105 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1106 if(defined $beforeDec && defined $afterDec)
1107 {
1108 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1109 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1110
1111 for(my $i = 2; $i <= 4; $i++)
1112 {
1113 if(length($afterDec) >= $i)
1114 {
1115 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1116 }
1117 }
1118
1119 #Only add the metadata if it has not already been added
1120 my $metaMap = $self->get_metadata_hashmap($section);
1121 }
1122 }
1123
1124 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1125}
1126
1127
1128# methods for dealing with text
1129
1130# returns the text for a section
1131sub get_text {
1132 my $self = shift (@_);
1133 my ($section) = @_;
1134
1135 my $section_ptr = $self->_lookup_section($section);
1136 if (!defined $section_ptr) {
1137 print STDERR "doc::get_text couldn't find section " .
1138 "$section\n";
1139 return "";
1140 }
1141
1142 return $section_ptr->{'text'};
1143}
1144
1145# returns the (utf-8 encoded) length of the text for a section
1146sub get_text_length {
1147 my $self = shift (@_);
1148 my ($section) = @_;
1149
1150 my $section_ptr = $self->_lookup_section($section);
1151 if (!defined $section_ptr) {
1152 print STDERR "doc::get_text_length couldn't find section " .
1153 "$section\n";
1154 return 0;
1155 }
1156
1157 return length ($section_ptr->{'text'});
1158}
1159
1160# returns the total length for all the sections
1161sub get_total_text_length {
1162 my $self = shift (@_);
1163
1164 my $section = $self->get_top_section();
1165 my $length = 0;
1166 while (defined $section) {
1167 $length += $self->get_text_length($section);
1168 $section = $self->get_next_section($section);
1169 }
1170 return $length;
1171}
1172
1173sub delete_text {
1174 my $self = shift (@_);
1175 my ($section) = @_;
1176
1177 my $section_ptr = $self->_lookup_section($section);
1178 if (!defined $section_ptr) {
1179 print STDERR "doc::delete_text couldn't find section " .
1180 "$section\n";
1181 return;
1182 }
1183
1184 $section_ptr->{'text'} = "";
1185}
1186
1187# add_text assumes the text is in (extended) ascii form. For
1188# text which has been already converted to the UTF-8 format
1189# use add_utf8_text.
1190sub add_text {
1191 my $self = shift (@_);
1192 my ($section, $text) = @_;
1193
1194 # convert the text to UTF-8 encoded unicode characters
1195 # and add the text
1196 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1197}
1198
1199
1200# add_utf8_text assumes the text to be added has already
1201# been converted to the UTF-8 encoding. For ascii text use
1202# add_text
1203sub add_utf8_text {
1204 my $self = shift (@_);
1205 my ($section, $text) = @_;
1206
1207 my $section_ptr = $self->_lookup_section($section);
1208 if (!defined $section_ptr) {
1209 print STDERR "doc::add_utf8_text couldn't find section " .
1210 "$section\n";
1211 return;
1212 }
1213
1214 $section_ptr->{'text'} .= $text;
1215}
1216
1217# returns the Source meta, which is the utf8 filename generated.
1218# Added a separate method here for convenience
1219sub get_source {
1220 my $self = shift (@_);
1221 return $self->get_metadata_element ($self->get_top_section(), "Source");
1222}
1223
1224# returns the SourceFile meta, which is the url reference to the URL-encoded
1225# version of Source (the utf8 filename). Added a separate method here for convenience
1226sub get_sourcefile {
1227 my $self = shift (@_);
1228 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1229}
1230
1231# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1232# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1233sub get_assocfile_from_sourcefile {
1234 my $self = shift (@_);
1235
1236 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1237 my $top_section = $self->get_top_section();
1238 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1239
1240 # get the actual filename as it exists on the filesystem which this url refers to
1241 $source_file = &unicode::url_to_filename($source_file);
1242 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1243 return $assocfilename;
1244}
1245
1246# methods for dealing with associated files
1247
1248# a file is associated with a document, NOT a section.
1249# if section is defined it is noted in the data structure
1250# only so that files associated from a particular section
1251# may be removed later (using delete_section_assoc_files)
1252sub associate_file {
1253 my $self = shift (@_);
1254 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1255 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1256
1257 # remove all associated files with the same name
1258 $self->delete_assoc_file ($assoc_filename);
1259
1260 # Too harsh a requirement
1261 # Definitely get HTML docs, for example, with some missing
1262 # support files
1263# if (!&util::fd_exists($real_filename)) {
1264# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1265# exit -1;
1266# }
1267
1268# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1269# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1270## my $utf8_filename = Encode::encode("utf8",$filename);
1271
1272 push (@{$self->{'associated_files'}},
1273 [$real_filename, $assoc_filename, $mime_type, $section]);
1274}
1275
1276# returns a list of associated files in the form
1277# [[real_filename, assoc_filename, mimetype], ...]
1278sub get_assoc_files {
1279 my $self = shift (@_);
1280
1281 return $self->{'associated_files'};
1282}
1283
1284# the following two methods used to keep track of original associated files
1285# for incremental building. eg a txt file used by an item file does not end
1286# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1287# file for incremental build
1288sub associate_source_file {
1289 my $self = shift (@_);
1290 my ($full_filename) = @_;
1291
1292 push (@{$self->{'source_assoc_files'}}, $full_filename);
1293
1294}
1295
1296sub get_source_assoc_files {
1297 my $self = shift (@_);
1298
1299 return $self->{'source_assoc_files'};
1300
1301
1302}
1303sub metadata_file {
1304 my $self = shift (@_);
1305 my ($real_filename, $filename) = @_;
1306
1307 push (@{$self->{'metadata_files'}},
1308 [$real_filename, $filename]);
1309}
1310
1311# used for writing out the archiveinf-doc info database, to list all the metadata files
1312sub get_meta_files {
1313 my $self = shift (@_);
1314
1315 return $self->{'metadata_files'};
1316}
1317
1318sub delete_section_assoc_files {
1319 my $self = shift (@_);
1320 my ($section) = @_;
1321
1322 my $i=0;
1323 while ($i < scalar (@{$self->{'associated_files'}})) {
1324 if (defined $self->{'associated_files'}->[$i]->[3] &&
1325 $self->{'associated_files'}->[$i]->[3] eq $section) {
1326 splice (@{$self->{'associated_files'}}, $i, 1);
1327 } else {
1328 $i++;
1329 }
1330 }
1331}
1332
1333sub delete_assoc_file {
1334 my $self = shift (@_);
1335 my ($assoc_filename) = @_;
1336
1337 my $i=0;
1338 while ($i < scalar (@{$self->{'associated_files'}})) {
1339 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1340 splice (@{$self->{'associated_files'}}, $i, 1);
1341 } else {
1342 $i++;
1343 }
1344 }
1345}
1346
1347sub reset_nextsection_ptr {
1348 my $self = shift (@_);
1349 my ($section) = @_;
1350
1351 my $section_ptr = $self->_lookup_section($section);
1352 $section_ptr->{'next_subsection'} = 1;
1353}
1354
13551;
Note: See TracBrowser for help on using the repository browser.