source: main/trunk/greenstone2/perllib/doc.pm@ 27306

Last change on this file since 27306 was 27306, checked in by jmt12, 11 years ago

Moving the critical file-related functions (copy, rm, etc) out of util.pm into their own proper class FileUtils. Use of the old functions in util.pm will prompt deprecated warning messages. There may be further functions that could be moved across in the future, but these are the critical ones when considering supporting other filesystems (HTTP, HDFS, WebDav, etc). Updated some key files to use the new functions so now deprecated messages thrown when importing/building demo collection 'out of the box'

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 38.2 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use FileUtils;
40use ghtml;
41use File::stat;
42##use hashdoc;
43use docprint;
44
45# the document type may be indexed_doc, nonindexed_doc, or
46# classification
47
48our $OIDcount = 0;
49
50# rename_method can be 'url', 'none', 'base64'
51sub new {
52 my $class = shift (@_);
53 my ($source_filename, $doc_type, $rename_method) = @_;
54
55
56 my $self = bless {'associated_files'=>[],
57 'subsection_order'=>[],
58 'next_subsection'=>1,
59 'subsections'=>{},
60 'metadata'=>[],
61 'text'=>"",
62 'OIDtype'=>"hash"}, $class;
63
64 # used to set lastmodified here, but this can screw up the HASH ids, so
65 # the docsave processor now calls set_lastmodified
66
67 $self->set_source_path($source_filename);
68
69 if (defined $source_filename) {
70 $source_filename = &util::filename_within_collection($source_filename);
71 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
72 $self->set_source_filename ($source_filename, $rename_method);
73 }
74
75 $self->set_doc_type ($doc_type) if defined $doc_type;
76
77 return $self;
78}
79
80
81sub set_source_path
82{
83 my $self = shift @_;
84 my ($source_filename) = @_;
85
86 if (defined $source_filename) {
87 # On Windows the source_filename can be in terse DOS format
88 # e.g. test~1.txt
89
90 $self->{'terse_source_path'} = $source_filename;
91
92 # Use the FileUtil library methods as they are aware of more special
93 # cases such as HDFS [jmt12]
94 ##if (&util::file_exists($source_filename)) {
95 if (-e $source_filename) {
96 # See if we can do better for Windows with a filename
97 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
98 require Win32;
99 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
100 }
101 else {
102 # For Unix-based systems, there is no difference between the two
103 $self->{'source_path'} = $source_filename;
104 }
105 }
106 else {
107 print STDERR "Warning: In doc::set_source_path(), file\n";
108 print STDERR " $source_filename\n";
109 print STDERR " does not exist\n";
110
111 # (default) Set it to whatever we were given
112 $self->{'source_path'} = $source_filename;
113 }
114 }
115 else {
116 # Previous code for setting source_path allowed for
117 # it to be undefined, so continue this practice
118 $self->{'terse_source_path'} = undef;
119 $self->{'source_path'} = undef;
120 }
121}
122
123
124sub get_source_path
125{
126 my $self = shift @_;
127
128 return $self->{'terse_source_path'};
129}
130
131# set lastmodified for OAI purposes, added by GRB, moved by kjdon
132sub set_oailastmodified {
133 my $self = shift (@_);
134
135 my $source_path = $self->{'terse_source_path'};
136
137 if (defined $source_path && (-e $source_path)) {
138 my $current_time = time;
139
140 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
141 $wday, $yday, $isdst) = localtime($current_time);
142
143 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
144
145 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
146 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
147 }
148}
149
150# no longer used for OAI purposes, since lastmodified is not what we want as the
151# Datestamp of a document. This doc metadata may be useful for general purposes.
152sub set_lastmodified {
153 my $self = shift (@_);
154
155 my $source_path = $self->{'terse_source_path'};
156
157 if (defined $source_path && (-e $source_path)) {
158
159 my $file_stat = stat($source_path);
160 my $mtime = $file_stat->mtime;
161 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
162 $wday, $yday, $isdst) = localtime($mtime);
163
164 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
165
166 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
167 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
168 }
169}
170
171# clone the $self object
172sub duplicate {
173 my $self = shift (@_);
174
175 my $newobj = {};
176
177 foreach my $k (keys %$self) {
178 $newobj->{$k} = &clone ($self->{$k});
179 }
180
181 bless $newobj, ref($self);
182 return $newobj;
183}
184
185sub clone {
186 my ($from) = @_;
187 my $type = ref ($from);
188
189 if ($type eq "HASH") {
190 my $to = {};
191 foreach my $key (keys %$from) {
192 $to->{$key} = &clone ($from->{$key});
193 }
194 return $to;
195 } elsif ($type eq "ARRAY") {
196 my $to = [];
197 foreach my $v (@$from) {
198 push (@$to, &clone ($v));
199 }
200 return $to;
201 } else {
202 return $from;
203 }
204}
205
206sub set_OIDtype {
207 my $self = shift (@_);
208 my ($type, $metadata) = @_;
209
210 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
211 $self->{'OIDtype'} = $type;
212 } else {
213 $self->{'OIDtype'} = "hash";
214 }
215
216 if ($type =~ /^assigned$/) {
217 if (defined $metadata) {
218 $self->{'OIDmetadata'} = $metadata;
219 } else {
220 $self->{'OIDmetadata'} = "dc.Identifier";
221 }
222 }
223}
224
225# rename_method can be 'url', 'none', 'base64'
226sub set_source_filename {
227 my $self = shift (@_);
228 my ($source_filename, $rename_method) = @_;
229
230 # Since the gsdlsourcefilename element goes into the doc.xml it has
231 # to be utf8. However, it should also *represent* the source filename
232 # (in the import directory) which may not be utf8 at all.
233 # For instance, if this meta element (gsdlsourcefilename) will be used
234 # by other applications that parse doc.xml in order to locate
235 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
236 # encode the real filename as this is a binary-to-text encoding meaning
237 # that the resulting string is ASCII (utf8). Decoding will give the original.
238
239# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
240
241 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
242 $source_filename = $self->encode_filename($source_filename, $rename_method);
243# my ($srcfilename,$dirname,$suffix)
244# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
245# print STDERR "-> $srcfilename -> ";
246# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
247# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
248# print STDERR "$source_filename\n";
249
250 $self->set_utf8_metadata_element ($self->get_top_section(),
251 "gsdlsourcefilename",
252 $source_filename);
253}
254
255sub encode_filename {
256 my $self = shift (@_);
257 my ($source_filename, $rename_method) = @_;
258
259 my ($srcfilename,$dirname,$suffix)
260 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
261# print STDERR "-> $srcfilename -> ";
262 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
263 $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
264
265 return $source_filename;
266}
267
268sub set_converted_filename {
269 my $self = shift (@_);
270 my ($converted_filename) = @_;
271
272 # we know the converted filename is utf8
273 $self->set_utf8_metadata_element ($self->get_top_section(),
274 "gsdlconvertedfilename",
275 $converted_filename);
276}
277
278# returns the source_filename as it was provided
279sub get_unmodified_source_filename {
280 my $self = shift (@_);
281
282 return $self->{'terse_source_path'};
283}
284
285# returns the source_filename with whatever rename_method was given
286sub get_source_filename {
287 my $self = shift (@_);
288
289 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
290}
291
292
293
294# returns converted filename if available else returns source filename
295sub get_filename_for_hashing {
296 my $self = shift (@_);
297
298 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
299
300 if (!defined $filename) {
301 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
302 # if NULPlug processed file, then don't give a filename
303 if (defined $plugin_name && $plugin_name eq "NULPlug") {
304 $filename = undef;
305 } else { # returns the URL encoded source filename!
306 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
307 }
308 }
309
310 if (!&FileUtils::isFilenameAbsolute($filename)) {
311 $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
312 }
313
314 return $filename;
315}
316
317sub set_doc_type {
318 my $self = shift (@_);
319 my ($doc_type) = @_;
320
321 $self->set_metadata_element ($self->get_top_section(),
322 "gsdldoctype",
323 $doc_type);
324}
325
326# returns the gsdldoctype as it was provided
327# the default of "indexed_doc" is used if no document
328# type was provided
329sub get_doc_type {
330 my $self = shift (@_);
331
332 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
333 return $doc_type if (defined $doc_type);
334 return "indexed_doc";
335}
336
337
338# look up the reference to the a particular section
339sub _lookup_section {
340 my $self = shift (@_);
341 my ($section) = @_;
342
343 my ($num);
344 my $sectionref = $self;
345
346 while (defined $section && $section ne "") {
347
348 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
349
350 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
351
352 $section = "" unless defined $section;
353
354
355 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
356 $sectionref = $sectionref->{'subsections'}->{$num};
357 } else {
358 return undef;
359 }
360 }
361
362 return $sectionref;
363}
364
365# calculate OID by hashing the contents of the document
366sub _calc_OID {
367 my $self = shift (@_);
368 my ($filename) = @_;
369
370
371 my $osexe = &util::get_os_exe();
372
373 my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
374 $ENV{'GSDLOS'},"hashfile$osexe");
375
376 my $result = "NULL";
377
378
379 if (-e "$hashfile_exe") {
380# $result = `\"$hashfile_exe\" \"$filename\"`;
381# $result = `hashfile$osexe \"$filename\" 2>&1`;
382 $result = `hashfile$osexe \"$filename\"`;
383
384 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
385 } else {
386 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
387 }
388 return "HASH$result";
389}
390
391# methods dealing with OID, not groups of them.
392
393# if $OID is not provided one is calculated
394sub set_OID {
395 my $self = shift (@_);
396 my ($OID) = @_;
397
398 my $use_hash_oid = 0;
399 # if an OID wasn't provided calculate one
400 if (!defined $OID) {
401 $OID = "NULL";
402 if ($self->{'OIDtype'} =~ /^hash/) {
403 $use_hash_oid = 1;
404 } elsif ($self->{'OIDtype'} eq "incremental") {
405 $OID = "D" . $OIDcount;
406 $OIDcount ++;
407 } elsif ($self->{'OIDtype'} eq "filename") {
408 my $filename = $self->get_source_filename();
409 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
410 $OID = &util::tidy_up_oid($OID);
411 } elsif ($self->{'OIDtype'} eq "full_filename") {
412 my $source_filename = $self->get_source_filename();
413 my $dirsep = &util::get_os_dirsep();
414
415 $source_filename =~ s/^import$dirsep//;
416 $source_filename =~ s/$dirsep/-/g;
417 $source_filename =~ s/\./_/g;
418
419 $OID = $source_filename;
420 $OID = &util::tidy_up_oid($OID);
421 } elsif ($self->{'OIDtype'} eq "dirname") {
422 $OID = 'J';
423 my $filename = $self->get_source_filename();
424 if (defined($filename)) { # && -e $filename) {
425 $OID = &File::Basename::dirname($filename);
426 if (defined $OID) {
427 $OID = 'J'.&File::Basename::basename($OID);
428 $OID = &util::tidy_up_oid($OID);
429 } else {
430 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
431 $use_hash_oid = 1;
432 }
433 } else {
434 print STDERR "Failed to find filename, generating hash id\n";
435 $use_hash_oid = 1;
436 }
437
438 } elsif ($self->{'OIDtype'} eq "assigned") {
439 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
440 if (defined $identifier && $identifier ne "") {
441 $OID = $identifier;
442 $OID = &util::tidy_up_oid($OID);
443 } else {
444 # need a hash id
445 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
446 $use_hash_oid = 1;
447 }
448
449 } else {
450 $use_hash_oid = 1;
451 }
452
453 if ($use_hash_oid) {
454 my $hash_on_file = 1;
455 my $hash_on_ga_xml = 0;
456
457 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
458 $hash_on_file = 0;
459 $hash_on_ga_xml = 1;
460 }
461
462 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
463 $hash_on_file = 0;
464 $hash_on_ga_xml = 0;
465
466 my $source_filename = $self->get_source_filename();
467 my $dirsep = &util::get_os_dirsep();
468
469 $source_filename =~ s/^import$dirsep//;
470 $source_filename =~ s/$dirsep/-/g;
471 $source_filename =~ s/\./_/g;
472
473 # If the filename is very short then (handled naively)
474 # this can cause conjestion in the hash-values
475 # computed, leading documents sharing the same leading
476 # Hex values in the computed has.
477 #
478 # The solution taken here is to replace the name of
479 # the file name a sufficient number of times (up to
480 # the character limit defined in 'rep_limit' and
481 # make that the content that is hashed on
482
483 # *** Think twice before changing the following value
484 # as it will break backward compatability of computed
485 # document HASH values
486
487 my $rep_limit = 256;
488 my $hash_content = undef;
489
490 if (length($source_filename)<$rep_limit) {
491 my $rep_string = "$source_filename|";
492 my $rs_len = length($rep_string);
493
494 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
495
496 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
497 }
498 else {
499 $hash_content = $source_filename;
500 }
501
502 my $filename = &util::get_tmp_filename();
503 if (!open (OUTFILE, ">:utf8", $filename)) {
504 print STDERR "doc::set_OID could not write to $filename\n";
505 } else {
506 print OUTFILE $hash_content;
507 close (OUTFILE);
508 }
509 $OID = $self->_calc_OID ($filename);
510
511 &util::rm ($filename);
512 }
513
514 if ($hash_on_file) {
515 # "hash" OID - feed file to hashfile.exe
516 my $filename = $self->get_filename_for_hashing();
517
518 # -z: don't want to hash on the file if it is zero size
519 if (defined($filename) && -e $filename && !-z $filename) {
520 $OID = $self->_calc_OID ($filename);
521 } else {
522 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
523 }
524 }
525
526 if ($hash_on_ga_xml) {
527 # In addition being asked to explicity calculate the has based on the GA file,
528 # can also end up coming into this block is doing 'hash_on_file' but the file
529 # itself is of zero bytes (as could be the case with 'doc.nul' file
530
531 my $filename = &util::get_tmp_filename();
532 if (!open (OUTFILE, ">:utf8", $filename)) {
533 print STDERR "doc::set_OID could not write to $filename\n";
534 } else {
535 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
536 print OUTFILE $doc_text;
537 close (OUTFILE);
538 }
539 $OID = $self->_calc_OID ($filename);
540 &util::rm ($filename);
541 }
542 }
543 }
544 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
545}
546
547# this uses hashdoc (embedded c thingy) which is faster but still
548# needs a little work to be suffiently stable
549sub ___set_OID {
550 my $self = shift (@_);
551 my ($OID) = @_;
552
553 # if an OID wasn't provided then calculate hash value based on document
554 if (!defined $OID)
555 {
556 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
557 my $hash_len = length($hash_text);
558
559 $OID = &hashdoc::buffer($hash_text,$hash_len);
560 }
561
562 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
563}
564
565# returns the OID for this document
566sub get_OID {
567 my $self = shift (@_);
568 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
569 return $OID if (defined $OID);
570 return "NULL";
571}
572
573sub delete_OID {
574 my $self = shift (@_);
575
576 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
577}
578
579
580# methods for manipulating section names
581
582# returns the name of the top-most section (the top
583# level of the document
584sub get_top_section {
585 my $self = shift (@_);
586
587 return "";
588}
589
590# returns a section
591sub get_parent_section {
592 my $self = shift (@_);
593 my ($section) = @_;
594
595 $section =~ s/(^|\.)\d+$//;
596
597 return $section;
598}
599
600# returns the first child section (or the end child
601# if there isn't any)
602sub get_begin_child {
603 my $self = shift (@_);
604 my ($section) = @_;
605
606 my $section_ptr = $self->_lookup_section($section);
607 return "" unless defined $section_ptr;
608
609 if (defined $section_ptr->{'subsection_order'}->[0]) {
610 return "$section.$section_ptr->{'subsection_order'}->[0]";
611 }
612
613 return $self->get_end_child ($section);
614}
615
616# returns the next child of a parent section
617sub get_next_child {
618 my $self = shift (@_);
619 my ($section) = @_;
620
621 my $parent_section = $self->get_parent_section($section);
622 my $parent_section_ptr = $self->_lookup_section($parent_section);
623 return undef unless defined $parent_section_ptr;
624
625 my ($section_num) = $section =~ /(\d+)$/;
626 return undef unless defined $section_num;
627
628 my $i = 0;
629 my $section_order = $parent_section_ptr->{'subsection_order'};
630 while ($i < scalar(@$section_order)) {
631 last if $section_order->[$i] eq $section_num;
632 $i++;
633 }
634
635 $i++; # the next child
636 if ($i < scalar(@$section_order)) {
637 return $section_order->[$i] if $parent_section eq "";
638 return "$parent_section.$section_order->[$i]";
639 }
640
641 # no more sections in this level
642 return undef;
643}
644
645# returns a reference to a list of children
646sub get_children {
647 my $self = shift (@_);
648 my ($section) = @_;
649
650 my $section_ptr = $self->_lookup_section($section);
651 return [] unless defined $section_ptr;
652
653 my @children = @{$section_ptr->{'subsection_order'}};
654
655 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
656 return \@children;
657}
658
659# returns the child section one past the last one (which
660# is coded as "0")
661sub get_end_child {
662 my $self = shift (@_);
663 my ($section) = @_;
664
665 return $section . ".0" unless $section eq "";
666 return "0";
667}
668
669# returns the next section in book order
670sub get_next_section {
671 my $self = shift (@_);
672 my ($section) = @_;
673
674 return undef unless defined $section;
675
676 my $section_ptr = $self->_lookup_section($section);
677 return undef unless defined $section_ptr;
678
679 # first try to find first child
680 if (defined $section_ptr->{'subsection_order'}->[0]) {
681 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
682 return "$section.$section_ptr->{'subsection_order'}->[0]";
683 }
684
685 do {
686 # try to find sibling
687 my $next_child = $self->get_next_child ($section);
688 return $next_child if (defined $next_child);
689
690 # move up one level
691 $section = $self->get_parent_section ($section);
692 } while $section =~ /\d/;
693
694 return undef;
695}
696
697sub is_leaf_section {
698 my $self = shift (@_);
699 my ($section) = @_;
700
701 my $section_ptr = $self->_lookup_section($section);
702 return 1 unless defined $section_ptr;
703
704 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
705}
706
707# methods for dealing with sections
708
709# returns the name of the inserted section
710sub insert_section {
711 my $self = shift (@_);
712 my ($before_section) = @_;
713
714 # get the child to insert before and its parent section
715 my $parent_section = "";
716 my $before_child = "0";
717 my @before_section = split (/\./, $before_section);
718 if (scalar(@before_section) > 0) {
719 $before_child = pop (@before_section);
720 $parent_section = join (".", @before_section);
721 }
722
723 my $parent_section_ptr = $self->_lookup_section($parent_section);
724 if (!defined $parent_section_ptr) {
725 print STDERR "doc::insert_section couldn't find parent section " .
726 "$parent_section\n";
727 return;
728 }
729
730 # get the next section number
731 my $section_num = $parent_section_ptr->{'next_subsection'}++;
732
733 my $i = 0;
734 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
735 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
736 $i++;
737 }
738
739 # insert the section number into the order list
740 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
741
742 # add this section to the parent section
743 my $section_ptr = {'subsection_order'=>[],
744 'next_subsection'=>1,
745 'subsections'=>{},
746 'metadata'=>[],
747 'text'=>""};
748 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
749
750 # work out the full section number
751 my $section = $parent_section;
752 $section .= "." unless $section eq "";
753 $section .= $section_num;
754
755 return $section;
756}
757
758# creates a pre-named section
759sub create_named_section {
760 my $self = shift (@_);
761 my ($mastersection) = @_;
762
763 my ($num);
764 my $section = $mastersection;
765 my $sectionref = $self;
766
767 while ($section ne "") {
768 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
769 $num =~ s/^0+(\d)/$1/; # remove leading 0s
770 $section = "" unless defined $section;
771
772 if (defined $num) {
773 if (!defined $sectionref->{'subsections'}->{$num}) {
774 push (@{$sectionref->{'subsection_order'}}, $num);
775 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
776 'next_subsection'=>1,
777 'subsections'=>{},
778 'metadata'=>[],
779 'text'=>""};
780 if ($num >= $sectionref->{'next_subsection'}) {
781 $sectionref->{'next_subsection'} = $num + 1;
782 }
783 }
784 $sectionref = $sectionref->{'subsections'}->{$num};
785
786 } else {
787 print STDERR "doc::create_named_section couldn't create section ";
788 print STDERR "$mastersection\n";
789 last;
790 }
791 }
792}
793
794# returns a reference to a list of subsections
795sub list_subsections {
796 my $self = shift (@_);
797 my ($section) = @_;
798
799 my $section_ptr = $self->_lookup_section ($section);
800 if (!defined $section_ptr) {
801 print STDERR "doc::list_subsections couldn't find section $section\n";
802 return [];
803 }
804
805 return [@{$section_ptr->{'subsection_order'}}];
806}
807
808sub delete_section {
809 my $self = shift (@_);
810 my ($section) = @_;
811
812# my $section_ptr = {'subsection_order'=>[],
813# 'next_subsection'=>1,
814# 'subsections'=>{},
815# 'metadata'=>[],
816# 'text'=>""};
817
818 # if this is the top section reset everything
819 if ($section eq "") {
820 $self->{'subsection_order'} = [];
821 $self->{'subsections'} = {};
822 $self->{'metadata'} = [];
823 $self->{'text'} = "";
824 return;
825 }
826
827 # find the parent of the section to delete
828 my $parent_section = "";
829 my $child = "0";
830 my @section = split (/\./, $section);
831 if (scalar(@section) > 0) {
832 $child = pop (@section);
833 $parent_section = join (".", @section);
834 }
835
836 my $parent_section_ptr = $self->_lookup_section($parent_section);
837 if (!defined $parent_section_ptr) {
838 print STDERR "doc::delete_section couldn't find parent section " .
839 "$parent_section\n";
840 return;
841 }
842
843 # remove this section from the subsection_order list
844 my $i = 0;
845 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
846 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
847 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
848 last;
849 }
850 $i++;
851 }
852
853 # remove this section from the subsection hash
854 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
855 undef $parent_section_ptr->{'subsections'}->{$child};
856 }
857}
858
859#--
860# methods for dealing with metadata
861
862# set_metadata_element and get_metadata_element are for metadata
863# which should only have one value. add_meta_data and get_metadata
864# are for metadata which can have more than one value.
865
866# returns the first metadata value which matches field
867
868# This version of get metadata element works much like the one above,
869# except it allows for the namespace portion of a metadata element to
870# be ignored, thus if you are searching for dc.Title, the first piece
871# of matching metadata ending with the name Title (once any namespace
872# is removed) would be returned.
873# 28-11-2003 John Thompson
874sub get_metadata_element {
875 my $self = shift (@_);
876 my ($section, $field, $ignore_namespace) = @_;
877 my ($data);
878
879 $ignore_namespace = 0 unless defined $ignore_namespace;
880
881 my $section_ptr = $self->_lookup_section($section);
882 if (!defined $section_ptr) {
883 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
884 return;
885 }
886
887 # Remove any namespace if we are being told to ignore them
888 if($ignore_namespace) {
889 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
890 }
891
892 foreach $data (@{$section_ptr->{'metadata'}}) {
893
894 my $data_name = $data->[0];
895
896 # Remove any namespace if we are being told to ignore them
897 if($ignore_namespace) {
898 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
899 }
900 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
901 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
902
903 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
904 }
905
906 return undef; # was not found
907}
908
909# returns a list of the form [value1, value2, ...]
910sub get_metadata {
911 my $self = shift (@_);
912 my ($section, $field, $ignore_namespace) = @_;
913 my ($data);
914
915 $ignore_namespace = 0 unless defined $ignore_namespace;
916
917 my $section_ptr = $self->_lookup_section($section);
918 if (!defined $section_ptr) {
919 print STDERR "doc::get_metadata couldn't find section ",
920 $section, "\n";
921 return;
922 }
923
924 # Remove any namespace if we are being told to ignore them
925 if($ignore_namespace) {
926 $field =~ s/^.*\.//;
927 }
928
929 my @metadata = ();
930 foreach $data (@{$section_ptr->{'metadata'}}) {
931
932 my $data_name = $data->[0];
933
934 # Remove any namespace if we are being told to ignore them
935 if($ignore_namespace) {
936 $data_name =~ s/^.*\.//;
937 }
938 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
939 $data_name =~ s/^ex\.([^.]+)$/$1/;
940
941 push (@metadata, $data->[1]) if ($data_name eq $field);
942 }
943
944 return \@metadata;
945}
946
947sub get_metadata_hashmap {
948 my $self = shift (@_);
949 my ($section, $opt_namespace) = @_;
950
951 my $section_ptr = $self->_lookup_section($section);
952 if (!defined $section_ptr) {
953 print STDERR "doc::get_metadata couldn't find section ",
954 $section, "\n";
955 return;
956 }
957
958 my $metadata_hashmap = {};
959 foreach my $data (@{$section_ptr->{'metadata'}}) {
960 my $metaname = $data->[0];
961
962 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
963 if (!defined $metadata_hashmap->{$metaname}) {
964 $metadata_hashmap->{$metaname} = [];
965 }
966 my $metaval_list = $metadata_hashmap->{$metaname};
967 push(@$metaval_list, $data->[1]);
968 }
969 }
970
971 return $metadata_hashmap;
972}
973
974# returns a list of the form [[field,value],[field,value],...]
975sub get_all_metadata {
976 my $self = shift (@_);
977 my ($section) = @_;
978
979 my $section_ptr = $self->_lookup_section($section);
980 if (!defined $section_ptr) {
981 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
982 return;
983 }
984
985 return $section_ptr->{'metadata'};
986}
987
988# $value is optional
989sub delete_metadata {
990 my $self = shift (@_);
991 my ($section, $field, $value) = @_;
992
993 my $section_ptr = $self->_lookup_section($section);
994 if (!defined $section_ptr) {
995 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
996 return;
997 }
998
999 my $i = 0;
1000 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1001 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1002 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1003 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1004 } else {
1005 $i++;
1006 }
1007 }
1008}
1009
1010sub delete_all_metadata {
1011 my $self = shift (@_);
1012 my ($section) = @_;
1013
1014 my $section_ptr = $self->_lookup_section($section);
1015 if (!defined $section_ptr) {
1016 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1017 return;
1018 }
1019
1020 $section_ptr->{'metadata'} = [];
1021}
1022
1023sub set_metadata_element {
1024 my $self = shift (@_);
1025 my ($section, $field, $value) = @_;
1026
1027 $self->set_utf8_metadata_element ($section, $field,
1028 &unicode::ascii2utf8(\$value));
1029}
1030
1031# set_utf8_metadata_element assumes the text has already been
1032# converted to the UTF-8 encoding.
1033sub set_utf8_metadata_element {
1034 my $self = shift (@_);
1035 my ($section, $field, $value) = @_;
1036
1037 $self->delete_metadata ($section, $field);
1038 $self->add_utf8_metadata ($section, $field, $value);
1039}
1040
1041
1042# add_metadata assumes the text is in (extended) ascii form. For
1043# text which has already been converted to the UTF-8 format use
1044# add_utf8_metadata.
1045sub add_metadata {
1046 my $self = shift (@_);
1047 my ($section, $field, $value) = @_;
1048
1049 $self->add_utf8_metadata ($section, $field,
1050 &unicode::ascii2utf8(\$value));
1051}
1052
1053sub add_utf8_metadata {
1054 my $self = shift (@_);
1055 my ($section, $field, $value) = @_;
1056
1057 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1058 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1059 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1060
1061 my $section_ptr = $self->_lookup_section($section);
1062 if (!defined $section_ptr) {
1063 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1064 return;
1065 }
1066 if (!defined $value) {
1067 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1068 return;
1069 }
1070 if (!defined $field) {
1071 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1072 return;
1073 }
1074
1075 #print STDERR "###$field=$value\n";
1076
1077 # For now, supress this check. Given that text data read in is now
1078 # Unicode aware, then the following block of code can (ironically enough)
1079 # cause our unicode compliant string to be re-encoded (leading to
1080 # a double-encoded UTF-8 string, which we definitely don't want!).
1081
1082
1083 # double check that the value is utf-8
1084 # if (!&unicode::check_is_utf8($value)) {
1085 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1086 # &unicode::ensure_utf8(\$value);
1087 # print STDERR " Tried converting to utf8: $value\n";
1088 # }
1089
1090 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1091 if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1092 {
1093 my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1094 if (defined $mdprefix) {
1095 # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1096 push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1097 }
1098
1099 my $direction;
1100 if($value =~ m/^-/)
1101 {
1102 $direction = ($field eq "Latitude") ? "S" : "W";
1103 }
1104 else
1105 {
1106 $direction = ($field eq "Latitude") ? "N" : "E";
1107 }
1108
1109 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1110 if(defined $beforeDec && defined $afterDec)
1111 {
1112 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1113 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1114
1115 for(my $i = 2; $i <= 4; $i++)
1116 {
1117 if(length($afterDec) >= $i)
1118 {
1119 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1120 }
1121 }
1122
1123 #Only add the metadata if it has not already been added
1124 my $metaMap = $self->get_metadata_hashmap($section);
1125 }
1126 }
1127
1128 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1129}
1130
1131
1132# methods for dealing with text
1133
1134# returns the text for a section
1135sub get_text {
1136 my $self = shift (@_);
1137 my ($section) = @_;
1138
1139 my $section_ptr = $self->_lookup_section($section);
1140 if (!defined $section_ptr) {
1141 print STDERR "doc::get_text couldn't find section " .
1142 "$section\n";
1143 return "";
1144 }
1145
1146 return $section_ptr->{'text'};
1147}
1148
1149# returns the (utf-8 encoded) length of the text for a section
1150sub get_text_length {
1151 my $self = shift (@_);
1152 my ($section) = @_;
1153
1154 my $section_ptr = $self->_lookup_section($section);
1155 if (!defined $section_ptr) {
1156 print STDERR "doc::get_text_length couldn't find section " .
1157 "$section\n";
1158 return 0;
1159 }
1160
1161 return length ($section_ptr->{'text'});
1162}
1163
1164# returns the total length for all the sections
1165sub get_total_text_length {
1166 my $self = shift (@_);
1167
1168 my $section = $self->get_top_section();
1169 my $length = 0;
1170 while (defined $section) {
1171 $length += $self->get_text_length($section);
1172 $section = $self->get_next_section($section);
1173 }
1174 return $length;
1175}
1176
1177sub delete_text {
1178 my $self = shift (@_);
1179 my ($section) = @_;
1180
1181 my $section_ptr = $self->_lookup_section($section);
1182 if (!defined $section_ptr) {
1183 print STDERR "doc::delete_text couldn't find section " .
1184 "$section\n";
1185 return;
1186 }
1187
1188 $section_ptr->{'text'} = "";
1189}
1190
1191# add_text assumes the text is in (extended) ascii form. For
1192# text which has been already converted to the UTF-8 format
1193# use add_utf8_text.
1194sub add_text {
1195 my $self = shift (@_);
1196 my ($section, $text) = @_;
1197
1198 # convert the text to UTF-8 encoded unicode characters
1199 # and add the text
1200 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1201}
1202
1203
1204# add_utf8_text assumes the text to be added has already
1205# been converted to the UTF-8 encoding. For ascii text use
1206# add_text
1207sub add_utf8_text {
1208 my $self = shift (@_);
1209 my ($section, $text) = @_;
1210
1211 my $section_ptr = $self->_lookup_section($section);
1212 if (!defined $section_ptr) {
1213 print STDERR "doc::add_utf8_text couldn't find section " .
1214 "$section\n";
1215 return;
1216 }
1217
1218 $section_ptr->{'text'} .= $text;
1219}
1220
1221# returns the Source meta, which is the utf8 filename generated.
1222# Added a separate method here for convenience
1223sub get_source {
1224 my $self = shift (@_);
1225 return $self->get_metadata_element ($self->get_top_section(), "Source");
1226}
1227
1228# returns the SourceFile meta, which is the url reference to the URL-encoded
1229# version of Source (the utf8 filename). Added a separate method here for convenience
1230sub get_sourcefile {
1231 my $self = shift (@_);
1232 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1233}
1234
1235# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1236# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1237sub get_assocfile_from_sourcefile {
1238 my $self = shift (@_);
1239
1240 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1241 my $top_section = $self->get_top_section();
1242 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1243
1244 # get the actual filename as it exists on the filesystem which this url refers to
1245 $source_file = &unicode::url_to_filename($source_file);
1246 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1247 return $assocfilename;
1248}
1249
1250# methods for dealing with associated files
1251
1252# a file is associated with a document, NOT a section.
1253# if section is defined it is noted in the data structure
1254# only so that files associated from a particular section
1255# may be removed later (using delete_section_assoc_files)
1256sub associate_file {
1257 my $self = shift (@_);
1258 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1259 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1260
1261 # remove all associated files with the same name
1262 $self->delete_assoc_file ($assoc_filename);
1263
1264 # Too harsh a requirement
1265 # Definitely get HTML docs, for example, with some missing
1266 # support files
1267# if (!&util::fd_exists($real_filename)) {
1268# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1269# exit -1;
1270# }
1271
1272# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1273# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1274## my $utf8_filename = Encode::encode("utf8",$filename);
1275
1276 push (@{$self->{'associated_files'}},
1277 [$real_filename, $assoc_filename, $mime_type, $section]);
1278}
1279
1280# returns a list of associated files in the form
1281# [[real_filename, assoc_filename, mimetype], ...]
1282sub get_assoc_files {
1283 my $self = shift (@_);
1284
1285 return $self->{'associated_files'};
1286}
1287
1288# the following two methods used to keep track of original associated files
1289# for incremental building. eg a txt file used by an item file does not end
1290# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1291# file for incremental build
1292sub associate_source_file {
1293 my $self = shift (@_);
1294 my ($full_filename) = @_;
1295
1296 push (@{$self->{'source_assoc_files'}}, $full_filename);
1297
1298}
1299
1300sub get_source_assoc_files {
1301 my $self = shift (@_);
1302
1303 return $self->{'source_assoc_files'};
1304
1305
1306}
1307sub metadata_file {
1308 my $self = shift (@_);
1309 my ($real_filename, $filename) = @_;
1310
1311 push (@{$self->{'metadata_files'}},
1312 [$real_filename, $filename]);
1313}
1314
1315# used for writing out the archiveinf-doc info database, to list all the metadata files
1316sub get_meta_files {
1317 my $self = shift (@_);
1318
1319 return $self->{'metadata_files'};
1320}
1321
1322sub delete_section_assoc_files {
1323 my $self = shift (@_);
1324 my ($section) = @_;
1325
1326 my $i=0;
1327 while ($i < scalar (@{$self->{'associated_files'}})) {
1328 if (defined $self->{'associated_files'}->[$i]->[3] &&
1329 $self->{'associated_files'}->[$i]->[3] eq $section) {
1330 splice (@{$self->{'associated_files'}}, $i, 1);
1331 } else {
1332 $i++;
1333 }
1334 }
1335}
1336
1337sub delete_assoc_file {
1338 my $self = shift (@_);
1339 my ($assoc_filename) = @_;
1340
1341 my $i=0;
1342 while ($i < scalar (@{$self->{'associated_files'}})) {
1343 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1344 splice (@{$self->{'associated_files'}}, $i, 1);
1345 } else {
1346 $i++;
1347 }
1348 }
1349}
1350
1351sub reset_nextsection_ptr {
1352 my $self = shift (@_);
1353 my ($section) = @_;
1354
1355 my $section_ptr = $self->_lookup_section($section);
1356 $section_ptr->{'next_subsection'} = 1;
1357}
1358
13591;
Note: See TracBrowser for help on using the repository browser.