source: main/trunk/greenstone2/perllib/doc.pm@ 23939

Last change on this file since 23939 was 23939, checked in by ak19, 12 years ago

GS3's OAIserver passes final official oaiserver validation tests: to do with earliestDatestamp. 1. Perl code (inexport, basebuilder, colcfg, buildconfigxml.pm perl files) write out the earliestDatestamp into GS3's buildconfig.xml. Whenever a full-build is performed, the archives directory is recreated. At this stage, inexport creates a new file in archives called earliestDatestamp containing the current time. Whenever an incremental build is performed, this file already exists in archive, so it is left untouched, preserving the time of the full-build (which is the earliestDatestamp). The other perl files are concerned with obtaining this value from the archives directory and writing it out to the build config file. 2. doc.pm and BasePlugout.pm write out the current date and time for each document processed under the new fields oailastmodified and oailastmodifieddate. Changes made in this commit are related to GS3 java src code changes that work in tandem.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 34.2 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47our $OIDcount = 0;
48
49# rename_method can be 'url', 'none', 'base64'
50sub new {
51 my $class = shift (@_);
52 my ($source_filename, $doc_type, $rename_method) = @_;
53
54
55 my $self = bless {'associated_files'=>[],
56 'subsection_order'=>[],
57 'next_subsection'=>1,
58 'subsections'=>{},
59 'metadata'=>[],
60 'text'=>"",
61 'OIDtype'=>"hash"}, $class;
62
63 # used to set lastmodified here, but this can screw up the HASH ids, so
64 # the docsave processor now calls set_lastmodified
65
66 $self->set_source_path($source_filename);
67
68 if (defined $source_filename) {
69 $source_filename = &util::filename_within_collection($source_filename);
70 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
71 $self->set_source_filename ($source_filename, $rename_method);
72 }
73
74 $self->set_doc_type ($doc_type) if defined $doc_type;
75
76 return $self;
77}
78
79
80sub set_source_path
81{
82 my $self = shift @_;
83 my ($source_filename) = @_;
84
85 if (defined $source_filename) {
86 # On Windows the source_filename can be in terse DOS format
87 # e.g. test~1.txt
88
89 $self->{'terse_source_path'} = $source_filename;
90
91 if (-e $source_filename) {
92 # See if we can do better for Windows with a filename
93 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
94 require Win32;
95 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
96 }
97 else {
98 # For Unix-based systems, there is no difference between the two
99 $self->{'source_path'} = $source_filename;
100 }
101 }
102 else {
103 print STDERR "Warning: In doc::set_source_path(), file\n";
104 print STDERR " $source_filename\n";
105 print STDERR " does not exist\n";
106
107 # (default) Set it to whatever we were given
108 $self->{'source_path'} = $source_filename;
109 }
110 }
111 else {
112 # Previous code for setting source_path allowed for
113 # it to be undefined, so continue this practice
114 $self->{'terse_source_path'} = undef;
115 $self->{'source_path'} = undef;
116 }
117}
118
119
120sub get_source_path
121{
122 my $self = shift @_;
123
124 return $self->{'terse_source_path'};
125}
126
127# set lastmodified for OAI purposes, added by GRB, moved by kjdon
128sub set_oailastmodified {
129 my $self = shift (@_);
130
131 my $source_path = $self->{'terse_source_path'};
132
133 if (defined $source_path && (-e $source_path)) {
134 my $current_time = time;
135
136 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
137 $wday, $yday, $isdst) = localtime($current_time);
138
139 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
140
141 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
142 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
143 }
144}
145
146# no longer used for OAI purposes, since lastmodified is not what we want as the
147# Datestamp of a document. This doc metadata may be useful for general purposes.
148sub set_lastmodified {
149 my $self = shift (@_);
150
151 my $source_path = $self->{'terse_source_path'};
152
153 if (defined $source_path && (-e $source_path)) {
154
155 my $file_stat = stat($source_path);
156 my $mtime = $file_stat->mtime;
157 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
158 $wday, $yday, $isdst) = localtime($mtime);
159
160 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
161
162 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
163 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
164 }
165}
166
167# clone the $self object
168sub duplicate {
169 my $self = shift (@_);
170
171 my $newobj = {};
172
173 foreach my $k (keys %$self) {
174 $newobj->{$k} = &clone ($self->{$k});
175 }
176
177 bless $newobj, ref($self);
178 return $newobj;
179}
180
181sub clone {
182 my ($from) = @_;
183 my $type = ref ($from);
184
185 if ($type eq "HASH") {
186 my $to = {};
187 foreach my $key (keys %$from) {
188 $to->{$key} = &clone ($from->{$key});
189 }
190 return $to;
191 } elsif ($type eq "ARRAY") {
192 my $to = [];
193 foreach my $v (@$from) {
194 push (@$to, &clone ($v));
195 }
196 return $to;
197 } else {
198 return $from;
199 }
200}
201
202sub set_OIDtype {
203 my $self = shift (@_);
204 my ($type, $metadata) = @_;
205
206 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|dirname|assigned)$/) {
207 $self->{'OIDtype'} = $type;
208 } else {
209 $self->{'OIDtype'} = "hash";
210 }
211
212 if ($type =~ /^assigned$/) {
213 if (defined $metadata) {
214 $self->{'OIDmetadata'} = $metadata;
215 } else {
216 $self->{'OIDmetadata'} = "dc.Identifier";
217 }
218 }
219}
220
221# rename_method can be 'url', 'none', 'base64'
222sub set_source_filename {
223 my $self = shift (@_);
224 my ($source_filename, $rename_method) = @_;
225
226 # Since the gsdlsourcefilename element goes into the doc.xml it has
227 # to be utf8. However, it should also *represent* the source filename
228 # (in the import directory) which may not be utf8 at all.
229 # For instance, if this meta element (gsdlsourcefilename) will be used
230 # by other applications that parse doc.xml in order to locate
231 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
232 # encode the real filename as this is a binary-to-text encoding meaning
233 # that the resulting string is ASCII (utf8). Decoding will give the original.
234
235# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
236
237 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
238 $source_filename = $self->encode_filename($source_filename, $rename_method);
239# my ($srcfilename,$dirname,$suffix)
240# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
241# print STDERR "-> $srcfilename -> ";
242# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
243# $source_filename = &util::filename_cat($dirname, $srcfilename);
244# print STDERR "$source_filename\n";
245
246 $self->set_utf8_metadata_element ($self->get_top_section(),
247 "gsdlsourcefilename",
248 $source_filename);
249}
250
251sub encode_filename {
252 my $self = shift (@_);
253 my ($source_filename, $rename_method) = @_;
254
255 my ($srcfilename,$dirname,$suffix)
256 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
257# print STDERR "-> $srcfilename -> ";
258 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
259 $source_filename = &util::filename_cat($dirname, $srcfilename);
260
261 return $source_filename;
262}
263
264sub set_converted_filename {
265 my $self = shift (@_);
266 my ($converted_filename) = @_;
267
268 # we know the converted filename is utf8
269 $self->set_utf8_metadata_element ($self->get_top_section(),
270 "gsdlconvertedfilename",
271 $converted_filename);
272}
273
274# returns the source_filename as it was provided
275sub get_unmodified_source_filename {
276 my $self = shift (@_);
277
278 return $self->{'terse_source_path'};
279}
280
281# returns the source_filename with whatever rename_method was given
282sub get_source_filename {
283 my $self = shift (@_);
284
285 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
286}
287
288
289
290# returns converted filename if available else returns source filename
291sub get_filename_for_hashing {
292 my $self = shift (@_);
293
294 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
295
296 if (!defined $filename) {
297 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
298 # if NULPlug processed file, then don't give a filename
299 if (defined $plugin_name && $plugin_name eq "NULPlug") {
300 $filename = undef;
301 } else { # returns the URL encoded source filename!
302 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
303 }
304 }
305
306 if (!&util::filename_is_absolute($filename)) {
307 $filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename);
308 }
309
310 return $filename;
311}
312
313sub set_doc_type {
314 my $self = shift (@_);
315 my ($doc_type) = @_;
316
317 $self->set_metadata_element ($self->get_top_section(),
318 "gsdldoctype",
319 $doc_type);
320}
321
322# returns the gsdldoctype as it was provided
323# the default of "indexed_doc" is used if no document
324# type was provided
325sub get_doc_type {
326 my $self = shift (@_);
327
328 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
329 return $doc_type if (defined $doc_type);
330 return "indexed_doc";
331}
332
333
334# look up the reference to the a particular section
335sub _lookup_section {
336 my $self = shift (@_);
337 my ($section) = @_;
338
339 my ($num);
340 my $sectionref = $self;
341
342 while (defined $section && $section ne "") {
343
344 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
345
346 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
347
348 $section = "" unless defined $section;
349
350
351 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
352 $sectionref = $sectionref->{'subsections'}->{$num};
353 } else {
354 return undef;
355 }
356 }
357
358 return $sectionref;
359}
360
361# calculate OID by hashing the contents of the document
362sub _calc_OID {
363 my $self = shift (@_);
364 my ($filename) = @_;
365
366
367 my $osexe = &util::get_os_exe();
368
369 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
370 $ENV{'GSDLOS'},"hashfile$osexe");
371
372 my $result = "NULL";
373
374
375 if (-e "$hashfile_exe") {
376# $result = `\"$hashfile_exe\" \"$filename\"`;
377# $result = `hashfile$osexe \"$filename\" 2>&1`;
378 $result = `hashfile$osexe \"$filename\"`;
379
380 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
381 } else {
382 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
383 }
384 return "HASH$result";
385}
386
387# methods dealing with OID, not groups of them.
388
389# if $OID is not provided one is calculated
390sub set_OID {
391 my $self = shift (@_);
392 my ($OID) = @_;
393
394 my $use_hash_oid = 0;
395 # if an OID wasn't provided calculate one
396 if (!defined $OID) {
397 $OID = "NULL";
398 if ($self->{'OIDtype'} =~ /^hash/) {
399 $use_hash_oid = 1;
400 } elsif ($self->{'OIDtype'} eq "incremental") {
401 $OID = "D" . $OIDcount;
402 $OIDcount ++;
403
404 } elsif ($self->{'OIDtype'} eq "dirname") {
405 $OID = 'J';
406 my $filename = $self->get_source_filename();
407 if (defined($filename)) { # && -e $filename) {
408 $OID = &File::Basename::dirname($filename);
409 if (defined $OID) {
410 $OID = 'J'.&File::Basename::basename($OID);
411 $OID = &util::tidy_up_oid($OID);
412 } else {
413 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
414 $use_hash_oid = 1;
415 }
416 } else {
417 print STDERR "Failed to find filename, generating hash id\n";
418 $use_hash_oid = 1;
419 }
420
421 } elsif ($self->{'OIDtype'} eq "assigned") {
422 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
423 if (defined $identifier && $identifier ne "") {
424 $OID = $identifier;
425 $OID = &util::tidy_up_oid($OID);
426 } else {
427 # need a hash id
428 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
429 $use_hash_oid = 1;
430 }
431
432 } else {
433 $use_hash_oid = 1;
434 }
435
436 if ($use_hash_oid) {
437 my $hash_on_file = 1;
438 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
439 $hash_on_file = 0;
440 }
441 if ($hash_on_file) {
442 # "hash" OID - feed file to hashfile.exe
443 my $filename = $self->get_filename_for_hashing();
444
445 # -z: don't want to hash on the file if it is zero size
446 if (defined($filename) && -e $filename && !-z $filename) {
447 $OID = $self->_calc_OID ($filename);
448 } else {
449 $hash_on_file = 0;
450 }
451 }
452 if (!$hash_on_file) {
453 my $filename = &util::get_tmp_filename();
454 if (!open (OUTFILE, ">:utf8", $filename)) {
455 print STDERR "doc::set_OID could not write to $filename\n";
456 } else {
457 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
458 print OUTFILE $doc_text;
459 close (OUTFILE);
460 }
461 $OID = $self->_calc_OID ($filename);
462 &util::rm ($filename);
463 }
464 }
465 }
466 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
467}
468
469# this uses hashdoc (embedded c thingy) which is faster but still
470# needs a little work to be suffiently stable
471sub ___set_OID {
472 my $self = shift (@_);
473 my ($OID) = @_;
474
475 # if an OID wasn't provided then calculate hash value based on document
476 if (!defined $OID)
477 {
478 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
479 my $hash_len = length($hash_text);
480
481 $OID = &hashdoc::buffer($hash_text,$hash_len);
482 }
483
484 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
485}
486
487# returns the OID for this document
488sub get_OID {
489 my $self = shift (@_);
490 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
491 return $OID if (defined $OID);
492 return "NULL";
493}
494
495sub delete_OID {
496 my $self = shift (@_);
497
498 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
499}
500
501
502# methods for manipulating section names
503
504# returns the name of the top-most section (the top
505# level of the document
506sub get_top_section {
507 my $self = shift (@_);
508
509 return "";
510}
511
512# returns a section
513sub get_parent_section {
514 my $self = shift (@_);
515 my ($section) = @_;
516
517 $section =~ s/(^|\.)\d+$//;
518
519 return $section;
520}
521
522# returns the first child section (or the end child
523# if there isn't any)
524sub get_begin_child {
525 my $self = shift (@_);
526 my ($section) = @_;
527
528 my $section_ptr = $self->_lookup_section($section);
529 return "" unless defined $section_ptr;
530
531 if (defined $section_ptr->{'subsection_order'}->[0]) {
532 return "$section.$section_ptr->{'subsection_order'}->[0]";
533 }
534
535 return $self->get_end_child ($section);
536}
537
538# returns the next child of a parent section
539sub get_next_child {
540 my $self = shift (@_);
541 my ($section) = @_;
542
543 my $parent_section = $self->get_parent_section($section);
544 my $parent_section_ptr = $self->_lookup_section($parent_section);
545 return undef unless defined $parent_section_ptr;
546
547 my ($section_num) = $section =~ /(\d+)$/;
548 return undef unless defined $section_num;
549
550 my $i = 0;
551 my $section_order = $parent_section_ptr->{'subsection_order'};
552 while ($i < scalar(@$section_order)) {
553 last if $section_order->[$i] eq $section_num;
554 $i++;
555 }
556
557 $i++; # the next child
558 if ($i < scalar(@$section_order)) {
559 return $section_order->[$i] if $parent_section eq "";
560 return "$parent_section.$section_order->[$i]";
561 }
562
563 # no more sections in this level
564 return undef;
565}
566
567# returns a reference to a list of children
568sub get_children {
569 my $self = shift (@_);
570 my ($section) = @_;
571
572 my $section_ptr = $self->_lookup_section($section);
573 return [] unless defined $section_ptr;
574
575 my @children = @{$section_ptr->{'subsection_order'}};
576
577 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
578 return \@children;
579}
580
581# returns the child section one past the last one (which
582# is coded as "0")
583sub get_end_child {
584 my $self = shift (@_);
585 my ($section) = @_;
586
587 return $section . ".0" unless $section eq "";
588 return "0";
589}
590
591# returns the next section in book order
592sub get_next_section {
593 my $self = shift (@_);
594 my ($section) = @_;
595
596 return undef unless defined $section;
597
598 my $section_ptr = $self->_lookup_section($section);
599 return undef unless defined $section_ptr;
600
601 # first try to find first child
602 if (defined $section_ptr->{'subsection_order'}->[0]) {
603 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
604 return "$section.$section_ptr->{'subsection_order'}->[0]";
605 }
606
607 do {
608 # try to find sibling
609 my $next_child = $self->get_next_child ($section);
610 return $next_child if (defined $next_child);
611
612 # move up one level
613 $section = $self->get_parent_section ($section);
614 } while $section =~ /\d/;
615
616 return undef;
617}
618
619sub is_leaf_section {
620 my $self = shift (@_);
621 my ($section) = @_;
622
623 my $section_ptr = $self->_lookup_section($section);
624 return 1 unless defined $section_ptr;
625
626 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
627}
628
629# methods for dealing with sections
630
631# returns the name of the inserted section
632sub insert_section {
633 my $self = shift (@_);
634 my ($before_section) = @_;
635
636 # get the child to insert before and its parent section
637 my $parent_section = "";
638 my $before_child = "0";
639 my @before_section = split (/\./, $before_section);
640 if (scalar(@before_section) > 0) {
641 $before_child = pop (@before_section);
642 $parent_section = join (".", @before_section);
643 }
644
645 my $parent_section_ptr = $self->_lookup_section($parent_section);
646 if (!defined $parent_section_ptr) {
647 print STDERR "doc::insert_section couldn't find parent section " .
648 "$parent_section\n";
649 return;
650 }
651
652 # get the next section number
653 my $section_num = $parent_section_ptr->{'next_subsection'}++;
654
655 my $i = 0;
656 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
657 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
658 $i++;
659 }
660
661 # insert the section number into the order list
662 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
663
664 # add this section to the parent section
665 my $section_ptr = {'subsection_order'=>[],
666 'next_subsection'=>1,
667 'subsections'=>{},
668 'metadata'=>[],
669 'text'=>""};
670 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
671
672 # work out the full section number
673 my $section = $parent_section;
674 $section .= "." unless $section eq "";
675 $section .= $section_num;
676
677 return $section;
678}
679
680# creates a pre-named section
681sub create_named_section {
682 my $self = shift (@_);
683 my ($mastersection) = @_;
684
685 my ($num);
686 my $section = $mastersection;
687 my $sectionref = $self;
688
689 while ($section ne "") {
690 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
691 $num =~ s/^0+(\d)/$1/; # remove leading 0s
692 $section = "" unless defined $section;
693
694 if (defined $num) {
695 if (!defined $sectionref->{'subsections'}->{$num}) {
696 push (@{$sectionref->{'subsection_order'}}, $num);
697 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
698 'next_subsection'=>1,
699 'subsections'=>{},
700 'metadata'=>[],
701 'text'=>""};
702 if ($num >= $sectionref->{'next_subsection'}) {
703 $sectionref->{'next_subsection'} = $num + 1;
704 }
705 }
706 $sectionref = $sectionref->{'subsections'}->{$num};
707
708 } else {
709 print STDERR "doc::create_named_section couldn't create section ";
710 print STDERR "$mastersection\n";
711 last;
712 }
713 }
714}
715
716# returns a reference to a list of subsections
717sub list_subsections {
718 my $self = shift (@_);
719 my ($section) = @_;
720
721 my $section_ptr = $self->_lookup_section ($section);
722 if (!defined $section_ptr) {
723 print STDERR "doc::list_subsections couldn't find section $section\n";
724 return [];
725 }
726
727 return [@{$section_ptr->{'subsection_order'}}];
728}
729
730sub delete_section {
731 my $self = shift (@_);
732 my ($section) = @_;
733
734# my $section_ptr = {'subsection_order'=>[],
735# 'next_subsection'=>1,
736# 'subsections'=>{},
737# 'metadata'=>[],
738# 'text'=>""};
739
740 # if this is the top section reset everything
741 if ($section eq "") {
742 $self->{'subsection_order'} = [];
743 $self->{'subsections'} = {};
744 $self->{'metadata'} = [];
745 $self->{'text'} = "";
746 return;
747 }
748
749 # find the parent of the section to delete
750 my $parent_section = "";
751 my $child = "0";
752 my @section = split (/\./, $section);
753 if (scalar(@section) > 0) {
754 $child = pop (@section);
755 $parent_section = join (".", @section);
756 }
757
758 my $parent_section_ptr = $self->_lookup_section($parent_section);
759 if (!defined $parent_section_ptr) {
760 print STDERR "doc::delete_section couldn't find parent section " .
761 "$parent_section\n";
762 return;
763 }
764
765 # remove this section from the subsection_order list
766 my $i = 0;
767 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
768 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
769 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
770 last;
771 }
772 $i++;
773 }
774
775 # remove this section from the subsection hash
776 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
777 undef $parent_section_ptr->{'subsections'}->{$child};
778 }
779}
780
781#--
782# methods for dealing with metadata
783
784# set_metadata_element and get_metadata_element are for metadata
785# which should only have one value. add_meta_data and get_metadata
786# are for metadata which can have more than one value.
787
788# returns the first metadata value which matches field
789
790# This version of get metadata element works much like the one above,
791# except it allows for the namespace portion of a metadata element to
792# be ignored, thus if you are searching for dc.Title, the first piece
793# of matching metadata ending with the name Title (once any namespace
794# is removed) would be returned.
795# 28-11-2003 John Thompson
796sub get_metadata_element {
797 my $self = shift (@_);
798 my ($section, $field, $ignore_namespace) = @_;
799 my ($data);
800
801 $ignore_namespace = 0 unless defined $ignore_namespace;
802
803 my $section_ptr = $self->_lookup_section($section);
804 if (!defined $section_ptr) {
805 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
806 return;
807 }
808
809 # Remove the namespace if we are being told to ignore them
810 if($ignore_namespace) {
811 $field =~ s/^\w*\.//;
812 }
813
814 foreach $data (@{$section_ptr->{'metadata'}}) {
815
816 my $data_name = $data->[0];
817
818 # Remove the any namespace if we are being told to ignore them
819 if($ignore_namespace) {
820 $data_name =~ s/^\w*\.//;
821 }
822 $data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
823 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
824 }
825
826 return undef; # was not found
827}
828
829# returns a list of the form [value1, value2, ...]
830sub get_metadata {
831 my $self = shift (@_);
832 my ($section, $field, $ignore_namespace) = @_;
833 my ($data);
834
835 $ignore_namespace = 0 unless defined $ignore_namespace;
836
837 my $section_ptr = $self->_lookup_section($section);
838 if (!defined $section_ptr) {
839 print STDERR "doc::get_metadata couldn't find section ",
840 $section, "\n";
841 return;
842 }
843
844 # Remove the any namespace if we are being told to ignore them
845 if($ignore_namespace) {
846 $field =~ s/^\w*\.//;
847 }
848
849 my @metadata = ();
850 foreach $data (@{$section_ptr->{'metadata'}}) {
851
852 my $data_name = $data->[0];
853 # Remove the any namespace if we are being told to ignore them
854 if($ignore_namespace) {
855 $data_name =~ s/^\w*\.//;
856 }
857 $data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
858 push (@metadata, $data->[1]) if ($data_name eq $field);
859 }
860
861 return \@metadata;
862}
863
864sub get_metadata_hashmap {
865 my $self = shift (@_);
866 my ($section, $opt_namespace) = @_;
867
868 my $section_ptr = $self->_lookup_section($section);
869 if (!defined $section_ptr) {
870 print STDERR "doc::get_metadata couldn't find section ",
871 $section, "\n";
872 return;
873 }
874
875 my $metadata_hashmap = {};
876 foreach my $data (@{$section_ptr->{'metadata'}}) {
877 my $metaname = $data->[0];
878
879 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
880 if (!defined $metadata_hashmap->{$metaname}) {
881 $metadata_hashmap->{$metaname} = [];
882 }
883 my $metaval_list = $metadata_hashmap->{$metaname};
884 push(@$metaval_list, $data->[1]);
885 }
886 }
887
888 return $metadata_hashmap;
889}
890
891# returns a list of the form [[field,value],[field,value],...]
892sub get_all_metadata {
893 my $self = shift (@_);
894 my ($section) = @_;
895
896 my $section_ptr = $self->_lookup_section($section);
897 if (!defined $section_ptr) {
898 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
899 return;
900 }
901
902 return $section_ptr->{'metadata'};
903}
904
905# $value is optional
906sub delete_metadata {
907 my $self = shift (@_);
908 my ($section, $field, $value) = @_;
909
910 my $section_ptr = $self->_lookup_section($section);
911 if (!defined $section_ptr) {
912 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
913 return;
914 }
915
916 my $i = 0;
917 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
918 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
919 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
920 splice (@{$section_ptr->{'metadata'}}, $i, 1);
921 } else {
922 $i++;
923 }
924 }
925}
926
927sub delete_all_metadata {
928 my $self = shift (@_);
929 my ($section) = @_;
930
931 my $section_ptr = $self->_lookup_section($section);
932 if (!defined $section_ptr) {
933 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
934 return;
935 }
936
937 $section_ptr->{'metadata'} = [];
938}
939
940sub set_metadata_element {
941 my $self = shift (@_);
942 my ($section, $field, $value) = @_;
943
944 $self->set_utf8_metadata_element ($section, $field,
945 &unicode::ascii2utf8(\$value));
946}
947
948# set_utf8_metadata_element assumes the text has already been
949# converted to the UTF-8 encoding.
950sub set_utf8_metadata_element {
951 my $self = shift (@_);
952 my ($section, $field, $value) = @_;
953
954 $self->delete_metadata ($section, $field);
955 $self->add_utf8_metadata ($section, $field, $value);
956}
957
958
959# add_metadata assumes the text is in (extended) ascii form. For
960# text which has already been converted to the UTF-8 format use
961# add_utf8_metadata.
962sub add_metadata {
963 my $self = shift (@_);
964 my ($section, $field, $value) = @_;
965
966 $self->add_utf8_metadata ($section, $field,
967 &unicode::ascii2utf8(\$value));
968}
969
970sub add_utf8_metadata {
971 my $self = shift (@_);
972 my ($section, $field, $value) = @_;
973
974# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
975# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
976# print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
977
978 my $section_ptr = $self->_lookup_section($section);
979 if (!defined $section_ptr) {
980 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
981 return;
982 }
983 if (!defined $value) {
984 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
985 return;
986 }
987 if (!defined $field) {
988 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
989 return;
990 }
991
992 #print STDERR "###$field=$value\n";
993
994 # For now, supress this check. Given that text data read in is now
995 # Unicode aware, then the following block of code can (ironically enough)
996 # cause our unicode compliant string to be re-encoded (leading to
997 # a double-encoded UTF-8 string, which we definitely don't want!).
998
999
1000 # double check that the value is utf-8
1001# if (!&unicode::check_is_utf8($value)) {
1002# print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1003# &unicode::ensure_utf8(\$value);
1004# print STDERR " Tried converting to utf8: $value\n";
1005# }
1006
1007 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1008}
1009
1010
1011# methods for dealing with text
1012
1013# returns the text for a section
1014sub get_text {
1015 my $self = shift (@_);
1016 my ($section) = @_;
1017
1018 my $section_ptr = $self->_lookup_section($section);
1019 if (!defined $section_ptr) {
1020 print STDERR "doc::get_text couldn't find section " .
1021 "$section\n";
1022 return "";
1023 }
1024
1025 return $section_ptr->{'text'};
1026}
1027
1028# returns the (utf-8 encoded) length of the text for a section
1029sub get_text_length {
1030 my $self = shift (@_);
1031 my ($section) = @_;
1032
1033 my $section_ptr = $self->_lookup_section($section);
1034 if (!defined $section_ptr) {
1035 print STDERR "doc::get_text_length couldn't find section " .
1036 "$section\n";
1037 return 0;
1038 }
1039
1040 return length ($section_ptr->{'text'});
1041}
1042
1043# returns the total length for all the sections
1044sub get_total_text_length {
1045 my $self = shift (@_);
1046
1047 my $section = $self->get_top_section();
1048 my $length = 0;
1049 while (defined $section) {
1050 $length += $self->get_text_length($section);
1051 $section = $self->get_next_section($section);
1052 }
1053 return $length;
1054}
1055
1056sub delete_text {
1057 my $self = shift (@_);
1058 my ($section) = @_;
1059
1060 my $section_ptr = $self->_lookup_section($section);
1061 if (!defined $section_ptr) {
1062 print STDERR "doc::delete_text couldn't find section " .
1063 "$section\n";
1064 return;
1065 }
1066
1067 $section_ptr->{'text'} = "";
1068}
1069
1070# add_text assumes the text is in (extended) ascii form. For
1071# text which has been already converted to the UTF-8 format
1072# use add_utf8_text.
1073sub add_text {
1074 my $self = shift (@_);
1075 my ($section, $text) = @_;
1076
1077 # convert the text to UTF-8 encoded unicode characters
1078 # and add the text
1079 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1080}
1081
1082
1083# add_utf8_text assumes the text to be added has already
1084# been converted to the UTF-8 encoding. For ascii text use
1085# add_text
1086sub add_utf8_text {
1087 my $self = shift (@_);
1088 my ($section, $text) = @_;
1089
1090 my $section_ptr = $self->_lookup_section($section);
1091 if (!defined $section_ptr) {
1092 print STDERR "doc::add_utf8_text couldn't find section " .
1093 "$section\n";
1094 return;
1095 }
1096
1097 $section_ptr->{'text'} .= $text;
1098}
1099
1100# returns the Source meta, which is the utf8 filename generated.
1101# Added a separate method here for convenience
1102sub get_source {
1103 my $self = shift (@_);
1104 return $self->get_metadata_element ($self->get_top_section(), "Source");
1105}
1106
1107# returns the SourceFile meta, which is the url reference to the URL-encoded
1108# version of Source (the utf8 filename). Added a separate method here for convenience
1109sub get_sourcefile {
1110 my $self = shift (@_);
1111 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1112}
1113
1114# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1115# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1116sub get_assocfile_from_sourcefile {
1117 my $self = shift (@_);
1118
1119 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1120 my $top_section = $self->get_top_section();
1121 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1122
1123 # get the actual filename as it exists on the filesystem which this url refers to
1124 $source_file = &unicode::url_to_filename($source_file);
1125 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1126 return $assocfilename;
1127}
1128
1129# methods for dealing with associated files
1130
1131# a file is associated with a document, NOT a section.
1132# if section is defined it is noted in the data structure
1133# only so that files associated from a particular section
1134# may be removed later (using delete_section_assoc_files)
1135sub associate_file {
1136 my $self = shift (@_);
1137 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1138 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1139
1140 # remove all associated files with the same name
1141 $self->delete_assoc_file ($assoc_filename);
1142
1143 # Too harsh a requirement
1144 # Definitely get HTML docs, for example, with some missing
1145 # support files
1146# if (!&util::fd_exists($real_filename)) {
1147# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1148# exit -1;
1149# }
1150
1151# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1152# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1153## my $utf8_filename = Encode::encode("utf8",$filename);
1154
1155 push (@{$self->{'associated_files'}},
1156 [$real_filename, $assoc_filename, $mime_type, $section]);
1157}
1158
1159# returns a list of associated files in the form
1160# [[real_filename, assoc_filename, mimetype], ...]
1161sub get_assoc_files {
1162 my $self = shift (@_);
1163
1164 return $self->{'associated_files'};
1165}
1166
1167# the following two methods used to keep track of original associated files
1168# for incremental building. eg a txt file used by an item file does not end
1169# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1170# file for incremental build
1171sub associate_source_file {
1172 my $self = shift (@_);
1173 my ($full_filename) = @_;
1174
1175 push (@{$self->{'source_assoc_files'}}, $full_filename);
1176
1177}
1178
1179sub get_source_assoc_files {
1180 my $self = shift (@_);
1181
1182 return $self->{'source_assoc_files'};
1183
1184
1185}
1186sub metadata_file {
1187 my $self = shift (@_);
1188 my ($real_filename, $filename) = @_;
1189
1190 push (@{$self->{'metadata_files'}},
1191 [$real_filename, $filename]);
1192}
1193
1194# used for writing out the archiveinf-doc info database, to list all the metadata files
1195sub get_meta_files {
1196 my $self = shift (@_);
1197
1198 return $self->{'metadata_files'};
1199}
1200
1201sub delete_section_assoc_files {
1202 my $self = shift (@_);
1203 my ($section) = @_;
1204
1205 my $i=0;
1206 while ($i < scalar (@{$self->{'associated_files'}})) {
1207 if (defined $self->{'associated_files'}->[$i]->[3] &&
1208 $self->{'associated_files'}->[$i]->[3] eq $section) {
1209 splice (@{$self->{'associated_files'}}, $i, 1);
1210 } else {
1211 $i++;
1212 }
1213 }
1214}
1215
1216sub delete_assoc_file {
1217 my $self = shift (@_);
1218 my ($assoc_filename) = @_;
1219
1220 my $i=0;
1221 while ($i < scalar (@{$self->{'associated_files'}})) {
1222 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1223 splice (@{$self->{'associated_files'}}, $i, 1);
1224 } else {
1225 $i++;
1226 }
1227 }
1228}
1229
1230sub reset_nextsection_ptr {
1231 my $self = shift (@_);
1232 my ($section) = @_;
1233
1234 my $section_ptr = $self->_lookup_section($section);
1235 $section_ptr->{'next_subsection'} = 1;
1236}
1237
12381;
Note: See TracBrowser for help on using the repository browser.