source: main/trunk/greenstone2/perllib/doc.pm@ 24219

Last change on this file since 24219 was 23939, checked in by ak19, 13 years ago

GS3's OAIserver passes final official oaiserver validation tests: to do with earliestDatestamp. 1. Perl code (inexport, basebuilder, colcfg, buildconfigxml.pm perl files) write out the earliestDatestamp into GS3's buildconfig.xml. Whenever a full-build is performed, the archives directory is recreated. At this stage, inexport creates a new file in archives called earliestDatestamp containing the current time. Whenever an incremental build is performed, this file already exists in archive, so it is left untouched, preserving the time of the full-build (which is the earliestDatestamp). The other perl files are concerned with obtaining this value from the archives directory and writing it out to the build config file. 2. doc.pm and BasePlugout.pm write out the current date and time for each document processed under the new fields oailastmodified and oailastmodifieddate. Changes made in this commit are related to GS3 java src code changes that work in tandem.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 34.2 KB
RevLine 
[537]1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
[8894]10# This program is free software; you can redistr te it and/or modify
[537]11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[1241]26# base class to hold documents
[4]27
28package doc;
[3834]29eval {require bytes};
[4]30
[832]31BEGIN {
[1241]32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
[832]34}
35
[15894]36use strict;
[1241]37use unicode;
38use util;
39use ghtml;
[8220]40use File::stat;
[1241]41##use hashdoc;
[13172]42use docprint;
[1241]43
[4]44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
[18528]47our $OIDcount = 0;
[2327]48
[20412]49# rename_method can be 'url', 'none', 'base64'
[4]50sub new {
51 my $class = shift (@_);
[18319]52 my ($source_filename, $doc_type, $rename_method) = @_;
[1374]53
[13770]54
[1241]55 my $self = bless {'associated_files'=>[],
56 'subsection_order'=>[],
57 'next_subsection'=>1,
58 'subsections'=>{},
59 'metadata'=>[],
[2327]60 'text'=>"",
61 'OIDtype'=>"hash"}, $class;
[4]62
[10217]63 # used to set lastmodified here, but this can screw up the HASH ids, so
64 # the docsave processor now calls set_lastmodified
[13770]65
[23362]66 $self->set_source_path($source_filename);
[10217]67
[7929]68 if (defined $source_filename) {
[15874]69 $source_filename = &util::filename_within_collection($source_filename);
[18508]70 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
[18319]71 $self->set_source_filename ($source_filename, $rename_method);
[7929]72 }
73
[1374]74 $self->set_doc_type ($doc_type) if defined $doc_type;
[485]75
[4]76 return $self;
77}
[23362]78
79
80sub set_source_path
81{
82 my $self = shift @_;
83 my ($source_filename) = @_;
84
85 if (defined $source_filename) {
86 # On Windows the source_filename can be in terse DOS format
87 # e.g. test~1.txt
88
89 $self->{'terse_source_path'} = $source_filename;
90
91 if (-e $source_filename) {
92 # See if we can do better for Windows with a filename
93 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
94 require Win32;
95 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
96 }
97 else {
98 # For Unix-based systems, there is no difference between the two
99 $self->{'source_path'} = $source_filename;
100 }
101 }
102 else {
103 print STDERR "Warning: In doc::set_source_path(), file\n";
104 print STDERR " $source_filename\n";
105 print STDERR " does not exist\n";
106
107 # (default) Set it to whatever we were given
108 $self->{'source_path'} = $source_filename;
109 }
110 }
111 else {
112 # Previous code for setting source_path allowed for
113 # it to be undefined, so continue this practice
114 $self->{'terse_source_path'} = undef;
115 $self->{'source_path'} = undef;
116 }
117}
118
119
120sub get_source_path
121{
122 my $self = shift @_;
123
124 return $self->{'terse_source_path'};
125}
126
[10217]127# set lastmodified for OAI purposes, added by GRB, moved by kjdon
[23939]128sub set_oailastmodified {
129 my $self = shift (@_);
130
131 my $source_path = $self->{'terse_source_path'};
132
133 if (defined $source_path && (-e $source_path)) {
134 my $current_time = time;
135
136 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
137 $wday, $yday, $isdst) = localtime($current_time);
138
139 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
140
141 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
142 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
143 }
144}
145
146# no longer used for OAI purposes, since lastmodified is not what we want as the
147# Datestamp of a document. This doc metadata may be useful for general purposes.
[10217]148sub set_lastmodified {
149 my $self = shift (@_);
[4]150
[23362]151 my $source_path = $self->{'terse_source_path'};
[13770]152
153 if (defined $source_path && (-e $source_path)) {
154
155 my $file_stat = stat($source_path);
[10217]156 my $mtime = $file_stat->mtime;
[21862]157 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
158 $wday, $yday, $isdst) = localtime($mtime);
159
160 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
161
[23923]162 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
[21862]163 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
[10217]164 }
165}
166
[1241]167# clone the $self object
168sub duplicate {
169 my $self = shift (@_);
170
171 my $newobj = {};
172
[8716]173 foreach my $k (keys %$self) {
[1241]174 $newobj->{$k} = &clone ($self->{$k});
175 }
176
177 bless $newobj, ref($self);
178 return $newobj;
179}
180
181sub clone {
182 my ($from) = @_;
183 my $type = ref ($from);
184
185 if ($type eq "HASH") {
186 my $to = {};
[8716]187 foreach my $key (keys %$from) {
[1241]188 $to->{$key} = &clone ($from->{$key});
189 }
190 return $to;
191 } elsif ($type eq "ARRAY") {
192 my $to = [];
[8716]193 foreach my $v (@$from) {
[1241]194 push (@$to, &clone ($v));
195 }
196 return $to;
197 } else {
198 return $from;
199 }
200}
201
[2327]202sub set_OIDtype {
203 my $self = shift (@_);
[12268]204 my ($type, $metadata) = @_;
[1241]205
[17025]206 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|dirname|assigned)$/) {
[2327]207 $self->{'OIDtype'} = $type;
208 } else {
209 $self->{'OIDtype'} = "hash";
210 }
[16792]211
[12268]212 if ($type =~ /^assigned$/) {
213 if (defined $metadata) {
214 $self->{'OIDmetadata'} = $metadata;
215 } else {
216 $self->{'OIDmetadata'} = "dc.Identifier";
217 }
218 }
[2327]219}
220
[20412]221# rename_method can be 'url', 'none', 'base64'
[1241]222sub set_source_filename {
223 my $self = shift (@_);
[18319]224 my ($source_filename, $rename_method) = @_;
[1241]225
[16578]226 # Since the gsdlsourcefilename element goes into the doc.xml it has
227 # to be utf8. However, it should also *represent* the source filename
228 # (in the import directory) which may not be utf8 at all.
[18319]229 # For instance, if this meta element (gsdlsourcefilename) will be used
230 # by other applications that parse doc.xml in order to locate
231 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
232 # encode the real filename as this is a binary-to-text encoding meaning
233 # that the resulting string is ASCII (utf8). Decoding will give the original.
[16578]234
[18319]235# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
236
[16670]237 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
[23278]238 $source_filename = $self->encode_filename($source_filename, $rename_method);
239# my ($srcfilename,$dirname,$suffix)
240# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
[16792]241# print STDERR "-> $srcfilename -> ";
[23278]242# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
243# $source_filename = &util::filename_cat($dirname, $srcfilename);
[16792]244# print STDERR "$source_filename\n";
[16670]245
[16578]246 $self->set_utf8_metadata_element ($self->get_top_section(),
[1241]247 "gsdlsourcefilename",
248 $source_filename);
249}
250
[23278]251sub encode_filename {
252 my $self = shift (@_);
253 my ($source_filename, $rename_method) = @_;
254
255 my ($srcfilename,$dirname,$suffix)
256 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
257# print STDERR "-> $srcfilename -> ";
258 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
259 $source_filename = &util::filename_cat($dirname, $srcfilename);
260
261 return $source_filename;
262}
263
[7569]264sub set_converted_filename {
265 my $self = shift (@_);
266 my ($converted_filename) = @_;
267
[16578]268 # we know the converted filename is utf8
269 $self->set_utf8_metadata_element ($self->get_top_section(),
[7569]270 "gsdlconvertedfilename",
271 $converted_filename);
272}
273
[1241]274# returns the source_filename as it was provided
[19829]275sub get_unmodified_source_filename {
276 my $self = shift (@_);
277
[23362]278 return $self->{'terse_source_path'};
[19829]279}
280
281# returns the source_filename with whatever rename_method was given
[1241]282sub get_source_filename {
283 my $self = shift (@_);
284
285 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
286}
287
[19829]288
289
[7569]290# returns converted filename if available else returns source filename
291sub get_filename_for_hashing {
292 my $self = shift (@_);
293
294 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
295
296 if (!defined $filename) {
[11097]297 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
[10980]298 # if NULPlug processed file, then don't give a filename
[11097]299 if (defined $plugin_name && $plugin_name eq "NULPlug") {
[10980]300 $filename = undef;
[16670]301 } else { # returns the URL encoded source filename!
[10980]302 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
303 }
[7569]304 }
[23485]305
306 if (!&util::filename_is_absolute($filename)) {
307 $filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename);
308 }
309
[7569]310 return $filename;
311}
312
[1241]313sub set_doc_type {
314 my $self = shift (@_);
315 my ($doc_type) = @_;
316
317 $self->set_metadata_element ($self->get_top_section(),
318 "gsdldoctype",
319 $doc_type);
320}
321
[10217]322# returns the gsdldoctype as it was provided
[1241]323# the default of "indexed_doc" is used if no document
324# type was provided
325sub get_doc_type {
326 my $self = shift (@_);
327
328 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
329 return $doc_type if (defined $doc_type);
330 return "indexed_doc";
331}
332
333
334# look up the reference to the a particular section
335sub _lookup_section {
336 my $self = shift (@_);
337 my ($section) = @_;
338
339 my ($num);
340 my $sectionref = $self;
341
342 while (defined $section && $section ne "") {
[12327]343
[1241]344 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
[12327]345
346 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
347
[1241]348 $section = "" unless defined $section;
349
[12327]350
[1241]351 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
352 $sectionref = $sectionref->{'subsections'}->{$num};
353 } else {
354 return undef;
355 }
356 }
357
358 return $sectionref;
359}
360
[2327]361# calculate OID by hashing the contents of the document
[1241]362sub _calc_OID {
363 my $self = shift (@_);
364 my ($filename) = @_;
365
[22855]366
[1241]367 my $osexe = &util::get_os_exe();
368
369 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
370 $ENV{'GSDLOS'},"hashfile$osexe");
[8504]371
[1241]372 my $result = "NULL";
[22855]373
[1679]374
[1241]375 if (-e "$hashfile_exe") {
[1679]376# $result = `\"$hashfile_exe\" \"$filename\"`;
[21862]377# $result = `hashfile$osexe \"$filename\" 2>&1`;
[1679]378 $result = `hashfile$osexe \"$filename\"`;
[21862]379
[1241]380 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
381 } else {
382 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
383 }
384 return "HASH$result";
385}
386
387# methods dealing with OID, not groups of them.
388
[2327]389# if $OID is not provided one is calculated
[1241]390sub set_OID {
391 my $self = shift (@_);
392 my ($OID) = @_;
[8504]393
[8797]394 my $use_hash_oid = 0;
[17057]395 # if an OID wasn't provided calculate one
[1241]396 if (!defined $OID) {
397 $OID = "NULL";
[17025]398 if ($self->{'OIDtype'} =~ /^hash/) {
[8797]399 $use_hash_oid = 1;
400 } elsif ($self->{'OIDtype'} eq "incremental") {
[2327]401 $OID = "D" . $OIDcount;
402 $OIDcount ++;
[8716]403
404 } elsif ($self->{'OIDtype'} eq "dirname") {
405 $OID = 'J';
406 my $filename = $self->get_source_filename();
[8797]407 if (defined($filename)) { # && -e $filename) {
[8716]408 $OID = &File::Basename::dirname($filename);
409 if (defined $OID) {
410 $OID = 'J'.&File::Basename::basename($OID);
[19617]411 $OID = &util::tidy_up_oid($OID);
[8716]412 } else {
[8797]413 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
414 $use_hash_oid = 1;
[8716]415 }
416 } else {
[8797]417 print STDERR "Failed to find filename, generating hash id\n";
418 $use_hash_oid = 1;
[8716]419 }
[2327]420
[8797]421 } elsif ($self->{'OIDtype'} eq "assigned") {
[12268]422 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
[8797]423 if (defined $identifier && $identifier ne "") {
[17025]424 $OID = $identifier;
[19617]425 $OID = &util::tidy_up_oid($OID);
[8797]426 } else {
427 # need a hash id
[12268]428 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
[8797]429 $use_hash_oid = 1;
430 }
431
[2327]432 } else {
[8797]433 $use_hash_oid = 1;
434 }
435
436 if ($use_hash_oid) {
[23562]437 my $hash_on_file = 1;
[17025]438 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
439 $hash_on_file = 0;
440 }
441 if ($hash_on_file) {
442 # "hash" OID - feed file to hashfile.exe
443 my $filename = $self->get_filename_for_hashing();
[23562]444
[17025]445 # -z: don't want to hash on the file if it is zero size
446 if (defined($filename) && -e $filename && !-z $filename) {
447 $OID = $self->_calc_OID ($filename);
448 } else {
449 $hash_on_file = 0;
450 }
451 }
452 if (!$hash_on_file) {
453 my $filename = &util::get_tmp_filename();
[22855]454 if (!open (OUTFILE, ">:utf8", $filename)) {
[2327]455 print STDERR "doc::set_OID could not write to $filename\n";
456 } else {
[13172]457 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
458 print OUTFILE $doc_text;
[2327]459 close (OUTFILE);
460 }
461 $OID = $self->_calc_OID ($filename);
462 &util::rm ($filename);
[1374]463 }
[1241]464 }
465 }
466 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
467}
468
469# this uses hashdoc (embedded c thingy) which is faster but still
470# needs a little work to be suffiently stable
471sub ___set_OID {
472 my $self = shift (@_);
473 my ($OID) = @_;
474
475 # if an OID wasn't provided then calculate hash value based on document
476 if (!defined $OID)
477 {
[13172]478 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
[1241]479 my $hash_len = length($hash_text);
480
481 $OID = &hashdoc::buffer($hash_text,$hash_len);
482 }
483
484 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
485}
486
487# returns the OID for this document
488sub get_OID {
489 my $self = shift (@_);
490 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
491 return $OID if (defined $OID);
492 return "NULL";
493}
494
495sub delete_OID {
496 my $self = shift (@_);
497
498 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
499}
500
501
502# methods for manipulating section names
503
504# returns the name of the top-most section (the top
505# level of the document
506sub get_top_section {
507 my $self = shift (@_);
508
509 return "";
510}
511
512# returns a section
513sub get_parent_section {
514 my $self = shift (@_);
515 my ($section) = @_;
516
517 $section =~ s/(^|\.)\d+$//;
518
519 return $section;
520}
521
522# returns the first child section (or the end child
523# if there isn't any)
524sub get_begin_child {
525 my $self = shift (@_);
526 my ($section) = @_;
527
528 my $section_ptr = $self->_lookup_section($section);
529 return "" unless defined $section_ptr;
530
531 if (defined $section_ptr->{'subsection_order'}->[0]) {
532 return "$section.$section_ptr->{'subsection_order'}->[0]";
533 }
534
535 return $self->get_end_child ($section);
536}
537
538# returns the next child of a parent section
539sub get_next_child {
540 my $self = shift (@_);
541 my ($section) = @_;
542
543 my $parent_section = $self->get_parent_section($section);
544 my $parent_section_ptr = $self->_lookup_section($parent_section);
545 return undef unless defined $parent_section_ptr;
546
547 my ($section_num) = $section =~ /(\d+)$/;
548 return undef unless defined $section_num;
549
550 my $i = 0;
551 my $section_order = $parent_section_ptr->{'subsection_order'};
552 while ($i < scalar(@$section_order)) {
553 last if $section_order->[$i] eq $section_num;
554 $i++;
555 }
556
557 $i++; # the next child
558 if ($i < scalar(@$section_order)) {
559 return $section_order->[$i] if $parent_section eq "";
560 return "$parent_section.$section_order->[$i]";
561 }
562
563 # no more sections in this level
564 return undef;
565}
566
567# returns a reference to a list of children
568sub get_children {
569 my $self = shift (@_);
570 my ($section) = @_;
571
572 my $section_ptr = $self->_lookup_section($section);
573 return [] unless defined $section_ptr;
574
575 my @children = @{$section_ptr->{'subsection_order'}};
576
577 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
578 return \@children;
579}
580
581# returns the child section one past the last one (which
582# is coded as "0")
583sub get_end_child {
584 my $self = shift (@_);
585 my ($section) = @_;
586
587 return $section . ".0" unless $section eq "";
588 return "0";
589}
590
591# returns the next section in book order
592sub get_next_section {
593 my $self = shift (@_);
594 my ($section) = @_;
595
596 return undef unless defined $section;
597
598 my $section_ptr = $self->_lookup_section($section);
599 return undef unless defined $section_ptr;
600
601 # first try to find first child
602 if (defined $section_ptr->{'subsection_order'}->[0]) {
603 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
604 return "$section.$section_ptr->{'subsection_order'}->[0]";
605 }
606
607 do {
608 # try to find sibling
609 my $next_child = $self->get_next_child ($section);
610 return $next_child if (defined $next_child);
611
612 # move up one level
613 $section = $self->get_parent_section ($section);
614 } while $section =~ /\d/;
615
616 return undef;
617}
618
619sub is_leaf_section {
620 my $self = shift (@_);
621 my ($section) = @_;
622
623 my $section_ptr = $self->_lookup_section($section);
624 return 1 unless defined $section_ptr;
625
626 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
627}
628
629# methods for dealing with sections
630
631# returns the name of the inserted section
632sub insert_section {
633 my $self = shift (@_);
634 my ($before_section) = @_;
635
636 # get the child to insert before and its parent section
637 my $parent_section = "";
638 my $before_child = "0";
639 my @before_section = split (/\./, $before_section);
640 if (scalar(@before_section) > 0) {
641 $before_child = pop (@before_section);
642 $parent_section = join (".", @before_section);
643 }
644
645 my $parent_section_ptr = $self->_lookup_section($parent_section);
646 if (!defined $parent_section_ptr) {
647 print STDERR "doc::insert_section couldn't find parent section " .
648 "$parent_section\n";
649 return;
650 }
651
652 # get the next section number
653 my $section_num = $parent_section_ptr->{'next_subsection'}++;
654
655 my $i = 0;
656 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
657 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
658 $i++;
659 }
660
661 # insert the section number into the order list
662 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
663
664 # add this section to the parent section
665 my $section_ptr = {'subsection_order'=>[],
666 'next_subsection'=>1,
667 'subsections'=>{},
668 'metadata'=>[],
669 'text'=>""};
670 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
671
672 # work out the full section number
673 my $section = $parent_section;
674 $section .= "." unless $section eq "";
675 $section .= $section_num;
676
677 return $section;
678}
679
680# creates a pre-named section
681sub create_named_section {
682 my $self = shift (@_);
683 my ($mastersection) = @_;
684
685 my ($num);
686 my $section = $mastersection;
687 my $sectionref = $self;
688
689 while ($section ne "") {
690 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
691 $num =~ s/^0+(\d)/$1/; # remove leading 0s
692 $section = "" unless defined $section;
693
694 if (defined $num) {
695 if (!defined $sectionref->{'subsections'}->{$num}) {
696 push (@{$sectionref->{'subsection_order'}}, $num);
697 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
698 'next_subsection'=>1,
699 'subsections'=>{},
700 'metadata'=>[],
701 'text'=>""};
702 if ($num >= $sectionref->{'next_subsection'}) {
703 $sectionref->{'next_subsection'} = $num + 1;
704 }
705 }
706 $sectionref = $sectionref->{'subsections'}->{$num};
707
708 } else {
709 print STDERR "doc::create_named_section couldn't create section ";
710 print STDERR "$mastersection\n";
711 last;
712 }
713 }
714}
715
716# returns a reference to a list of subsections
717sub list_subsections {
718 my $self = shift (@_);
719 my ($section) = @_;
720
721 my $section_ptr = $self->_lookup_section ($section);
722 if (!defined $section_ptr) {
723 print STDERR "doc::list_subsections couldn't find section $section\n";
724 return [];
725 }
726
727 return [@{$section_ptr->{'subsection_order'}}];
728}
729
730sub delete_section {
731 my $self = shift (@_);
732 my ($section) = @_;
733
734# my $section_ptr = {'subsection_order'=>[],
735# 'next_subsection'=>1,
736# 'subsections'=>{},
737# 'metadata'=>[],
738# 'text'=>""};
739
740 # if this is the top section reset everything
741 if ($section eq "") {
742 $self->{'subsection_order'} = [];
743 $self->{'subsections'} = {};
744 $self->{'metadata'} = [];
745 $self->{'text'} = "";
746 return;
747 }
748
749 # find the parent of the section to delete
750 my $parent_section = "";
751 my $child = "0";
752 my @section = split (/\./, $section);
753 if (scalar(@section) > 0) {
754 $child = pop (@section);
755 $parent_section = join (".", @section);
756 }
757
758 my $parent_section_ptr = $self->_lookup_section($parent_section);
759 if (!defined $parent_section_ptr) {
760 print STDERR "doc::delete_section couldn't find parent section " .
761 "$parent_section\n";
762 return;
763 }
764
765 # remove this section from the subsection_order list
766 my $i = 0;
767 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
768 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
769 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
770 last;
771 }
772 $i++;
773 }
774
775 # remove this section from the subsection hash
776 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
777 undef $parent_section_ptr->{'subsections'}->{$child};
778 }
779}
780
781#--
[4]782# methods for dealing with metadata
783
784# set_metadata_element and get_metadata_element are for metadata
785# which should only have one value. add_meta_data and get_metadata
786# are for metadata which can have more than one value.
787
[1241]788# returns the first metadata value which matches field
[6111]789
790# This version of get metadata element works much like the one above,
791# except it allows for the namespace portion of a metadata element to
792# be ignored, thus if you are searching for dc.Title, the first piece
793# of matching metadata ending with the name Title (once any namespace
794# is removed) would be returned.
795# 28-11-2003 John Thompson
[1241]796sub get_metadata_element {
797 my $self = shift (@_);
[6111]798 my ($section, $field, $ignore_namespace) = @_;
[1241]799 my ($data);
800
[6111]801 $ignore_namespace = 0 unless defined $ignore_namespace;
802
[1241]803 my $section_ptr = $self->_lookup_section($section);
804 if (!defined $section_ptr) {
[8716]805 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
[1241]806 return;
807 }
808
[20417]809 # Remove the namespace if we are being told to ignore them
[6111]810 if($ignore_namespace) {
811 $field =~ s/^\w*\.//;
812 }
813
[1241]814 foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]815
816 my $data_name = $data->[0];
[14966]817
[6111]818 # Remove the any namespace if we are being told to ignore them
819 if($ignore_namespace) {
820 $data_name =~ s/^\w*\.//;
821 }
[22294]822 $data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
[6111]823 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
[1241]824 }
825
826 return undef; # was not found
827}
828
829# returns a list of the form [value1, value2, ...]
830sub get_metadata {
831 my $self = shift (@_);
[6111]832 my ($section, $field, $ignore_namespace) = @_;
[1241]833 my ($data);
834
[6111]835 $ignore_namespace = 0 unless defined $ignore_namespace;
836
[1241]837 my $section_ptr = $self->_lookup_section($section);
838 if (!defined $section_ptr) {
[8716]839 print STDERR "doc::get_metadata couldn't find section ",
840 $section, "\n";
[1241]841 return;
842 }
843
[6111]844 # Remove the any namespace if we are being told to ignore them
845 if($ignore_namespace) {
846 $field =~ s/^\w*\.//;
847 }
848
[1241]849 my @metadata = ();
850 foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]851
852 my $data_name = $data->[0];
853 # Remove the any namespace if we are being told to ignore them
854 if($ignore_namespace) {
855 $data_name =~ s/^\w*\.//;
856 }
[22294]857 $data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
[6111]858 push (@metadata, $data->[1]) if ($data_name eq $field);
[1241]859 }
[9241]860
[1241]861 return \@metadata;
862}
863
[23827]864sub get_metadata_hashmap {
865 my $self = shift (@_);
866 my ($section, $opt_namespace) = @_;
867
868 my $section_ptr = $self->_lookup_section($section);
869 if (!defined $section_ptr) {
870 print STDERR "doc::get_metadata couldn't find section ",
871 $section, "\n";
872 return;
873 }
874
875 my $metadata_hashmap = {};
876 foreach my $data (@{$section_ptr->{'metadata'}}) {
877 my $metaname = $data->[0];
878
879 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
880 if (!defined $metadata_hashmap->{$metaname}) {
881 $metadata_hashmap->{$metaname} = [];
882 }
883 my $metaval_list = $metadata_hashmap->{$metaname};
884 push(@$metaval_list, $data->[1]);
885 }
886 }
887
888 return $metadata_hashmap;
889}
890
[1241]891# returns a list of the form [[field,value],[field,value],...]
892sub get_all_metadata {
893 my $self = shift (@_);
894 my ($section) = @_;
895
896 my $section_ptr = $self->_lookup_section($section);
897 if (!defined $section_ptr) {
[8716]898 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
[1241]899 return;
900 }
901
902 return $section_ptr->{'metadata'};
903}
904
905# $value is optional
906sub delete_metadata {
907 my $self = shift (@_);
908 my ($section, $field, $value) = @_;
909
910 my $section_ptr = $self->_lookup_section($section);
911 if (!defined $section_ptr) {
[23562]912 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
[1241]913 return;
914 }
915
916 my $i = 0;
917 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
918 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
919 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
920 splice (@{$section_ptr->{'metadata'}}, $i, 1);
921 } else {
922 $i++;
923 }
924 }
925}
926
927sub delete_all_metadata {
928 my $self = shift (@_);
929 my ($section) = @_;
930
931 my $section_ptr = $self->_lookup_section($section);
932 if (!defined $section_ptr) {
[8716]933 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
[1241]934 return;
935 }
936
937 $section_ptr->{'metadata'} = [];
938}
939
[4]940sub set_metadata_element {
941 my $self = shift (@_);
942 my ($section, $field, $value) = @_;
943
[97]944 $self->set_utf8_metadata_element ($section, $field,
[1870]945 &unicode::ascii2utf8(\$value));
[73]946}
947
948# set_utf8_metadata_element assumes the text has already been
949# converted to the UTF-8 encoding.
950sub set_utf8_metadata_element {
951 my $self = shift (@_);
952 my ($section, $field, $value) = @_;
953
[4]954 $self->delete_metadata ($section, $field);
[73]955 $self->add_utf8_metadata ($section, $field, $value);
[4]956}
957
958
[73]959# add_metadata assumes the text is in (extended) ascii form. For
[8220]960# text which has already been converted to the UTF-8 format use
[73]961# add_utf8_metadata.
[4]962sub add_metadata {
963 my $self = shift (@_);
964 my ($section, $field, $value) = @_;
965
[97]966 $self->add_utf8_metadata ($section, $field,
[1870]967 &unicode::ascii2utf8(\$value));
[73]968}
969
970sub add_utf8_metadata {
971 my $self = shift (@_);
972 my ($section, $field, $value) = @_;
973
[22950]974# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
975# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
976# print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
977
[4]978 my $section_ptr = $self->_lookup_section($section);
979 if (!defined $section_ptr) {
[8716]980 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
[4]981 return;
982 }
[1732]983 if (!defined $value) {
984 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
985 return;
986 }
987 if (!defined $field) {
988 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
989 return;
990 }
[4]991
[8894]992 #print STDERR "###$field=$value\n";
[22950]993
994 # For now, supress this check. Given that text data read in is now
995 # Unicode aware, then the following block of code can (ironically enough)
996 # cause our unicode compliant string to be re-encoded (leading to
997 # a double-encoded UTF-8 string, which we definitely don't want!).
998
999
[7798]1000 # double check that the value is utf-8
[22950]1001# if (!&unicode::check_is_utf8($value)) {
1002# print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1003# &unicode::ensure_utf8(\$value);
1004# print STDERR " Tried converting to utf8: $value\n";
1005# }
[7798]1006
[4]1007 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1008}
1009
1010
1011# methods for dealing with text
1012
[1241]1013# returns the text for a section
1014sub get_text {
1015 my $self = shift (@_);
1016 my ($section) = @_;
1017
1018 my $section_ptr = $self->_lookup_section($section);
1019 if (!defined $section_ptr) {
1020 print STDERR "doc::get_text couldn't find section " .
1021 "$section\n";
1022 return "";
1023 }
1024
1025 return $section_ptr->{'text'};
1026}
1027
1028# returns the (utf-8 encoded) length of the text for a section
1029sub get_text_length {
1030 my $self = shift (@_);
1031 my ($section) = @_;
1032
1033 my $section_ptr = $self->_lookup_section($section);
1034 if (!defined $section_ptr) {
1035 print STDERR "doc::get_text_length couldn't find section " .
1036 "$section\n";
1037 return 0;
1038 }
1039
1040 return length ($section_ptr->{'text'});
1041}
1042
[23131]1043# returns the total length for all the sections
1044sub get_total_text_length {
1045 my $self = shift (@_);
1046
1047 my $section = $self->get_top_section();
1048 my $length = 0;
1049 while (defined $section) {
1050 $length += $self->get_text_length($section);
1051 $section = $self->get_next_section($section);
1052 }
1053 return $length;
1054}
1055
[1241]1056sub delete_text {
1057 my $self = shift (@_);
1058 my ($section) = @_;
1059
1060 my $section_ptr = $self->_lookup_section($section);
1061 if (!defined $section_ptr) {
1062 print STDERR "doc::delete_text couldn't find section " .
1063 "$section\n";
1064 return;
1065 }
1066
1067 $section_ptr->{'text'} = "";
1068}
1069
[73]1070# add_text assumes the text is in (extended) ascii form. For
1071# text which has been already converted to the UTF-8 format
1072# use add_utf8_text.
[4]1073sub add_text {
1074 my $self = shift (@_);
1075 my ($section, $text) = @_;
1076
[73]1077 # convert the text to UTF-8 encoded unicode characters
1078 # and add the text
[1870]1079 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
[73]1080}
1081
1082
1083# add_utf8_text assumes the text to be added has already
1084# been converted to the UTF-8 encoding. For ascii text use
1085# add_text
1086sub add_utf8_text {
1087 my $self = shift (@_);
1088 my ($section, $text) = @_;
1089
[4]1090 my $section_ptr = $self->_lookup_section($section);
1091 if (!defined $section_ptr) {
[73]1092 print STDERR "doc::add_utf8_text couldn't find section " .
[4]1093 "$section\n";
1094 return;
1095 }
1096
1097 $section_ptr->{'text'} .= $text;
1098}
1099
[16950]1100# returns the Source meta, which is the utf8 filename generated.
1101# Added a separate method here for convenience
1102sub get_source {
[16924]1103 my $self = shift (@_);
[16950]1104 return $self->get_metadata_element ($self->get_top_section(), "Source");
1105}
1106
1107# returns the SourceFile meta, which is the url reference to the URL-encoded
1108# version of Source (the utf8 filename). Added a separate method here for convenience
1109sub get_sourcefile {
1110 my $self = shift (@_);
1111 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1112}
1113
1114# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1115# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1116sub get_assocfile_from_sourcefile {
1117 my $self = shift (@_);
[16924]1118
1119 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1120 my $top_section = $self->get_top_section();
1121 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
[4]1122
[16924]1123 # get the actual filename as it exists on the filesystem which this url refers to
[16928]1124 $source_file = &unicode::url_to_filename($source_file);
[16924]1125 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1126 return $assocfilename;
1127}
1128
[1241]1129# methods for dealing with associated files
1130
1131# a file is associated with a document, NOT a section.
1132# if section is defined it is noted in the data structure
1133# only so that files associated from a particular section
1134# may be removed later (using delete_section_assoc_files)
1135sub associate_file {
1136 my $self = shift (@_);
1137 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1138 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1139
1140 # remove all associated files with the same name
1141 $self->delete_assoc_file ($assoc_filename);
[23413]1142
1143 # Too harsh a requirement
1144 # Definitely get HTML docs, for example, with some missing
1145 # support files
1146# if (!&util::fd_exists($real_filename)) {
1147# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1148# exit -1;
1149# }
1150
[23387]1151# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1152# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1153## my $utf8_filename = Encode::encode("utf8",$filename);
1154
[1241]1155 push (@{$self->{'associated_files'}},
1156 [$real_filename, $assoc_filename, $mime_type, $section]);
1157}
1158
1159# returns a list of associated files in the form
1160# [[real_filename, assoc_filename, mimetype], ...]
1161sub get_assoc_files {
1162 my $self = shift (@_);
1163
1164 return $self->{'associated_files'};
1165}
1166
[20775]1167# the following two methods used to keep track of original associated files
1168# for incremental building. eg a txt file used by an item file does not end
1169# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1170# file for incremental build
1171sub associate_source_file {
1172 my $self = shift (@_);
1173 my ($full_filename) = @_;
[19494]1174
[20775]1175 push (@{$self->{'source_assoc_files'}}, $full_filename);
1176
1177}
1178
1179sub get_source_assoc_files {
1180 my $self = shift (@_);
1181
1182 return $self->{'source_assoc_files'};
1183
1184
1185}
[19494]1186sub metadata_file {
1187 my $self = shift (@_);
1188 my ($real_filename, $filename) = @_;
1189
1190 push (@{$self->{'metadata_files'}},
1191 [$real_filename, $filename]);
1192}
1193
[21566]1194# used for writing out the archiveinf-doc info database, to list all the metadata files
[19494]1195sub get_meta_files {
1196 my $self = shift (@_);
1197
1198 return $self->{'metadata_files'};
1199}
1200
[1241]1201sub delete_section_assoc_files {
1202 my $self = shift (@_);
1203 my ($section) = @_;
1204
1205 my $i=0;
1206 while ($i < scalar (@{$self->{'associated_files'}})) {
1207 if (defined $self->{'associated_files'}->[$i]->[3] &&
1208 $self->{'associated_files'}->[$i]->[3] eq $section) {
1209 splice (@{$self->{'associated_files'}}, $i, 1);
1210 } else {
1211 $i++;
1212 }
1213 }
1214}
1215
1216sub delete_assoc_file {
1217 my $self = shift (@_);
1218 my ($assoc_filename) = @_;
1219
1220 my $i=0;
1221 while ($i < scalar (@{$self->{'associated_files'}})) {
1222 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1223 splice (@{$self->{'associated_files'}}, $i, 1);
1224 } else {
1225 $i++;
1226 }
1227 }
1228}
1229
1230sub reset_nextsection_ptr {
1231 my $self = shift (@_);
1232 my ($section) = @_;
1233
1234 my $section_ptr = $self->_lookup_section($section);
1235 $section_ptr->{'next_subsection'} = 1;
1236}
1237
[4]12381;
Note: See TracBrowser for help on using the repository browser.