source: main/trunk/greenstone2/perllib/doc.pm@ 27350

Last change on this file since 27350 was 27350, checked in by kjdon, 11 years ago

removing the J from directory name for dirname option for OID. If there are only digits will get a D in front.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 38.2 KB
RevLine 
[537]1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
[8894]10# This program is free software; you can redistr te it and/or modify
[537]11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[1241]26# base class to hold documents
[4]27
28package doc;
[3834]29eval {require bytes};
[4]30
[832]31BEGIN {
[1241]32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
[832]34}
35
[15894]36use strict;
[1241]37use unicode;
38use util;
[27306]39use FileUtils;
[1241]40use ghtml;
[8220]41use File::stat;
[1241]42##use hashdoc;
[13172]43use docprint;
[1241]44
[4]45# the document type may be indexed_doc, nonindexed_doc, or
46# classification
47
[18528]48our $OIDcount = 0;
[2327]49
[20412]50# rename_method can be 'url', 'none', 'base64'
[4]51sub new {
52 my $class = shift (@_);
[18319]53 my ($source_filename, $doc_type, $rename_method) = @_;
[1374]54
[13770]55
[1241]56 my $self = bless {'associated_files'=>[],
57 'subsection_order'=>[],
58 'next_subsection'=>1,
59 'subsections'=>{},
60 'metadata'=>[],
[2327]61 'text'=>"",
62 'OIDtype'=>"hash"}, $class;
[4]63
[10217]64 # used to set lastmodified here, but this can screw up the HASH ids, so
65 # the docsave processor now calls set_lastmodified
[13770]66
[23362]67 $self->set_source_path($source_filename);
[10217]68
[7929]69 if (defined $source_filename) {
[15874]70 $source_filename = &util::filename_within_collection($source_filename);
[18508]71 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
[18319]72 $self->set_source_filename ($source_filename, $rename_method);
[7929]73 }
74
[1374]75 $self->set_doc_type ($doc_type) if defined $doc_type;
[485]76
[4]77 return $self;
78}
[23362]79
80
81sub set_source_path
82{
83 my $self = shift @_;
84 my ($source_filename) = @_;
85
86 if (defined $source_filename) {
87 # On Windows the source_filename can be in terse DOS format
88 # e.g. test~1.txt
89
90 $self->{'terse_source_path'} = $source_filename;
91
[27306]92 # Use the FileUtil library methods as they are aware of more special
93 # cases such as HDFS [jmt12]
94 ##if (&util::file_exists($source_filename)) {
[23362]95 if (-e $source_filename) {
96 # See if we can do better for Windows with a filename
97 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
98 require Win32;
99 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
100 }
101 else {
102 # For Unix-based systems, there is no difference between the two
103 $self->{'source_path'} = $source_filename;
104 }
105 }
106 else {
107 print STDERR "Warning: In doc::set_source_path(), file\n";
108 print STDERR " $source_filename\n";
109 print STDERR " does not exist\n";
110
111 # (default) Set it to whatever we were given
112 $self->{'source_path'} = $source_filename;
113 }
114 }
115 else {
116 # Previous code for setting source_path allowed for
117 # it to be undefined, so continue this practice
118 $self->{'terse_source_path'} = undef;
119 $self->{'source_path'} = undef;
120 }
121}
122
123
124sub get_source_path
125{
126 my $self = shift @_;
127
128 return $self->{'terse_source_path'};
129}
130
[10217]131# set lastmodified for OAI purposes, added by GRB, moved by kjdon
[23939]132sub set_oailastmodified {
133 my $self = shift (@_);
134
135 my $source_path = $self->{'terse_source_path'};
136
137 if (defined $source_path && (-e $source_path)) {
138 my $current_time = time;
139
140 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
141 $wday, $yday, $isdst) = localtime($current_time);
142
143 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
144
145 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
146 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
147 }
148}
149
150# no longer used for OAI purposes, since lastmodified is not what we want as the
151# Datestamp of a document. This doc metadata may be useful for general purposes.
[10217]152sub set_lastmodified {
153 my $self = shift (@_);
[4]154
[23362]155 my $source_path = $self->{'terse_source_path'};
[13770]156
157 if (defined $source_path && (-e $source_path)) {
158
159 my $file_stat = stat($source_path);
[10217]160 my $mtime = $file_stat->mtime;
[21862]161 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
162 $wday, $yday, $isdst) = localtime($mtime);
163
164 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
165
[23923]166 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
[21862]167 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
[10217]168 }
169}
170
[1241]171# clone the $self object
172sub duplicate {
173 my $self = shift (@_);
174
175 my $newobj = {};
176
[8716]177 foreach my $k (keys %$self) {
[1241]178 $newobj->{$k} = &clone ($self->{$k});
179 }
180
181 bless $newobj, ref($self);
182 return $newobj;
183}
184
185sub clone {
186 my ($from) = @_;
187 my $type = ref ($from);
188
189 if ($type eq "HASH") {
190 my $to = {};
[8716]191 foreach my $key (keys %$from) {
[1241]192 $to->{$key} = &clone ($from->{$key});
193 }
194 return $to;
195 } elsif ($type eq "ARRAY") {
196 my $to = [];
[8716]197 foreach my $v (@$from) {
[1241]198 push (@$to, &clone ($v));
199 }
200 return $to;
201 } else {
202 return $from;
203 }
204}
205
[2327]206sub set_OIDtype {
207 my $self = shift (@_);
[12268]208 my ($type, $metadata) = @_;
[1241]209
[26536]210 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
[2327]211 $self->{'OIDtype'} = $type;
212 } else {
213 $self->{'OIDtype'} = "hash";
214 }
[16792]215
[12268]216 if ($type =~ /^assigned$/) {
217 if (defined $metadata) {
218 $self->{'OIDmetadata'} = $metadata;
219 } else {
220 $self->{'OIDmetadata'} = "dc.Identifier";
221 }
222 }
[2327]223}
224
[20412]225# rename_method can be 'url', 'none', 'base64'
[1241]226sub set_source_filename {
227 my $self = shift (@_);
[18319]228 my ($source_filename, $rename_method) = @_;
[1241]229
[16578]230 # Since the gsdlsourcefilename element goes into the doc.xml it has
231 # to be utf8. However, it should also *represent* the source filename
232 # (in the import directory) which may not be utf8 at all.
[18319]233 # For instance, if this meta element (gsdlsourcefilename) will be used
234 # by other applications that parse doc.xml in order to locate
235 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
236 # encode the real filename as this is a binary-to-text encoding meaning
237 # that the resulting string is ASCII (utf8). Decoding will give the original.
[16578]238
[18319]239# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
240
[16670]241 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
[23278]242 $source_filename = $self->encode_filename($source_filename, $rename_method);
243# my ($srcfilename,$dirname,$suffix)
244# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
[16792]245# print STDERR "-> $srcfilename -> ";
[23278]246# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
[27306]247# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
[16792]248# print STDERR "$source_filename\n";
[16670]249
[16578]250 $self->set_utf8_metadata_element ($self->get_top_section(),
[1241]251 "gsdlsourcefilename",
252 $source_filename);
253}
254
[23278]255sub encode_filename {
256 my $self = shift (@_);
257 my ($source_filename, $rename_method) = @_;
258
259 my ($srcfilename,$dirname,$suffix)
260 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
261# print STDERR "-> $srcfilename -> ";
262 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
[27306]263 $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
[23278]264
265 return $source_filename;
266}
267
[7569]268sub set_converted_filename {
269 my $self = shift (@_);
270 my ($converted_filename) = @_;
271
[16578]272 # we know the converted filename is utf8
273 $self->set_utf8_metadata_element ($self->get_top_section(),
[7569]274 "gsdlconvertedfilename",
275 $converted_filename);
276}
277
[1241]278# returns the source_filename as it was provided
[19829]279sub get_unmodified_source_filename {
280 my $self = shift (@_);
281
[23362]282 return $self->{'terse_source_path'};
[19829]283}
284
285# returns the source_filename with whatever rename_method was given
[1241]286sub get_source_filename {
287 my $self = shift (@_);
288
289 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
290}
291
[19829]292
293
[7569]294# returns converted filename if available else returns source filename
295sub get_filename_for_hashing {
296 my $self = shift (@_);
297
298 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
299
300 if (!defined $filename) {
[11097]301 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
[10980]302 # if NULPlug processed file, then don't give a filename
[11097]303 if (defined $plugin_name && $plugin_name eq "NULPlug") {
[10980]304 $filename = undef;
[16670]305 } else { # returns the URL encoded source filename!
[10980]306 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
307 }
[7569]308 }
[23485]309
[27306]310 if (!&FileUtils::isFilenameAbsolute($filename)) {
311 $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
[23485]312 }
313
[7569]314 return $filename;
315}
316
[1241]317sub set_doc_type {
318 my $self = shift (@_);
319 my ($doc_type) = @_;
320
321 $self->set_metadata_element ($self->get_top_section(),
322 "gsdldoctype",
323 $doc_type);
324}
325
[10217]326# returns the gsdldoctype as it was provided
[1241]327# the default of "indexed_doc" is used if no document
328# type was provided
329sub get_doc_type {
330 my $self = shift (@_);
331
332 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
333 return $doc_type if (defined $doc_type);
334 return "indexed_doc";
335}
336
337
338# look up the reference to the a particular section
339sub _lookup_section {
340 my $self = shift (@_);
341 my ($section) = @_;
342
343 my ($num);
344 my $sectionref = $self;
345
346 while (defined $section && $section ne "") {
[12327]347
[1241]348 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
[12327]349
350 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
351
[1241]352 $section = "" unless defined $section;
353
[12327]354
[1241]355 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
356 $sectionref = $sectionref->{'subsections'}->{$num};
357 } else {
358 return undef;
359 }
360 }
361
362 return $sectionref;
363}
364
[2327]365# calculate OID by hashing the contents of the document
[1241]366sub _calc_OID {
367 my $self = shift (@_);
368 my ($filename) = @_;
369
[22855]370
[1241]371 my $osexe = &util::get_os_exe();
372
[27306]373 my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
[1241]374 $ENV{'GSDLOS'},"hashfile$osexe");
[8504]375
[1241]376 my $result = "NULL";
[22855]377
[1679]378
[1241]379 if (-e "$hashfile_exe") {
[1679]380# $result = `\"$hashfile_exe\" \"$filename\"`;
[21862]381# $result = `hashfile$osexe \"$filename\" 2>&1`;
[1679]382 $result = `hashfile$osexe \"$filename\"`;
[21862]383
[1241]384 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
385 } else {
386 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
387 }
388 return "HASH$result";
389}
390
391# methods dealing with OID, not groups of them.
392
[2327]393# if $OID is not provided one is calculated
[1241]394sub set_OID {
395 my $self = shift (@_);
396 my ($OID) = @_;
[8504]397
[8797]398 my $use_hash_oid = 0;
[17057]399 # if an OID wasn't provided calculate one
[1241]400 if (!defined $OID) {
401 $OID = "NULL";
[17025]402 if ($self->{'OIDtype'} =~ /^hash/) {
[8797]403 $use_hash_oid = 1;
404 } elsif ($self->{'OIDtype'} eq "incremental") {
[2327]405 $OID = "D" . $OIDcount;
406 $OIDcount ++;
[26221]407 } elsif ($self->{'OIDtype'} eq "filename") {
408 my $filename = $self->get_source_filename();
409 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
410 $OID = &util::tidy_up_oid($OID);
[26536]411 } elsif ($self->{'OIDtype'} eq "full_filename") {
412 my $source_filename = $self->get_source_filename();
413 my $dirsep = &util::get_os_dirsep();
414
415 $source_filename =~ s/^import$dirsep//;
416 $source_filename =~ s/$dirsep/-/g;
417 $source_filename =~ s/\./_/g;
418
419 $OID = $source_filename;
420 $OID = &util::tidy_up_oid($OID);
[8716]421 } elsif ($self->{'OIDtype'} eq "dirname") {
422 my $filename = $self->get_source_filename();
[8797]423 if (defined($filename)) { # && -e $filename) {
[27350]424 # get the immediate parent directory
[8716]425 $OID = &File::Basename::dirname($filename);
426 if (defined $OID) {
[27350]427 $OID = &File::Basename::basename($OID);
[19617]428 $OID = &util::tidy_up_oid($OID);
[8716]429 } else {
[8797]430 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
431 $use_hash_oid = 1;
[8716]432 }
433 } else {
[27350]434 print STDERR "Failed to find a filename, generating hash id\n";
[8797]435 $use_hash_oid = 1;
[8716]436 }
[2327]437
[8797]438 } elsif ($self->{'OIDtype'} eq "assigned") {
[12268]439 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
[8797]440 if (defined $identifier && $identifier ne "") {
[17025]441 $OID = $identifier;
[19617]442 $OID = &util::tidy_up_oid($OID);
[8797]443 } else {
444 # need a hash id
[12268]445 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
[8797]446 $use_hash_oid = 1;
447 }
448
[2327]449 } else {
[8797]450 $use_hash_oid = 1;
451 }
452
453 if ($use_hash_oid) {
[23562]454 my $hash_on_file = 1;
[26536]455 my $hash_on_ga_xml = 0;
456
[17025]457 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
458 $hash_on_file = 0;
[26536]459 $hash_on_ga_xml = 1;
[17025]460 }
[26536]461
462 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
463 $hash_on_file = 0;
464 $hash_on_ga_xml = 0;
465
466 my $source_filename = $self->get_source_filename();
467 my $dirsep = &util::get_os_dirsep();
468
469 $source_filename =~ s/^import$dirsep//;
470 $source_filename =~ s/$dirsep/-/g;
471 $source_filename =~ s/\./_/g;
472
473 # If the filename is very short then (handled naively)
474 # this can cause conjestion in the hash-values
475 # computed, leading documents sharing the same leading
476 # Hex values in the computed has.
477 #
478 # The solution taken here is to replace the name of
479 # the file name a sufficient number of times (up to
480 # the character limit defined in 'rep_limit' and
481 # make that the content that is hashed on
482
483 # *** Think twice before changing the following value
484 # as it will break backward compatability of computed
485 # document HASH values
486
487 my $rep_limit = 256;
488 my $hash_content = undef;
489
490 if (length($source_filename)<$rep_limit) {
491 my $rep_string = "$source_filename|";
492 my $rs_len = length($rep_string);
493
494 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
495
496 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
497 }
498 else {
499 $hash_content = $source_filename;
500 }
501
502 my $filename = &util::get_tmp_filename();
503 if (!open (OUTFILE, ">:utf8", $filename)) {
504 print STDERR "doc::set_OID could not write to $filename\n";
505 } else {
506 print OUTFILE $hash_content;
507 close (OUTFILE);
508 }
509 $OID = $self->_calc_OID ($filename);
510
511 &util::rm ($filename);
512 }
513
[17025]514 if ($hash_on_file) {
515 # "hash" OID - feed file to hashfile.exe
516 my $filename = $self->get_filename_for_hashing();
[23562]517
[17025]518 # -z: don't want to hash on the file if it is zero size
519 if (defined($filename) && -e $filename && !-z $filename) {
520 $OID = $self->_calc_OID ($filename);
521 } else {
[26536]522 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
[17025]523 }
524 }
[26536]525
526 if ($hash_on_ga_xml) {
527 # In addition being asked to explicity calculate the has based on the GA file,
528 # can also end up coming into this block is doing 'hash_on_file' but the file
529 # itself is of zero bytes (as could be the case with 'doc.nul' file
530
[17025]531 my $filename = &util::get_tmp_filename();
[22855]532 if (!open (OUTFILE, ">:utf8", $filename)) {
[2327]533 print STDERR "doc::set_OID could not write to $filename\n";
534 } else {
[13172]535 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
536 print OUTFILE $doc_text;
[2327]537 close (OUTFILE);
538 }
539 $OID = $self->_calc_OID ($filename);
540 &util::rm ($filename);
[1374]541 }
[1241]542 }
543 }
544 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
545}
546
547# this uses hashdoc (embedded c thingy) which is faster but still
548# needs a little work to be suffiently stable
549sub ___set_OID {
550 my $self = shift (@_);
551 my ($OID) = @_;
552
553 # if an OID wasn't provided then calculate hash value based on document
554 if (!defined $OID)
555 {
[13172]556 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
[1241]557 my $hash_len = length($hash_text);
558
559 $OID = &hashdoc::buffer($hash_text,$hash_len);
560 }
561
562 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
563}
564
565# returns the OID for this document
566sub get_OID {
567 my $self = shift (@_);
568 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
569 return $OID if (defined $OID);
570 return "NULL";
571}
572
573sub delete_OID {
574 my $self = shift (@_);
575
576 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
577}
578
579
580# methods for manipulating section names
581
582# returns the name of the top-most section (the top
583# level of the document
584sub get_top_section {
585 my $self = shift (@_);
586
587 return "";
588}
589
590# returns a section
591sub get_parent_section {
592 my $self = shift (@_);
593 my ($section) = @_;
594
595 $section =~ s/(^|\.)\d+$//;
596
597 return $section;
598}
599
600# returns the first child section (or the end child
601# if there isn't any)
602sub get_begin_child {
603 my $self = shift (@_);
604 my ($section) = @_;
605
606 my $section_ptr = $self->_lookup_section($section);
607 return "" unless defined $section_ptr;
608
609 if (defined $section_ptr->{'subsection_order'}->[0]) {
610 return "$section.$section_ptr->{'subsection_order'}->[0]";
611 }
612
613 return $self->get_end_child ($section);
614}
615
616# returns the next child of a parent section
617sub get_next_child {
618 my $self = shift (@_);
619 my ($section) = @_;
620
621 my $parent_section = $self->get_parent_section($section);
622 my $parent_section_ptr = $self->_lookup_section($parent_section);
623 return undef unless defined $parent_section_ptr;
624
625 my ($section_num) = $section =~ /(\d+)$/;
626 return undef unless defined $section_num;
627
628 my $i = 0;
629 my $section_order = $parent_section_ptr->{'subsection_order'};
630 while ($i < scalar(@$section_order)) {
631 last if $section_order->[$i] eq $section_num;
632 $i++;
633 }
634
635 $i++; # the next child
636 if ($i < scalar(@$section_order)) {
637 return $section_order->[$i] if $parent_section eq "";
638 return "$parent_section.$section_order->[$i]";
639 }
640
641 # no more sections in this level
642 return undef;
643}
644
645# returns a reference to a list of children
646sub get_children {
647 my $self = shift (@_);
648 my ($section) = @_;
649
650 my $section_ptr = $self->_lookup_section($section);
651 return [] unless defined $section_ptr;
652
653 my @children = @{$section_ptr->{'subsection_order'}};
654
655 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
656 return \@children;
657}
658
659# returns the child section one past the last one (which
660# is coded as "0")
661sub get_end_child {
662 my $self = shift (@_);
663 my ($section) = @_;
664
665 return $section . ".0" unless $section eq "";
666 return "0";
667}
668
669# returns the next section in book order
670sub get_next_section {
671 my $self = shift (@_);
672 my ($section) = @_;
673
674 return undef unless defined $section;
675
676 my $section_ptr = $self->_lookup_section($section);
677 return undef unless defined $section_ptr;
678
679 # first try to find first child
680 if (defined $section_ptr->{'subsection_order'}->[0]) {
681 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
682 return "$section.$section_ptr->{'subsection_order'}->[0]";
683 }
684
685 do {
686 # try to find sibling
687 my $next_child = $self->get_next_child ($section);
688 return $next_child if (defined $next_child);
689
690 # move up one level
691 $section = $self->get_parent_section ($section);
692 } while $section =~ /\d/;
693
694 return undef;
695}
696
697sub is_leaf_section {
698 my $self = shift (@_);
699 my ($section) = @_;
700
701 my $section_ptr = $self->_lookup_section($section);
702 return 1 unless defined $section_ptr;
703
704 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
705}
706
707# methods for dealing with sections
708
709# returns the name of the inserted section
710sub insert_section {
711 my $self = shift (@_);
712 my ($before_section) = @_;
713
714 # get the child to insert before and its parent section
715 my $parent_section = "";
716 my $before_child = "0";
717 my @before_section = split (/\./, $before_section);
718 if (scalar(@before_section) > 0) {
719 $before_child = pop (@before_section);
720 $parent_section = join (".", @before_section);
721 }
722
723 my $parent_section_ptr = $self->_lookup_section($parent_section);
724 if (!defined $parent_section_ptr) {
725 print STDERR "doc::insert_section couldn't find parent section " .
726 "$parent_section\n";
727 return;
728 }
729
730 # get the next section number
731 my $section_num = $parent_section_ptr->{'next_subsection'}++;
732
733 my $i = 0;
734 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
735 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
736 $i++;
737 }
738
739 # insert the section number into the order list
740 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
741
742 # add this section to the parent section
743 my $section_ptr = {'subsection_order'=>[],
744 'next_subsection'=>1,
745 'subsections'=>{},
746 'metadata'=>[],
747 'text'=>""};
748 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
749
750 # work out the full section number
751 my $section = $parent_section;
752 $section .= "." unless $section eq "";
753 $section .= $section_num;
754
755 return $section;
756}
757
758# creates a pre-named section
759sub create_named_section {
760 my $self = shift (@_);
761 my ($mastersection) = @_;
762
763 my ($num);
764 my $section = $mastersection;
765 my $sectionref = $self;
766
767 while ($section ne "") {
768 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
769 $num =~ s/^0+(\d)/$1/; # remove leading 0s
770 $section = "" unless defined $section;
771
772 if (defined $num) {
773 if (!defined $sectionref->{'subsections'}->{$num}) {
774 push (@{$sectionref->{'subsection_order'}}, $num);
775 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
776 'next_subsection'=>1,
777 'subsections'=>{},
778 'metadata'=>[],
779 'text'=>""};
780 if ($num >= $sectionref->{'next_subsection'}) {
781 $sectionref->{'next_subsection'} = $num + 1;
782 }
783 }
784 $sectionref = $sectionref->{'subsections'}->{$num};
785
786 } else {
787 print STDERR "doc::create_named_section couldn't create section ";
788 print STDERR "$mastersection\n";
789 last;
790 }
791 }
792}
793
794# returns a reference to a list of subsections
795sub list_subsections {
796 my $self = shift (@_);
797 my ($section) = @_;
798
799 my $section_ptr = $self->_lookup_section ($section);
800 if (!defined $section_ptr) {
801 print STDERR "doc::list_subsections couldn't find section $section\n";
802 return [];
803 }
804
805 return [@{$section_ptr->{'subsection_order'}}];
806}
807
808sub delete_section {
809 my $self = shift (@_);
810 my ($section) = @_;
811
812# my $section_ptr = {'subsection_order'=>[],
813# 'next_subsection'=>1,
814# 'subsections'=>{},
815# 'metadata'=>[],
816# 'text'=>""};
817
818 # if this is the top section reset everything
819 if ($section eq "") {
820 $self->{'subsection_order'} = [];
821 $self->{'subsections'} = {};
822 $self->{'metadata'} = [];
823 $self->{'text'} = "";
824 return;
825 }
826
827 # find the parent of the section to delete
828 my $parent_section = "";
829 my $child = "0";
830 my @section = split (/\./, $section);
831 if (scalar(@section) > 0) {
832 $child = pop (@section);
833 $parent_section = join (".", @section);
834 }
835
836 my $parent_section_ptr = $self->_lookup_section($parent_section);
837 if (!defined $parent_section_ptr) {
838 print STDERR "doc::delete_section couldn't find parent section " .
839 "$parent_section\n";
840 return;
841 }
842
843 # remove this section from the subsection_order list
844 my $i = 0;
845 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
846 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
847 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
848 last;
849 }
850 $i++;
851 }
852
853 # remove this section from the subsection hash
854 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
855 undef $parent_section_ptr->{'subsections'}->{$child};
856 }
857}
858
859#--
[4]860# methods for dealing with metadata
861
862# set_metadata_element and get_metadata_element are for metadata
863# which should only have one value. add_meta_data and get_metadata
864# are for metadata which can have more than one value.
865
[1241]866# returns the first metadata value which matches field
[6111]867
868# This version of get metadata element works much like the one above,
869# except it allows for the namespace portion of a metadata element to
870# be ignored, thus if you are searching for dc.Title, the first piece
871# of matching metadata ending with the name Title (once any namespace
872# is removed) would be returned.
873# 28-11-2003 John Thompson
[1241]874sub get_metadata_element {
875 my $self = shift (@_);
[6111]876 my ($section, $field, $ignore_namespace) = @_;
[1241]877 my ($data);
878
[6111]879 $ignore_namespace = 0 unless defined $ignore_namespace;
880
[1241]881 my $section_ptr = $self->_lookup_section($section);
882 if (!defined $section_ptr) {
[8716]883 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
[1241]884 return;
885 }
886
[24404]887 # Remove any namespace if we are being told to ignore them
[6111]888 if($ignore_namespace) {
[24404]889 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
[6111]890 }
891
[1241]892 foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]893
894 my $data_name = $data->[0];
[14966]895
[24404]896 # Remove any namespace if we are being told to ignore them
[6111]897 if($ignore_namespace) {
[24404]898 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
[6111]899 }
[24404]900 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
901 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
902
[6111]903 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
[1241]904 }
905
906 return undef; # was not found
907}
908
909# returns a list of the form [value1, value2, ...]
910sub get_metadata {
911 my $self = shift (@_);
[6111]912 my ($section, $field, $ignore_namespace) = @_;
[1241]913 my ($data);
914
[6111]915 $ignore_namespace = 0 unless defined $ignore_namespace;
916
[1241]917 my $section_ptr = $self->_lookup_section($section);
918 if (!defined $section_ptr) {
[8716]919 print STDERR "doc::get_metadata couldn't find section ",
920 $section, "\n";
[1241]921 return;
922 }
923
[24404]924 # Remove any namespace if we are being told to ignore them
[6111]925 if($ignore_namespace) {
[24404]926 $field =~ s/^.*\.//;
[6111]927 }
928
[1241]929 my @metadata = ();
930 foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]931
932 my $data_name = $data->[0];
[24404]933
934 # Remove any namespace if we are being told to ignore them
[6111]935 if($ignore_namespace) {
[24404]936 $data_name =~ s/^.*\.//;
937 }
938 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
939 $data_name =~ s/^ex\.([^.]+)$/$1/;
940
[6111]941 push (@metadata, $data->[1]) if ($data_name eq $field);
[1241]942 }
[9241]943
[1241]944 return \@metadata;
945}
946
[23827]947sub get_metadata_hashmap {
948 my $self = shift (@_);
949 my ($section, $opt_namespace) = @_;
950
951 my $section_ptr = $self->_lookup_section($section);
952 if (!defined $section_ptr) {
953 print STDERR "doc::get_metadata couldn't find section ",
954 $section, "\n";
955 return;
956 }
957
958 my $metadata_hashmap = {};
959 foreach my $data (@{$section_ptr->{'metadata'}}) {
960 my $metaname = $data->[0];
961
962 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
963 if (!defined $metadata_hashmap->{$metaname}) {
964 $metadata_hashmap->{$metaname} = [];
965 }
966 my $metaval_list = $metadata_hashmap->{$metaname};
967 push(@$metaval_list, $data->[1]);
968 }
969 }
970
971 return $metadata_hashmap;
972}
973
[1241]974# returns a list of the form [[field,value],[field,value],...]
975sub get_all_metadata {
976 my $self = shift (@_);
977 my ($section) = @_;
978
979 my $section_ptr = $self->_lookup_section($section);
980 if (!defined $section_ptr) {
[8716]981 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
[1241]982 return;
983 }
984
985 return $section_ptr->{'metadata'};
986}
987
988# $value is optional
989sub delete_metadata {
990 my $self = shift (@_);
991 my ($section, $field, $value) = @_;
992
993 my $section_ptr = $self->_lookup_section($section);
994 if (!defined $section_ptr) {
[23562]995 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
[1241]996 return;
997 }
998
999 my $i = 0;
1000 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1001 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1002 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1003 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1004 } else {
1005 $i++;
1006 }
1007 }
1008}
1009
1010sub delete_all_metadata {
1011 my $self = shift (@_);
1012 my ($section) = @_;
1013
1014 my $section_ptr = $self->_lookup_section($section);
1015 if (!defined $section_ptr) {
[8716]1016 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
[1241]1017 return;
1018 }
1019
1020 $section_ptr->{'metadata'} = [];
1021}
1022
[4]1023sub set_metadata_element {
1024 my $self = shift (@_);
1025 my ($section, $field, $value) = @_;
1026
[97]1027 $self->set_utf8_metadata_element ($section, $field,
[1870]1028 &unicode::ascii2utf8(\$value));
[73]1029}
1030
1031# set_utf8_metadata_element assumes the text has already been
1032# converted to the UTF-8 encoding.
1033sub set_utf8_metadata_element {
1034 my $self = shift (@_);
1035 my ($section, $field, $value) = @_;
[25557]1036
[4]1037 $self->delete_metadata ($section, $field);
[73]1038 $self->add_utf8_metadata ($section, $field, $value);
[4]1039}
1040
1041
[73]1042# add_metadata assumes the text is in (extended) ascii form. For
[8220]1043# text which has already been converted to the UTF-8 format use
[73]1044# add_utf8_metadata.
[4]1045sub add_metadata {
1046 my $self = shift (@_);
1047 my ($section, $field, $value) = @_;
[25557]1048
[97]1049 $self->add_utf8_metadata ($section, $field,
[1870]1050 &unicode::ascii2utf8(\$value));
[73]1051}
1052
1053sub add_utf8_metadata {
1054 my $self = shift (@_);
1055 my ($section, $field, $value) = @_;
[25557]1056
1057 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1058 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1059 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
[73]1060
[4]1061 my $section_ptr = $self->_lookup_section($section);
1062 if (!defined $section_ptr) {
[8716]1063 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
[4]1064 return;
1065 }
[1732]1066 if (!defined $value) {
1067 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1068 return;
1069 }
1070 if (!defined $field) {
1071 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1072 return;
1073 }
[4]1074
[8894]1075 #print STDERR "###$field=$value\n";
[22950]1076
1077 # For now, supress this check. Given that text data read in is now
1078 # Unicode aware, then the following block of code can (ironically enough)
1079 # cause our unicode compliant string to be re-encoded (leading to
1080 # a double-encoded UTF-8 string, which we definitely don't want!).
1081
1082
[7798]1083 # double check that the value is utf-8
[25557]1084 # if (!&unicode::check_is_utf8($value)) {
1085 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1086 # &unicode::ensure_utf8(\$value);
1087 # print STDERR " Tried converting to utf8: $value\n";
1088 # }
[7798]1089
[25557]1090 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
[26850]1091 if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
[25557]1092 {
[26850]1093 my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1094 if (defined $mdprefix) {
1095 # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1096 push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1097 }
1098
[25557]1099 my $direction;
1100 if($value =~ m/^-/)
1101 {
1102 $direction = ($field eq "Latitude") ? "S" : "W";
1103 }
1104 else
1105 {
1106 $direction = ($field eq "Latitude") ? "N" : "E";
1107 }
1108
1109 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1110 if(defined $beforeDec && defined $afterDec)
1111 {
1112 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1113 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1114
1115 for(my $i = 2; $i <= 4; $i++)
1116 {
1117 if(length($afterDec) >= $i)
1118 {
1119 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1120 }
1121 }
1122
1123 #Only add the metadata if it has not already been added
1124 my $metaMap = $self->get_metadata_hashmap($section);
1125 }
1126 }
1127
[4]1128 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1129}
1130
1131
1132# methods for dealing with text
1133
[1241]1134# returns the text for a section
1135sub get_text {
1136 my $self = shift (@_);
1137 my ($section) = @_;
1138
1139 my $section_ptr = $self->_lookup_section($section);
1140 if (!defined $section_ptr) {
1141 print STDERR "doc::get_text couldn't find section " .
1142 "$section\n";
1143 return "";
1144 }
1145
1146 return $section_ptr->{'text'};
1147}
1148
1149# returns the (utf-8 encoded) length of the text for a section
1150sub get_text_length {
1151 my $self = shift (@_);
1152 my ($section) = @_;
1153
1154 my $section_ptr = $self->_lookup_section($section);
1155 if (!defined $section_ptr) {
1156 print STDERR "doc::get_text_length couldn't find section " .
1157 "$section\n";
1158 return 0;
1159 }
1160
1161 return length ($section_ptr->{'text'});
1162}
1163
[23131]1164# returns the total length for all the sections
1165sub get_total_text_length {
1166 my $self = shift (@_);
1167
1168 my $section = $self->get_top_section();
1169 my $length = 0;
1170 while (defined $section) {
1171 $length += $self->get_text_length($section);
1172 $section = $self->get_next_section($section);
1173 }
1174 return $length;
1175}
1176
[1241]1177sub delete_text {
1178 my $self = shift (@_);
1179 my ($section) = @_;
1180
1181 my $section_ptr = $self->_lookup_section($section);
1182 if (!defined $section_ptr) {
1183 print STDERR "doc::delete_text couldn't find section " .
1184 "$section\n";
1185 return;
1186 }
1187
1188 $section_ptr->{'text'} = "";
1189}
1190
[73]1191# add_text assumes the text is in (extended) ascii form. For
1192# text which has been already converted to the UTF-8 format
1193# use add_utf8_text.
[4]1194sub add_text {
1195 my $self = shift (@_);
1196 my ($section, $text) = @_;
1197
[73]1198 # convert the text to UTF-8 encoded unicode characters
1199 # and add the text
[1870]1200 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
[73]1201}
1202
1203
1204# add_utf8_text assumes the text to be added has already
1205# been converted to the UTF-8 encoding. For ascii text use
1206# add_text
1207sub add_utf8_text {
1208 my $self = shift (@_);
1209 my ($section, $text) = @_;
1210
[4]1211 my $section_ptr = $self->_lookup_section($section);
1212 if (!defined $section_ptr) {
[73]1213 print STDERR "doc::add_utf8_text couldn't find section " .
[4]1214 "$section\n";
1215 return;
1216 }
1217
1218 $section_ptr->{'text'} .= $text;
1219}
1220
[16950]1221# returns the Source meta, which is the utf8 filename generated.
1222# Added a separate method here for convenience
1223sub get_source {
[16924]1224 my $self = shift (@_);
[16950]1225 return $self->get_metadata_element ($self->get_top_section(), "Source");
1226}
1227
1228# returns the SourceFile meta, which is the url reference to the URL-encoded
1229# version of Source (the utf8 filename). Added a separate method here for convenience
1230sub get_sourcefile {
1231 my $self = shift (@_);
1232 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1233}
1234
1235# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1236# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1237sub get_assocfile_from_sourcefile {
1238 my $self = shift (@_);
[16924]1239
1240 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1241 my $top_section = $self->get_top_section();
1242 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
[4]1243
[16924]1244 # get the actual filename as it exists on the filesystem which this url refers to
[16928]1245 $source_file = &unicode::url_to_filename($source_file);
[16924]1246 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1247 return $assocfilename;
1248}
1249
[1241]1250# methods for dealing with associated files
1251
1252# a file is associated with a document, NOT a section.
1253# if section is defined it is noted in the data structure
1254# only so that files associated from a particular section
1255# may be removed later (using delete_section_assoc_files)
1256sub associate_file {
1257 my $self = shift (@_);
1258 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1259 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1260
1261 # remove all associated files with the same name
1262 $self->delete_assoc_file ($assoc_filename);
[23413]1263
1264 # Too harsh a requirement
1265 # Definitely get HTML docs, for example, with some missing
1266 # support files
1267# if (!&util::fd_exists($real_filename)) {
1268# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1269# exit -1;
1270# }
1271
[23387]1272# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1273# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1274## my $utf8_filename = Encode::encode("utf8",$filename);
1275
[1241]1276 push (@{$self->{'associated_files'}},
1277 [$real_filename, $assoc_filename, $mime_type, $section]);
1278}
1279
1280# returns a list of associated files in the form
1281# [[real_filename, assoc_filename, mimetype], ...]
1282sub get_assoc_files {
1283 my $self = shift (@_);
1284
1285 return $self->{'associated_files'};
1286}
1287
[20775]1288# the following two methods used to keep track of original associated files
1289# for incremental building. eg a txt file used by an item file does not end
1290# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1291# file for incremental build
1292sub associate_source_file {
1293 my $self = shift (@_);
1294 my ($full_filename) = @_;
[19494]1295
[20775]1296 push (@{$self->{'source_assoc_files'}}, $full_filename);
1297
1298}
1299
1300sub get_source_assoc_files {
1301 my $self = shift (@_);
1302
1303 return $self->{'source_assoc_files'};
1304
1305
1306}
[19494]1307sub metadata_file {
1308 my $self = shift (@_);
1309 my ($real_filename, $filename) = @_;
1310
1311 push (@{$self->{'metadata_files'}},
1312 [$real_filename, $filename]);
1313}
1314
[21566]1315# used for writing out the archiveinf-doc info database, to list all the metadata files
[19494]1316sub get_meta_files {
1317 my $self = shift (@_);
1318
1319 return $self->{'metadata_files'};
1320}
1321
[1241]1322sub delete_section_assoc_files {
1323 my $self = shift (@_);
1324 my ($section) = @_;
1325
1326 my $i=0;
1327 while ($i < scalar (@{$self->{'associated_files'}})) {
1328 if (defined $self->{'associated_files'}->[$i]->[3] &&
1329 $self->{'associated_files'}->[$i]->[3] eq $section) {
1330 splice (@{$self->{'associated_files'}}, $i, 1);
1331 } else {
1332 $i++;
1333 }
1334 }
1335}
1336
1337sub delete_assoc_file {
1338 my $self = shift (@_);
1339 my ($assoc_filename) = @_;
1340
1341 my $i=0;
1342 while ($i < scalar (@{$self->{'associated_files'}})) {
1343 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1344 splice (@{$self->{'associated_files'}}, $i, 1);
1345 } else {
1346 $i++;
1347 }
1348 }
1349}
1350
1351sub reset_nextsection_ptr {
1352 my $self = shift (@_);
1353 my ($section) = @_;
1354
1355 my $section_ptr = $self->_lookup_section($section);
1356 $section_ptr->{'next_subsection'} = 1;
1357}
1358
[4]13591;
Note: See TracBrowser for help on using the repository browser.