source: main/trunk/greenstone2/perllib/doc.pm@ 26536

Last change on this file since 26536 was 26536, checked in by davidb, 8 years ago

Introduction of two new OIDtype values (hash_on_full_filename and full_filename) designed to help provide more stable document IDs for collections that are rebuilt over time, including rebuilt after the Greenstone install has been upgraded

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 37.8 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47our $OIDcount = 0;
48
49# rename_method can be 'url', 'none', 'base64'
50sub new {
51 my $class = shift (@_);
52 my ($source_filename, $doc_type, $rename_method) = @_;
53
54
55 my $self = bless {'associated_files'=>[],
56 'subsection_order'=>[],
57 'next_subsection'=>1,
58 'subsections'=>{},
59 'metadata'=>[],
60 'text'=>"",
61 'OIDtype'=>"hash"}, $class;
62
63 # used to set lastmodified here, but this can screw up the HASH ids, so
64 # the docsave processor now calls set_lastmodified
65
66 $self->set_source_path($source_filename);
67
68 if (defined $source_filename) {
69 $source_filename = &util::filename_within_collection($source_filename);
70 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
71 $self->set_source_filename ($source_filename, $rename_method);
72 }
73
74 $self->set_doc_type ($doc_type) if defined $doc_type;
75
76 return $self;
77}
78
79
80sub set_source_path
81{
82 my $self = shift @_;
83 my ($source_filename) = @_;
84
85 if (defined $source_filename) {
86 # On Windows the source_filename can be in terse DOS format
87 # e.g. test~1.txt
88
89 $self->{'terse_source_path'} = $source_filename;
90
91 if (-e $source_filename) {
92 # See if we can do better for Windows with a filename
93 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
94 require Win32;
95 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
96 }
97 else {
98 # For Unix-based systems, there is no difference between the two
99 $self->{'source_path'} = $source_filename;
100 }
101 }
102 else {
103 print STDERR "Warning: In doc::set_source_path(), file\n";
104 print STDERR " $source_filename\n";
105 print STDERR " does not exist\n";
106
107 # (default) Set it to whatever we were given
108 $self->{'source_path'} = $source_filename;
109 }
110 }
111 else {
112 # Previous code for setting source_path allowed for
113 # it to be undefined, so continue this practice
114 $self->{'terse_source_path'} = undef;
115 $self->{'source_path'} = undef;
116 }
117}
118
119
120sub get_source_path
121{
122 my $self = shift @_;
123
124 return $self->{'terse_source_path'};
125}
126
127# set lastmodified for OAI purposes, added by GRB, moved by kjdon
128sub set_oailastmodified {
129 my $self = shift (@_);
130
131 my $source_path = $self->{'terse_source_path'};
132
133 if (defined $source_path && (-e $source_path)) {
134 my $current_time = time;
135
136 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
137 $wday, $yday, $isdst) = localtime($current_time);
138
139 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
140
141 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
142 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
143 }
144}
145
146# no longer used for OAI purposes, since lastmodified is not what we want as the
147# Datestamp of a document. This doc metadata may be useful for general purposes.
148sub set_lastmodified {
149 my $self = shift (@_);
150
151 my $source_path = $self->{'terse_source_path'};
152
153 if (defined $source_path && (-e $source_path)) {
154
155 my $file_stat = stat($source_path);
156 my $mtime = $file_stat->mtime;
157 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
158 $wday, $yday, $isdst) = localtime($mtime);
159
160 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
161
162 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
163 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
164 }
165}
166
167# clone the $self object
168sub duplicate {
169 my $self = shift (@_);
170
171 my $newobj = {};
172
173 foreach my $k (keys %$self) {
174 $newobj->{$k} = &clone ($self->{$k});
175 }
176
177 bless $newobj, ref($self);
178 return $newobj;
179}
180
181sub clone {
182 my ($from) = @_;
183 my $type = ref ($from);
184
185 if ($type eq "HASH") {
186 my $to = {};
187 foreach my $key (keys %$from) {
188 $to->{$key} = &clone ($from->{$key});
189 }
190 return $to;
191 } elsif ($type eq "ARRAY") {
192 my $to = [];
193 foreach my $v (@$from) {
194 push (@$to, &clone ($v));
195 }
196 return $to;
197 } else {
198 return $from;
199 }
200}
201
202sub set_OIDtype {
203 my $self = shift (@_);
204 my ($type, $metadata) = @_;
205
206 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
207 $self->{'OIDtype'} = $type;
208 } else {
209 $self->{'OIDtype'} = "hash";
210 }
211
212 if ($type =~ /^assigned$/) {
213 if (defined $metadata) {
214 $self->{'OIDmetadata'} = $metadata;
215 } else {
216 $self->{'OIDmetadata'} = "dc.Identifier";
217 }
218 }
219}
220
221# rename_method can be 'url', 'none', 'base64'
222sub set_source_filename {
223 my $self = shift (@_);
224 my ($source_filename, $rename_method) = @_;
225
226 # Since the gsdlsourcefilename element goes into the doc.xml it has
227 # to be utf8. However, it should also *represent* the source filename
228 # (in the import directory) which may not be utf8 at all.
229 # For instance, if this meta element (gsdlsourcefilename) will be used
230 # by other applications that parse doc.xml in order to locate
231 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
232 # encode the real filename as this is a binary-to-text encoding meaning
233 # that the resulting string is ASCII (utf8). Decoding will give the original.
234
235# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
236
237 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
238 $source_filename = $self->encode_filename($source_filename, $rename_method);
239# my ($srcfilename,$dirname,$suffix)
240# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
241# print STDERR "-> $srcfilename -> ";
242# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
243# $source_filename = &util::filename_cat($dirname, $srcfilename);
244# print STDERR "$source_filename\n";
245
246 $self->set_utf8_metadata_element ($self->get_top_section(),
247 "gsdlsourcefilename",
248 $source_filename);
249}
250
251sub encode_filename {
252 my $self = shift (@_);
253 my ($source_filename, $rename_method) = @_;
254
255 my ($srcfilename,$dirname,$suffix)
256 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
257# print STDERR "-> $srcfilename -> ";
258 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
259 $source_filename = &util::filename_cat($dirname, $srcfilename);
260
261 return $source_filename;
262}
263
264sub set_converted_filename {
265 my $self = shift (@_);
266 my ($converted_filename) = @_;
267
268 # we know the converted filename is utf8
269 $self->set_utf8_metadata_element ($self->get_top_section(),
270 "gsdlconvertedfilename",
271 $converted_filename);
272}
273
274# returns the source_filename as it was provided
275sub get_unmodified_source_filename {
276 my $self = shift (@_);
277
278 return $self->{'terse_source_path'};
279}
280
281# returns the source_filename with whatever rename_method was given
282sub get_source_filename {
283 my $self = shift (@_);
284
285 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
286}
287
288
289
290# returns converted filename if available else returns source filename
291sub get_filename_for_hashing {
292 my $self = shift (@_);
293
294 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
295
296 if (!defined $filename) {
297 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
298 # if NULPlug processed file, then don't give a filename
299 if (defined $plugin_name && $plugin_name eq "NULPlug") {
300 $filename = undef;
301 } else { # returns the URL encoded source filename!
302 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
303 }
304 }
305
306 if (!&util::filename_is_absolute($filename)) {
307 $filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename);
308 }
309
310 return $filename;
311}
312
313sub set_doc_type {
314 my $self = shift (@_);
315 my ($doc_type) = @_;
316
317 $self->set_metadata_element ($self->get_top_section(),
318 "gsdldoctype",
319 $doc_type);
320}
321
322# returns the gsdldoctype as it was provided
323# the default of "indexed_doc" is used if no document
324# type was provided
325sub get_doc_type {
326 my $self = shift (@_);
327
328 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
329 return $doc_type if (defined $doc_type);
330 return "indexed_doc";
331}
332
333
334# look up the reference to the a particular section
335sub _lookup_section {
336 my $self = shift (@_);
337 my ($section) = @_;
338
339 my ($num);
340 my $sectionref = $self;
341
342 while (defined $section && $section ne "") {
343
344 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
345
346 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
347
348 $section = "" unless defined $section;
349
350
351 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
352 $sectionref = $sectionref->{'subsections'}->{$num};
353 } else {
354 return undef;
355 }
356 }
357
358 return $sectionref;
359}
360
361# calculate OID by hashing the contents of the document
362sub _calc_OID {
363 my $self = shift (@_);
364 my ($filename) = @_;
365
366
367 my $osexe = &util::get_os_exe();
368
369 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
370 $ENV{'GSDLOS'},"hashfile$osexe");
371
372 my $result = "NULL";
373
374
375 if (-e "$hashfile_exe") {
376# $result = `\"$hashfile_exe\" \"$filename\"`;
377# $result = `hashfile$osexe \"$filename\" 2>&1`;
378 $result = `hashfile$osexe \"$filename\"`;
379
380 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
381 } else {
382 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
383 }
384 return "HASH$result";
385}
386
387# methods dealing with OID, not groups of them.
388
389# if $OID is not provided one is calculated
390sub set_OID {
391 my $self = shift (@_);
392 my ($OID) = @_;
393
394 my $use_hash_oid = 0;
395 # if an OID wasn't provided calculate one
396 if (!defined $OID) {
397 $OID = "NULL";
398 if ($self->{'OIDtype'} =~ /^hash/) {
399 $use_hash_oid = 1;
400 } elsif ($self->{'OIDtype'} eq "incremental") {
401 $OID = "D" . $OIDcount;
402 $OIDcount ++;
403 } elsif ($self->{'OIDtype'} eq "filename") {
404 my $filename = $self->get_source_filename();
405 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
406 $OID = &util::tidy_up_oid($OID);
407 } elsif ($self->{'OIDtype'} eq "full_filename") {
408 my $source_filename = $self->get_source_filename();
409 my $dirsep = &util::get_os_dirsep();
410
411 $source_filename =~ s/^import$dirsep//;
412 $source_filename =~ s/$dirsep/-/g;
413 $source_filename =~ s/\./_/g;
414
415 $OID = $source_filename;
416 $OID = &util::tidy_up_oid($OID);
417 } elsif ($self->{'OIDtype'} eq "dirname") {
418 $OID = 'J';
419 my $filename = $self->get_source_filename();
420 if (defined($filename)) { # && -e $filename) {
421 $OID = &File::Basename::dirname($filename);
422 if (defined $OID) {
423 $OID = 'J'.&File::Basename::basename($OID);
424 $OID = &util::tidy_up_oid($OID);
425 } else {
426 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
427 $use_hash_oid = 1;
428 }
429 } else {
430 print STDERR "Failed to find filename, generating hash id\n";
431 $use_hash_oid = 1;
432 }
433
434 } elsif ($self->{'OIDtype'} eq "assigned") {
435 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
436 if (defined $identifier && $identifier ne "") {
437 $OID = $identifier;
438 $OID = &util::tidy_up_oid($OID);
439 } else {
440 # need a hash id
441 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
442 $use_hash_oid = 1;
443 }
444
445 } else {
446 $use_hash_oid = 1;
447 }
448
449 if ($use_hash_oid) {
450 my $hash_on_file = 1;
451 my $hash_on_ga_xml = 0;
452
453 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
454 $hash_on_file = 0;
455 $hash_on_ga_xml = 1;
456 }
457
458 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
459 $hash_on_file = 0;
460 $hash_on_ga_xml = 0;
461
462 my $source_filename = $self->get_source_filename();
463 my $dirsep = &util::get_os_dirsep();
464
465 $source_filename =~ s/^import$dirsep//;
466 $source_filename =~ s/$dirsep/-/g;
467 $source_filename =~ s/\./_/g;
468
469 # If the filename is very short then (handled naively)
470 # this can cause conjestion in the hash-values
471 # computed, leading documents sharing the same leading
472 # Hex values in the computed has.
473 #
474 # The solution taken here is to replace the name of
475 # the file name a sufficient number of times (up to
476 # the character limit defined in 'rep_limit' and
477 # make that the content that is hashed on
478
479 # *** Think twice before changing the following value
480 # as it will break backward compatability of computed
481 # document HASH values
482
483 my $rep_limit = 256;
484 my $hash_content = undef;
485
486 if (length($source_filename)<$rep_limit) {
487 my $rep_string = "$source_filename|";
488 my $rs_len = length($rep_string);
489
490 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
491
492 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
493 }
494 else {
495 $hash_content = $source_filename;
496 }
497
498 my $filename = &util::get_tmp_filename();
499 if (!open (OUTFILE, ">:utf8", $filename)) {
500 print STDERR "doc::set_OID could not write to $filename\n";
501 } else {
502 print OUTFILE $hash_content;
503 close (OUTFILE);
504 }
505 $OID = $self->_calc_OID ($filename);
506
507 print STDERR "****!!! the computed hash for: '", $source_filename, "' is: ", $OID,"\n\n";
508
509 &util::rm ($filename);
510 }
511
512 if ($hash_on_file) {
513 # "hash" OID - feed file to hashfile.exe
514 my $filename = $self->get_filename_for_hashing();
515
516 # -z: don't want to hash on the file if it is zero size
517 if (defined($filename) && -e $filename && !-z $filename) {
518 $OID = $self->_calc_OID ($filename);
519 } else {
520 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
521 }
522 }
523
524 if ($hash_on_ga_xml) {
525 # In addition being asked to explicity calculate the has based on the GA file,
526 # can also end up coming into this block is doing 'hash_on_file' but the file
527 # itself is of zero bytes (as could be the case with 'doc.nul' file
528
529 my $filename = &util::get_tmp_filename();
530 if (!open (OUTFILE, ">:utf8", $filename)) {
531 print STDERR "doc::set_OID could not write to $filename\n";
532 } else {
533 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
534 print OUTFILE $doc_text;
535 close (OUTFILE);
536 }
537 $OID = $self->_calc_OID ($filename);
538 &util::rm ($filename);
539 }
540 }
541 }
542 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
543}
544
545# this uses hashdoc (embedded c thingy) which is faster but still
546# needs a little work to be suffiently stable
547sub ___set_OID {
548 my $self = shift (@_);
549 my ($OID) = @_;
550
551 # if an OID wasn't provided then calculate hash value based on document
552 if (!defined $OID)
553 {
554 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
555 my $hash_len = length($hash_text);
556
557 $OID = &hashdoc::buffer($hash_text,$hash_len);
558 }
559
560 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
561}
562
563# returns the OID for this document
564sub get_OID {
565 my $self = shift (@_);
566 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
567 return $OID if (defined $OID);
568 return "NULL";
569}
570
571sub delete_OID {
572 my $self = shift (@_);
573
574 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
575}
576
577
578# methods for manipulating section names
579
580# returns the name of the top-most section (the top
581# level of the document
582sub get_top_section {
583 my $self = shift (@_);
584
585 return "";
586}
587
588# returns a section
589sub get_parent_section {
590 my $self = shift (@_);
591 my ($section) = @_;
592
593 $section =~ s/(^|\.)\d+$//;
594
595 return $section;
596}
597
598# returns the first child section (or the end child
599# if there isn't any)
600sub get_begin_child {
601 my $self = shift (@_);
602 my ($section) = @_;
603
604 my $section_ptr = $self->_lookup_section($section);
605 return "" unless defined $section_ptr;
606
607 if (defined $section_ptr->{'subsection_order'}->[0]) {
608 return "$section.$section_ptr->{'subsection_order'}->[0]";
609 }
610
611 return $self->get_end_child ($section);
612}
613
614# returns the next child of a parent section
615sub get_next_child {
616 my $self = shift (@_);
617 my ($section) = @_;
618
619 my $parent_section = $self->get_parent_section($section);
620 my $parent_section_ptr = $self->_lookup_section($parent_section);
621 return undef unless defined $parent_section_ptr;
622
623 my ($section_num) = $section =~ /(\d+)$/;
624 return undef unless defined $section_num;
625
626 my $i = 0;
627 my $section_order = $parent_section_ptr->{'subsection_order'};
628 while ($i < scalar(@$section_order)) {
629 last if $section_order->[$i] eq $section_num;
630 $i++;
631 }
632
633 $i++; # the next child
634 if ($i < scalar(@$section_order)) {
635 return $section_order->[$i] if $parent_section eq "";
636 return "$parent_section.$section_order->[$i]";
637 }
638
639 # no more sections in this level
640 return undef;
641}
642
643# returns a reference to a list of children
644sub get_children {
645 my $self = shift (@_);
646 my ($section) = @_;
647
648 my $section_ptr = $self->_lookup_section($section);
649 return [] unless defined $section_ptr;
650
651 my @children = @{$section_ptr->{'subsection_order'}};
652
653 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
654 return \@children;
655}
656
657# returns the child section one past the last one (which
658# is coded as "0")
659sub get_end_child {
660 my $self = shift (@_);
661 my ($section) = @_;
662
663 return $section . ".0" unless $section eq "";
664 return "0";
665}
666
667# returns the next section in book order
668sub get_next_section {
669 my $self = shift (@_);
670 my ($section) = @_;
671
672 return undef unless defined $section;
673
674 my $section_ptr = $self->_lookup_section($section);
675 return undef unless defined $section_ptr;
676
677 # first try to find first child
678 if (defined $section_ptr->{'subsection_order'}->[0]) {
679 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
680 return "$section.$section_ptr->{'subsection_order'}->[0]";
681 }
682
683 do {
684 # try to find sibling
685 my $next_child = $self->get_next_child ($section);
686 return $next_child if (defined $next_child);
687
688 # move up one level
689 $section = $self->get_parent_section ($section);
690 } while $section =~ /\d/;
691
692 return undef;
693}
694
695sub is_leaf_section {
696 my $self = shift (@_);
697 my ($section) = @_;
698
699 my $section_ptr = $self->_lookup_section($section);
700 return 1 unless defined $section_ptr;
701
702 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
703}
704
705# methods for dealing with sections
706
707# returns the name of the inserted section
708sub insert_section {
709 my $self = shift (@_);
710 my ($before_section) = @_;
711
712 # get the child to insert before and its parent section
713 my $parent_section = "";
714 my $before_child = "0";
715 my @before_section = split (/\./, $before_section);
716 if (scalar(@before_section) > 0) {
717 $before_child = pop (@before_section);
718 $parent_section = join (".", @before_section);
719 }
720
721 my $parent_section_ptr = $self->_lookup_section($parent_section);
722 if (!defined $parent_section_ptr) {
723 print STDERR "doc::insert_section couldn't find parent section " .
724 "$parent_section\n";
725 return;
726 }
727
728 # get the next section number
729 my $section_num = $parent_section_ptr->{'next_subsection'}++;
730
731 my $i = 0;
732 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
733 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
734 $i++;
735 }
736
737 # insert the section number into the order list
738 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
739
740 # add this section to the parent section
741 my $section_ptr = {'subsection_order'=>[],
742 'next_subsection'=>1,
743 'subsections'=>{},
744 'metadata'=>[],
745 'text'=>""};
746 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
747
748 # work out the full section number
749 my $section = $parent_section;
750 $section .= "." unless $section eq "";
751 $section .= $section_num;
752
753 return $section;
754}
755
756# creates a pre-named section
757sub create_named_section {
758 my $self = shift (@_);
759 my ($mastersection) = @_;
760
761 my ($num);
762 my $section = $mastersection;
763 my $sectionref = $self;
764
765 while ($section ne "") {
766 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
767 $num =~ s/^0+(\d)/$1/; # remove leading 0s
768 $section = "" unless defined $section;
769
770 if (defined $num) {
771 if (!defined $sectionref->{'subsections'}->{$num}) {
772 push (@{$sectionref->{'subsection_order'}}, $num);
773 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
774 'next_subsection'=>1,
775 'subsections'=>{},
776 'metadata'=>[],
777 'text'=>""};
778 if ($num >= $sectionref->{'next_subsection'}) {
779 $sectionref->{'next_subsection'} = $num + 1;
780 }
781 }
782 $sectionref = $sectionref->{'subsections'}->{$num};
783
784 } else {
785 print STDERR "doc::create_named_section couldn't create section ";
786 print STDERR "$mastersection\n";
787 last;
788 }
789 }
790}
791
792# returns a reference to a list of subsections
793sub list_subsections {
794 my $self = shift (@_);
795 my ($section) = @_;
796
797 my $section_ptr = $self->_lookup_section ($section);
798 if (!defined $section_ptr) {
799 print STDERR "doc::list_subsections couldn't find section $section\n";
800 return [];
801 }
802
803 return [@{$section_ptr->{'subsection_order'}}];
804}
805
806sub delete_section {
807 my $self = shift (@_);
808 my ($section) = @_;
809
810# my $section_ptr = {'subsection_order'=>[],
811# 'next_subsection'=>1,
812# 'subsections'=>{},
813# 'metadata'=>[],
814# 'text'=>""};
815
816 # if this is the top section reset everything
817 if ($section eq "") {
818 $self->{'subsection_order'} = [];
819 $self->{'subsections'} = {};
820 $self->{'metadata'} = [];
821 $self->{'text'} = "";
822 return;
823 }
824
825 # find the parent of the section to delete
826 my $parent_section = "";
827 my $child = "0";
828 my @section = split (/\./, $section);
829 if (scalar(@section) > 0) {
830 $child = pop (@section);
831 $parent_section = join (".", @section);
832 }
833
834 my $parent_section_ptr = $self->_lookup_section($parent_section);
835 if (!defined $parent_section_ptr) {
836 print STDERR "doc::delete_section couldn't find parent section " .
837 "$parent_section\n";
838 return;
839 }
840
841 # remove this section from the subsection_order list
842 my $i = 0;
843 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
844 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
845 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
846 last;
847 }
848 $i++;
849 }
850
851 # remove this section from the subsection hash
852 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
853 undef $parent_section_ptr->{'subsections'}->{$child};
854 }
855}
856
857#--
858# methods for dealing with metadata
859
860# set_metadata_element and get_metadata_element are for metadata
861# which should only have one value. add_meta_data and get_metadata
862# are for metadata which can have more than one value.
863
864# returns the first metadata value which matches field
865
866# This version of get metadata element works much like the one above,
867# except it allows for the namespace portion of a metadata element to
868# be ignored, thus if you are searching for dc.Title, the first piece
869# of matching metadata ending with the name Title (once any namespace
870# is removed) would be returned.
871# 28-11-2003 John Thompson
872sub get_metadata_element {
873 my $self = shift (@_);
874 my ($section, $field, $ignore_namespace) = @_;
875 my ($data);
876
877 $ignore_namespace = 0 unless defined $ignore_namespace;
878
879 my $section_ptr = $self->_lookup_section($section);
880 if (!defined $section_ptr) {
881 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
882 return;
883 }
884
885 # Remove any namespace if we are being told to ignore them
886 if($ignore_namespace) {
887 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
888 }
889
890 foreach $data (@{$section_ptr->{'metadata'}}) {
891
892 my $data_name = $data->[0];
893
894 # Remove any namespace if we are being told to ignore them
895 if($ignore_namespace) {
896 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
897 }
898 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
899 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
900
901 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
902 }
903
904 return undef; # was not found
905}
906
907# returns a list of the form [value1, value2, ...]
908sub get_metadata {
909 my $self = shift (@_);
910 my ($section, $field, $ignore_namespace) = @_;
911 my ($data);
912
913 $ignore_namespace = 0 unless defined $ignore_namespace;
914
915 my $section_ptr = $self->_lookup_section($section);
916 if (!defined $section_ptr) {
917 print STDERR "doc::get_metadata couldn't find section ",
918 $section, "\n";
919 return;
920 }
921
922 # Remove any namespace if we are being told to ignore them
923 if($ignore_namespace) {
924 $field =~ s/^.*\.//;
925 }
926
927 my @metadata = ();
928 foreach $data (@{$section_ptr->{'metadata'}}) {
929
930 my $data_name = $data->[0];
931
932 # Remove any namespace if we are being told to ignore them
933 if($ignore_namespace) {
934 $data_name =~ s/^.*\.//;
935 }
936 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
937 $data_name =~ s/^ex\.([^.]+)$/$1/;
938
939 push (@metadata, $data->[1]) if ($data_name eq $field);
940 }
941
942 return \@metadata;
943}
944
945sub get_metadata_hashmap {
946 my $self = shift (@_);
947 my ($section, $opt_namespace) = @_;
948
949 my $section_ptr = $self->_lookup_section($section);
950 if (!defined $section_ptr) {
951 print STDERR "doc::get_metadata couldn't find section ",
952 $section, "\n";
953 return;
954 }
955
956 my $metadata_hashmap = {};
957 foreach my $data (@{$section_ptr->{'metadata'}}) {
958 my $metaname = $data->[0];
959
960 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
961 if (!defined $metadata_hashmap->{$metaname}) {
962 $metadata_hashmap->{$metaname} = [];
963 }
964 my $metaval_list = $metadata_hashmap->{$metaname};
965 push(@$metaval_list, $data->[1]);
966 }
967 }
968
969 return $metadata_hashmap;
970}
971
972# returns a list of the form [[field,value],[field,value],...]
973sub get_all_metadata {
974 my $self = shift (@_);
975 my ($section) = @_;
976
977 my $section_ptr = $self->_lookup_section($section);
978 if (!defined $section_ptr) {
979 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
980 return;
981 }
982
983 return $section_ptr->{'metadata'};
984}
985
986# $value is optional
987sub delete_metadata {
988 my $self = shift (@_);
989 my ($section, $field, $value) = @_;
990
991 my $section_ptr = $self->_lookup_section($section);
992 if (!defined $section_ptr) {
993 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
994 return;
995 }
996
997 my $i = 0;
998 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
999 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1000 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1001 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1002 } else {
1003 $i++;
1004 }
1005 }
1006}
1007
1008sub delete_all_metadata {
1009 my $self = shift (@_);
1010 my ($section) = @_;
1011
1012 my $section_ptr = $self->_lookup_section($section);
1013 if (!defined $section_ptr) {
1014 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1015 return;
1016 }
1017
1018 $section_ptr->{'metadata'} = [];
1019}
1020
1021sub set_metadata_element {
1022 my $self = shift (@_);
1023 my ($section, $field, $value) = @_;
1024
1025 $self->set_utf8_metadata_element ($section, $field,
1026 &unicode::ascii2utf8(\$value));
1027}
1028
1029# set_utf8_metadata_element assumes the text has already been
1030# converted to the UTF-8 encoding.
1031sub set_utf8_metadata_element {
1032 my $self = shift (@_);
1033 my ($section, $field, $value) = @_;
1034
1035 $self->delete_metadata ($section, $field);
1036 $self->add_utf8_metadata ($section, $field, $value);
1037}
1038
1039
1040# add_metadata assumes the text is in (extended) ascii form. For
1041# text which has already been converted to the UTF-8 format use
1042# add_utf8_metadata.
1043sub add_metadata {
1044 my $self = shift (@_);
1045 my ($section, $field, $value) = @_;
1046
1047 $self->add_utf8_metadata ($section, $field,
1048 &unicode::ascii2utf8(\$value));
1049}
1050
1051sub add_utf8_metadata {
1052 my $self = shift (@_);
1053 my ($section, $field, $value) = @_;
1054
1055 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1056 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1057 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1058
1059 my $section_ptr = $self->_lookup_section($section);
1060 if (!defined $section_ptr) {
1061 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1062 return;
1063 }
1064 if (!defined $value) {
1065 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1066 return;
1067 }
1068 if (!defined $field) {
1069 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1070 return;
1071 }
1072
1073 #print STDERR "###$field=$value\n";
1074
1075 # For now, supress this check. Given that text data read in is now
1076 # Unicode aware, then the following block of code can (ironically enough)
1077 # cause our unicode compliant string to be re-encoded (leading to
1078 # a double-encoded UTF-8 string, which we definitely don't want!).
1079
1080
1081 # double check that the value is utf-8
1082 # if (!&unicode::check_is_utf8($value)) {
1083 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1084 # &unicode::ensure_utf8(\$value);
1085 # print STDERR " Tried converting to utf8: $value\n";
1086 # }
1087
1088 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1089 if($field eq "Latitude" || $field eq "Longitude")
1090 {
1091 my $direction;
1092 if($value =~ m/^-/)
1093 {
1094 $direction = ($field eq "Latitude") ? "S" : "W";
1095 }
1096 else
1097 {
1098 $direction = ($field eq "Latitude") ? "N" : "E";
1099 }
1100
1101 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1102 if(defined $beforeDec && defined $afterDec)
1103 {
1104 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1105 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1106
1107 for(my $i = 2; $i <= 4; $i++)
1108 {
1109 if(length($afterDec) >= $i)
1110 {
1111 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1112 }
1113 }
1114
1115 #Only add the metadata if it has not already been added
1116 my $metaMap = $self->get_metadata_hashmap($section);
1117 }
1118 }
1119
1120 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1121}
1122
1123
1124# methods for dealing with text
1125
1126# returns the text for a section
1127sub get_text {
1128 my $self = shift (@_);
1129 my ($section) = @_;
1130
1131 my $section_ptr = $self->_lookup_section($section);
1132 if (!defined $section_ptr) {
1133 print STDERR "doc::get_text couldn't find section " .
1134 "$section\n";
1135 return "";
1136 }
1137
1138 return $section_ptr->{'text'};
1139}
1140
1141# returns the (utf-8 encoded) length of the text for a section
1142sub get_text_length {
1143 my $self = shift (@_);
1144 my ($section) = @_;
1145
1146 my $section_ptr = $self->_lookup_section($section);
1147 if (!defined $section_ptr) {
1148 print STDERR "doc::get_text_length couldn't find section " .
1149 "$section\n";
1150 return 0;
1151 }
1152
1153 return length ($section_ptr->{'text'});
1154}
1155
1156# returns the total length for all the sections
1157sub get_total_text_length {
1158 my $self = shift (@_);
1159
1160 my $section = $self->get_top_section();
1161 my $length = 0;
1162 while (defined $section) {
1163 $length += $self->get_text_length($section);
1164 $section = $self->get_next_section($section);
1165 }
1166 return $length;
1167}
1168
1169sub delete_text {
1170 my $self = shift (@_);
1171 my ($section) = @_;
1172
1173 my $section_ptr = $self->_lookup_section($section);
1174 if (!defined $section_ptr) {
1175 print STDERR "doc::delete_text couldn't find section " .
1176 "$section\n";
1177 return;
1178 }
1179
1180 $section_ptr->{'text'} = "";
1181}
1182
1183# add_text assumes the text is in (extended) ascii form. For
1184# text which has been already converted to the UTF-8 format
1185# use add_utf8_text.
1186sub add_text {
1187 my $self = shift (@_);
1188 my ($section, $text) = @_;
1189
1190 # convert the text to UTF-8 encoded unicode characters
1191 # and add the text
1192 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1193}
1194
1195
1196# add_utf8_text assumes the text to be added has already
1197# been converted to the UTF-8 encoding. For ascii text use
1198# add_text
1199sub add_utf8_text {
1200 my $self = shift (@_);
1201 my ($section, $text) = @_;
1202
1203 my $section_ptr = $self->_lookup_section($section);
1204 if (!defined $section_ptr) {
1205 print STDERR "doc::add_utf8_text couldn't find section " .
1206 "$section\n";
1207 return;
1208 }
1209
1210 $section_ptr->{'text'} .= $text;
1211}
1212
1213# returns the Source meta, which is the utf8 filename generated.
1214# Added a separate method here for convenience
1215sub get_source {
1216 my $self = shift (@_);
1217 return $self->get_metadata_element ($self->get_top_section(), "Source");
1218}
1219
1220# returns the SourceFile meta, which is the url reference to the URL-encoded
1221# version of Source (the utf8 filename). Added a separate method here for convenience
1222sub get_sourcefile {
1223 my $self = shift (@_);
1224 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1225}
1226
1227# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1228# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1229sub get_assocfile_from_sourcefile {
1230 my $self = shift (@_);
1231
1232 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1233 my $top_section = $self->get_top_section();
1234 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1235
1236 # get the actual filename as it exists on the filesystem which this url refers to
1237 $source_file = &unicode::url_to_filename($source_file);
1238 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1239 return $assocfilename;
1240}
1241
1242# methods for dealing with associated files
1243
1244# a file is associated with a document, NOT a section.
1245# if section is defined it is noted in the data structure
1246# only so that files associated from a particular section
1247# may be removed later (using delete_section_assoc_files)
1248sub associate_file {
1249 my $self = shift (@_);
1250 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1251 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1252
1253 # remove all associated files with the same name
1254 $self->delete_assoc_file ($assoc_filename);
1255
1256 # Too harsh a requirement
1257 # Definitely get HTML docs, for example, with some missing
1258 # support files
1259# if (!&util::fd_exists($real_filename)) {
1260# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1261# exit -1;
1262# }
1263
1264# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1265# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1266## my $utf8_filename = Encode::encode("utf8",$filename);
1267
1268 push (@{$self->{'associated_files'}},
1269 [$real_filename, $assoc_filename, $mime_type, $section]);
1270}
1271
1272# returns a list of associated files in the form
1273# [[real_filename, assoc_filename, mimetype], ...]
1274sub get_assoc_files {
1275 my $self = shift (@_);
1276
1277 return $self->{'associated_files'};
1278}
1279
1280# the following two methods used to keep track of original associated files
1281# for incremental building. eg a txt file used by an item file does not end
1282# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1283# file for incremental build
1284sub associate_source_file {
1285 my $self = shift (@_);
1286 my ($full_filename) = @_;
1287
1288 push (@{$self->{'source_assoc_files'}}, $full_filename);
1289
1290}
1291
1292sub get_source_assoc_files {
1293 my $self = shift (@_);
1294
1295 return $self->{'source_assoc_files'};
1296
1297
1298}
1299sub metadata_file {
1300 my $self = shift (@_);
1301 my ($real_filename, $filename) = @_;
1302
1303 push (@{$self->{'metadata_files'}},
1304 [$real_filename, $filename]);
1305}
1306
1307# used for writing out the archiveinf-doc info database, to list all the metadata files
1308sub get_meta_files {
1309 my $self = shift (@_);
1310
1311 return $self->{'metadata_files'};
1312}
1313
1314sub delete_section_assoc_files {
1315 my $self = shift (@_);
1316 my ($section) = @_;
1317
1318 my $i=0;
1319 while ($i < scalar (@{$self->{'associated_files'}})) {
1320 if (defined $self->{'associated_files'}->[$i]->[3] &&
1321 $self->{'associated_files'}->[$i]->[3] eq $section) {
1322 splice (@{$self->{'associated_files'}}, $i, 1);
1323 } else {
1324 $i++;
1325 }
1326 }
1327}
1328
1329sub delete_assoc_file {
1330 my $self = shift (@_);
1331 my ($assoc_filename) = @_;
1332
1333 my $i=0;
1334 while ($i < scalar (@{$self->{'associated_files'}})) {
1335 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1336 splice (@{$self->{'associated_files'}}, $i, 1);
1337 } else {
1338 $i++;
1339 }
1340 }
1341}
1342
1343sub reset_nextsection_ptr {
1344 my $self = shift (@_);
1345 my ($section) = @_;
1346
1347 my $section_ptr = $self->_lookup_section($section);
1348 $section_ptr->{'next_subsection'} = 1;
1349}
1350
13511;
Note: See TracBrowser for help on using the repository browser.