source: gs2-extensions/parallel-building/trunk/src/perllib/doc.pm@ 26958

Last change on this file since 26958 was 26958, checked in by jmt12, 11 years ago

Using the util library instead of built-in perl file tests

File size: 34.5 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33# unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47our $OIDcount = 0;
48
49# rename_method can be 'url', 'none', 'base64'
50sub new {
51 my $class = shift (@_);
52 my ($source_filename, $doc_type, $rename_method) = @_;
53
54
55 my $self = bless {'associated_files'=>[],
56 'subsection_order'=>[],
57 'next_subsection'=>1,
58 'subsections'=>{},
59 'metadata'=>[],
60 'text'=>"",
61 'OIDtype'=>"hash"}, $class;
62
63 # used to set lastmodified here, but this can screw up the HASH ids, so
64 # the docsave processor now calls set_lastmodified
65
66 $self->set_source_path($source_filename);
67
68 if (defined $source_filename) {
69 $source_filename = &util::filename_within_collection($source_filename);
70 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
71 $self->set_source_filename ($source_filename, $rename_method);
72 }
73
74 $self->set_doc_type ($doc_type) if defined $doc_type;
75
76 return $self;
77}
78
79
80sub set_source_path
81{
82 my $self = shift @_;
83 my ($source_filename) = @_;
84
85 if (defined $source_filename) {
86 # On Windows the source_filename can be in terse DOS format
87 # e.g. test~1.txt
88
89 $self->{'terse_source_path'} = $source_filename;
90
91 # Use the Util library methods as they are aware of more special cases
92 # such as HDFS [jmt12]
93 if (&util::file_exists($source_filename)) {
94 # See if we can do better for Windows with a filename
95 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
96 require Win32;
97 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
98 }
99 else {
100 # For Unix-based systems, there is no difference between the two
101 $self->{'source_path'} = $source_filename;
102 }
103 }
104 else {
105 print STDERR "Warning: In doc::set_source_path(), file\n";
106 print STDERR " $source_filename\n";
107 print STDERR " does not exist\n";
108
109 # (default) Set it to whatever we were given
110 $self->{'source_path'} = $source_filename;
111 }
112 }
113 else {
114 # Previous code for setting source_path allowed for
115 # it to be undefined, so continue this practice
116 $self->{'terse_source_path'} = undef;
117 $self->{'source_path'} = undef;
118 }
119}
120
121
122sub get_source_path
123{
124 my $self = shift @_;
125
126 return $self->{'terse_source_path'};
127}
128
129# set lastmodified for OAI purposes, added by GRB, moved by kjdon
130sub set_oailastmodified {
131 my $self = shift (@_);
132
133 my $source_path = $self->{'terse_source_path'};
134
135 if (defined $source_path && (-e $source_path)) {
136 my $current_time = time;
137
138 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
139 $wday, $yday, $isdst) = localtime($current_time);
140
141 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
142
143 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
144 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
145 }
146}
147
148# no longer used for OAI purposes, since lastmodified is not what we want as the
149# Datestamp of a document. This doc metadata may be useful for general purposes.
150sub set_lastmodified {
151 my $self = shift (@_);
152
153 my $source_path = $self->{'terse_source_path'};
154
155 if (defined $source_path && (-e $source_path)) {
156
157 my $file_stat = stat($source_path);
158 my $mtime = $file_stat->mtime;
159 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
160 $wday, $yday, $isdst) = localtime($mtime);
161
162 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
163
164 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
165 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
166 }
167}
168
169# clone the $self object
170sub duplicate {
171 my $self = shift (@_);
172
173 my $newobj = {};
174
175 foreach my $k (keys %$self) {
176 $newobj->{$k} = &clone ($self->{$k});
177 }
178
179 bless $newobj, ref($self);
180 return $newobj;
181}
182
183sub clone {
184 my ($from) = @_;
185 my $type = ref ($from);
186
187 if ($type eq "HASH") {
188 my $to = {};
189 foreach my $key (keys %$from) {
190 $to->{$key} = &clone ($from->{$key});
191 }
192 return $to;
193 } elsif ($type eq "ARRAY") {
194 my $to = [];
195 foreach my $v (@$from) {
196 push (@$to, &clone ($v));
197 }
198 return $to;
199 } else {
200 return $from;
201 }
202}
203
204sub set_OIDtype {
205 my $self = shift (@_);
206 my ($type, $metadata) = @_;
207
208 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|dirname|assigned)$/) {
209 $self->{'OIDtype'} = $type;
210 } else {
211 $self->{'OIDtype'} = "hash";
212 }
213
214 if ($type =~ /^assigned$/) {
215 if (defined $metadata) {
216 $self->{'OIDmetadata'} = $metadata;
217 } else {
218 $self->{'OIDmetadata'} = "dc.Identifier";
219 }
220 }
221}
222
223# rename_method can be 'url', 'none', 'base64'
224sub set_source_filename {
225 my $self = shift (@_);
226 my ($source_filename, $rename_method) = @_;
227
228 # Since the gsdlsourcefilename element goes into the doc.xml it has
229 # to be utf8. However, it should also *represent* the source filename
230 # (in the import directory) which may not be utf8 at all.
231 # For instance, if this meta element (gsdlsourcefilename) will be used
232 # by other applications that parse doc.xml in order to locate
233 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
234 # encode the real filename as this is a binary-to-text encoding meaning
235 # that the resulting string is ASCII (utf8). Decoding will give the original.
236
237# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
238
239 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
240 $source_filename = $self->encode_filename($source_filename, $rename_method);
241# my ($srcfilename,$dirname,$suffix)
242# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
243# print STDERR "-> $srcfilename -> ";
244# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
245# $source_filename = &util::filename_cat($dirname, $srcfilename);
246# print STDERR "$source_filename\n";
247
248 $self->set_utf8_metadata_element ($self->get_top_section(),
249 "gsdlsourcefilename",
250 $source_filename);
251}
252
253sub encode_filename {
254 my $self = shift (@_);
255 my ($source_filename, $rename_method) = @_;
256
257 my ($srcfilename,$dirname,$suffix)
258 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
259# print STDERR "-> $srcfilename -> ";
260 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
261 $source_filename = &util::filename_cat($dirname, $srcfilename);
262
263 return $source_filename;
264}
265
266sub set_converted_filename {
267 my $self = shift (@_);
268 my ($converted_filename) = @_;
269
270 # we know the converted filename is utf8
271 $self->set_utf8_metadata_element ($self->get_top_section(),
272 "gsdlconvertedfilename",
273 $converted_filename);
274}
275
276# returns the source_filename as it was provided
277sub get_unmodified_source_filename {
278 my $self = shift (@_);
279
280 return $self->{'terse_source_path'};
281}
282
283# returns the source_filename with whatever rename_method was given
284sub get_source_filename {
285 my $self = shift (@_);
286
287 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
288}
289
290
291
292# returns converted filename if available else returns source filename
293sub get_filename_for_hashing {
294 my $self = shift (@_);
295
296 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
297
298 if (!defined $filename) {
299 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
300 # if NULPlug processed file, then don't give a filename
301 if (defined $plugin_name && $plugin_name eq "NULPlug") {
302 $filename = undef;
303 } else { # returns the URL encoded source filename!
304 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
305 }
306 }
307
308 if (!&util::filename_is_absolute($filename)) {
309 $filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename);
310 }
311
312 return $filename;
313}
314
315sub set_doc_type {
316 my $self = shift (@_);
317 my ($doc_type) = @_;
318
319 $self->set_metadata_element ($self->get_top_section(),
320 "gsdldoctype",
321 $doc_type);
322}
323
324# returns the gsdldoctype as it was provided
325# the default of "indexed_doc" is used if no document
326# type was provided
327sub get_doc_type {
328 my $self = shift (@_);
329
330 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
331 return $doc_type if (defined $doc_type);
332 return "indexed_doc";
333}
334
335
336# look up the reference to the a particular section
337sub _lookup_section {
338 my $self = shift (@_);
339 my ($section) = @_;
340
341 my ($num);
342 my $sectionref = $self;
343
344 while (defined $section && $section ne "") {
345
346 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
347
348 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
349
350 $section = "" unless defined $section;
351
352
353 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
354 $sectionref = $sectionref->{'subsections'}->{$num};
355 } else {
356 return undef;
357 }
358 }
359
360 return $sectionref;
361}
362
363# calculate OID by hashing the contents of the document
364sub _calc_OID {
365 my $self = shift (@_);
366 my ($filename) = @_;
367
368
369 my $osexe = &util::get_os_exe();
370
371 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
372 $ENV{'GSDLOS'},"hashfile$osexe");
373
374 my $result = "NULL";
375
376
377 if (-e "$hashfile_exe") {
378# $result = `\"$hashfile_exe\" \"$filename\"`;
379# $result = `hashfile$osexe \"$filename\" 2>&1`;
380 $result = `hashfile$osexe \"$filename\"`;
381
382 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
383 } else {
384 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
385 }
386 return "HASH$result";
387}
388
389# methods dealing with OID, not groups of them.
390
391# if $OID is not provided one is calculated
392sub set_OID {
393 my $self = shift (@_);
394 my ($OID) = @_;
395
396 my $use_hash_oid = 0;
397 # if an OID wasn't provided calculate one
398 if (!defined $OID) {
399 $OID = "NULL";
400 if ($self->{'OIDtype'} =~ /^hash/) {
401 $use_hash_oid = 1;
402 } elsif ($self->{'OIDtype'} eq "incremental") {
403 $OID = "D" . $OIDcount;
404 $OIDcount ++;
405
406 } elsif ($self->{'OIDtype'} eq "dirname") {
407 $OID = 'J';
408 my $filename = $self->get_source_filename();
409 if (defined($filename)) { # && -e $filename) {
410 $OID = &File::Basename::dirname($filename);
411 if (defined $OID) {
412 $OID = 'J'.&File::Basename::basename($OID);
413 $OID = &util::tidy_up_oid($OID);
414 } else {
415 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
416 $use_hash_oid = 1;
417 }
418 } else {
419 print STDERR "Failed to find filename, generating hash id\n";
420 $use_hash_oid = 1;
421 }
422
423 } elsif ($self->{'OIDtype'} eq "assigned") {
424 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
425 if (defined $identifier && $identifier ne "") {
426 $OID = $identifier;
427 $OID = &util::tidy_up_oid($OID);
428 } else {
429 # need a hash id
430 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
431 $use_hash_oid = 1;
432 }
433
434 } else {
435 $use_hash_oid = 1;
436 }
437
438 if ($use_hash_oid) {
439 my $hash_on_file = 1;
440 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
441 $hash_on_file = 0;
442 }
443 if ($hash_on_file) {
444 # "hash" OID - feed file to hashfile.exe
445 my $filename = $self->get_filename_for_hashing();
446
447 # -z: don't want to hash on the file if it is zero size
448 if (defined($filename) && -e $filename && !-z $filename) {
449 $OID = $self->_calc_OID ($filename);
450 } else {
451 $hash_on_file = 0;
452 }
453 }
454 if (!$hash_on_file) {
455 my $filename = &util::get_tmp_filename();
456 if (!open (OUTFILE, ">:utf8", $filename)) {
457 print STDERR "doc::set_OID could not write to $filename\n";
458 } else {
459 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
460 print OUTFILE $doc_text;
461 close (OUTFILE);
462 }
463 $OID = $self->_calc_OID ($filename);
464 &util::rm ($filename);
465 }
466 }
467 }
468 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
469}
470
471# this uses hashdoc (embedded c thingy) which is faster but still
472# needs a little work to be suffiently stable
473sub ___set_OID {
474 my $self = shift (@_);
475 my ($OID) = @_;
476
477 # if an OID wasn't provided then calculate hash value based on document
478 if (!defined $OID)
479 {
480 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
481 my $hash_len = length($hash_text);
482
483 $OID = &hashdoc::buffer($hash_text,$hash_len);
484 }
485
486 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
487}
488
489# returns the OID for this document
490sub get_OID {
491 my $self = shift (@_);
492 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
493 return $OID if (defined $OID);
494 return "NULL";
495}
496
497sub delete_OID {
498 my $self = shift (@_);
499
500 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
501}
502
503
504# methods for manipulating section names
505
506# returns the name of the top-most section (the top
507# level of the document
508sub get_top_section {
509 my $self = shift (@_);
510
511 return "";
512}
513
514# returns a section
515sub get_parent_section {
516 my $self = shift (@_);
517 my ($section) = @_;
518
519 $section =~ s/(^|\.)\d+$//;
520
521 return $section;
522}
523
524# returns the first child section (or the end child
525# if there isn't any)
526sub get_begin_child {
527 my $self = shift (@_);
528 my ($section) = @_;
529
530 my $section_ptr = $self->_lookup_section($section);
531 return "" unless defined $section_ptr;
532
533 if (defined $section_ptr->{'subsection_order'}->[0]) {
534 return "$section.$section_ptr->{'subsection_order'}->[0]";
535 }
536
537 return $self->get_end_child ($section);
538}
539
540# returns the next child of a parent section
541sub get_next_child {
542 my $self = shift (@_);
543 my ($section) = @_;
544
545 my $parent_section = $self->get_parent_section($section);
546 my $parent_section_ptr = $self->_lookup_section($parent_section);
547 return undef unless defined $parent_section_ptr;
548
549 my ($section_num) = $section =~ /(\d+)$/;
550 return undef unless defined $section_num;
551
552 my $i = 0;
553 my $section_order = $parent_section_ptr->{'subsection_order'};
554 while ($i < scalar(@$section_order)) {
555 last if $section_order->[$i] eq $section_num;
556 $i++;
557 }
558
559 $i++; # the next child
560 if ($i < scalar(@$section_order)) {
561 return $section_order->[$i] if $parent_section eq "";
562 return "$parent_section.$section_order->[$i]";
563 }
564
565 # no more sections in this level
566 return undef;
567}
568
569# returns a reference to a list of children
570sub get_children {
571 my $self = shift (@_);
572 my ($section) = @_;
573
574 my $section_ptr = $self->_lookup_section($section);
575 return [] unless defined $section_ptr;
576
577 my @children = @{$section_ptr->{'subsection_order'}};
578
579 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
580 return \@children;
581}
582
583# returns the child section one past the last one (which
584# is coded as "0")
585sub get_end_child {
586 my $self = shift (@_);
587 my ($section) = @_;
588
589 return $section . ".0" unless $section eq "";
590 return "0";
591}
592
593# returns the next section in book order
594sub get_next_section {
595 my $self = shift (@_);
596 my ($section) = @_;
597
598 return undef unless defined $section;
599
600 my $section_ptr = $self->_lookup_section($section);
601 return undef unless defined $section_ptr;
602
603 # first try to find first child
604 if (defined $section_ptr->{'subsection_order'}->[0]) {
605 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
606 return "$section.$section_ptr->{'subsection_order'}->[0]";
607 }
608
609 do {
610 # try to find sibling
611 my $next_child = $self->get_next_child ($section);
612 return $next_child if (defined $next_child);
613
614 # move up one level
615 $section = $self->get_parent_section ($section);
616 } while $section =~ /\d/;
617
618 return undef;
619}
620
621sub is_leaf_section {
622 my $self = shift (@_);
623 my ($section) = @_;
624
625 my $section_ptr = $self->_lookup_section($section);
626 return 1 unless defined $section_ptr;
627
628 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
629}
630
631# methods for dealing with sections
632
633# returns the name of the inserted section
634sub insert_section {
635 my $self = shift (@_);
636 my ($before_section) = @_;
637
638 # get the child to insert before and its parent section
639 my $parent_section = "";
640 my $before_child = "0";
641 my @before_section = split (/\./, $before_section);
642 if (scalar(@before_section) > 0) {
643 $before_child = pop (@before_section);
644 $parent_section = join (".", @before_section);
645 }
646
647 my $parent_section_ptr = $self->_lookup_section($parent_section);
648 if (!defined $parent_section_ptr) {
649 print STDERR "doc::insert_section couldn't find parent section " .
650 "$parent_section\n";
651 return;
652 }
653
654 # get the next section number
655 my $section_num = $parent_section_ptr->{'next_subsection'}++;
656
657 my $i = 0;
658 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
659 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
660 $i++;
661 }
662
663 # insert the section number into the order list
664 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
665
666 # add this section to the parent section
667 my $section_ptr = {'subsection_order'=>[],
668 'next_subsection'=>1,
669 'subsections'=>{},
670 'metadata'=>[],
671 'text'=>""};
672 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
673
674 # work out the full section number
675 my $section = $parent_section;
676 $section .= "." unless $section eq "";
677 $section .= $section_num;
678
679 return $section;
680}
681
682# creates a pre-named section
683sub create_named_section {
684 my $self = shift (@_);
685 my ($mastersection) = @_;
686
687 my ($num);
688 my $section = $mastersection;
689 my $sectionref = $self;
690
691 while ($section ne "") {
692 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
693 $num =~ s/^0+(\d)/$1/; # remove leading 0s
694 $section = "" unless defined $section;
695
696 if (defined $num) {
697 if (!defined $sectionref->{'subsections'}->{$num}) {
698 push (@{$sectionref->{'subsection_order'}}, $num);
699 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
700 'next_subsection'=>1,
701 'subsections'=>{},
702 'metadata'=>[],
703 'text'=>""};
704 if ($num >= $sectionref->{'next_subsection'}) {
705 $sectionref->{'next_subsection'} = $num + 1;
706 }
707 }
708 $sectionref = $sectionref->{'subsections'}->{$num};
709
710 } else {
711 print STDERR "doc::create_named_section couldn't create section ";
712 print STDERR "$mastersection\n";
713 last;
714 }
715 }
716}
717
718# returns a reference to a list of subsections
719sub list_subsections {
720 my $self = shift (@_);
721 my ($section) = @_;
722
723 my $section_ptr = $self->_lookup_section ($section);
724 if (!defined $section_ptr) {
725 print STDERR "doc::list_subsections couldn't find section $section\n";
726 return [];
727 }
728
729 return [@{$section_ptr->{'subsection_order'}}];
730}
731
732sub delete_section {
733 my $self = shift (@_);
734 my ($section) = @_;
735
736# my $section_ptr = {'subsection_order'=>[],
737# 'next_subsection'=>1,
738# 'subsections'=>{},
739# 'metadata'=>[],
740# 'text'=>""};
741
742 # if this is the top section reset everything
743 if ($section eq "") {
744 $self->{'subsection_order'} = [];
745 $self->{'subsections'} = {};
746 $self->{'metadata'} = [];
747 $self->{'text'} = "";
748 return;
749 }
750
751 # find the parent of the section to delete
752 my $parent_section = "";
753 my $child = "0";
754 my @section = split (/\./, $section);
755 if (scalar(@section) > 0) {
756 $child = pop (@section);
757 $parent_section = join (".", @section);
758 }
759
760 my $parent_section_ptr = $self->_lookup_section($parent_section);
761 if (!defined $parent_section_ptr) {
762 print STDERR "doc::delete_section couldn't find parent section " .
763 "$parent_section\n";
764 return;
765 }
766
767 # remove this section from the subsection_order list
768 my $i = 0;
769 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
770 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
771 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
772 last;
773 }
774 $i++;
775 }
776
777 # remove this section from the subsection hash
778 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
779 undef $parent_section_ptr->{'subsections'}->{$child};
780 }
781}
782
783#--
784# methods for dealing with metadata
785
786# set_metadata_element and get_metadata_element are for metadata
787# which should only have one value. add_meta_data and get_metadata
788# are for metadata which can have more than one value.
789
790# returns the first metadata value which matches field
791
792# This version of get metadata element works much like the one above,
793# except it allows for the namespace portion of a metadata element to
794# be ignored, thus if you are searching for dc.Title, the first piece
795# of matching metadata ending with the name Title (once any namespace
796# is removed) would be returned.
797# 28-11-2003 John Thompson
798sub get_metadata_element {
799 my $self = shift (@_);
800 my ($section, $field, $ignore_namespace) = @_;
801 my ($data);
802
803 $ignore_namespace = 0 unless defined $ignore_namespace;
804
805 my $section_ptr = $self->_lookup_section($section);
806 if (!defined $section_ptr) {
807 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
808 return;
809 }
810
811 # Remove any namespace if we are being told to ignore them
812 if($ignore_namespace) {
813 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
814 }
815
816 foreach $data (@{$section_ptr->{'metadata'}}) {
817
818 my $data_name = $data->[0];
819
820 # Remove any namespace if we are being told to ignore them
821 if($ignore_namespace) {
822 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
823 }
824 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
825 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
826
827 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
828 }
829
830 return undef; # was not found
831}
832
833# returns a list of the form [value1, value2, ...]
834sub get_metadata {
835 my $self = shift (@_);
836 my ($section, $field, $ignore_namespace) = @_;
837 my ($data);
838
839 $ignore_namespace = 0 unless defined $ignore_namespace;
840
841 my $section_ptr = $self->_lookup_section($section);
842 if (!defined $section_ptr) {
843 print STDERR "doc::get_metadata couldn't find section ",
844 $section, "\n";
845 return;
846 }
847
848 # Remove any namespace if we are being told to ignore them
849 if($ignore_namespace) {
850 $field =~ s/^.*\.//;
851 }
852
853 my @metadata = ();
854 foreach $data (@{$section_ptr->{'metadata'}}) {
855
856 my $data_name = $data->[0];
857
858 # Remove any namespace if we are being told to ignore them
859 if($ignore_namespace) {
860 $data_name =~ s/^.*\.//;
861 }
862 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
863 $data_name =~ s/^ex\.([^.]+)$/$1/;
864
865 push (@metadata, $data->[1]) if ($data_name eq $field);
866 }
867
868 return \@metadata;
869}
870
871sub get_metadata_hashmap {
872 my $self = shift (@_);
873 my ($section, $opt_namespace) = @_;
874
875 my $section_ptr = $self->_lookup_section($section);
876 if (!defined $section_ptr) {
877 print STDERR "doc::get_metadata couldn't find section ",
878 $section, "\n";
879 return;
880 }
881
882 my $metadata_hashmap = {};
883 foreach my $data (@{$section_ptr->{'metadata'}}) {
884 my $metaname = $data->[0];
885
886 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
887 if (!defined $metadata_hashmap->{$metaname}) {
888 $metadata_hashmap->{$metaname} = [];
889 }
890 my $metaval_list = $metadata_hashmap->{$metaname};
891 push(@$metaval_list, $data->[1]);
892 }
893 }
894
895 return $metadata_hashmap;
896}
897
898# returns a list of the form [[field,value],[field,value],...]
899sub get_all_metadata {
900 my $self = shift (@_);
901 my ($section) = @_;
902
903 my $section_ptr = $self->_lookup_section($section);
904 if (!defined $section_ptr) {
905 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
906 return;
907 }
908
909 return $section_ptr->{'metadata'};
910}
911
912# $value is optional
913sub delete_metadata {
914 my $self = shift (@_);
915 my ($section, $field, $value) = @_;
916
917 my $section_ptr = $self->_lookup_section($section);
918 if (!defined $section_ptr) {
919 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
920 return;
921 }
922
923 my $i = 0;
924 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
925 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
926 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
927 splice (@{$section_ptr->{'metadata'}}, $i, 1);
928 } else {
929 $i++;
930 }
931 }
932}
933
934sub delete_all_metadata {
935 my $self = shift (@_);
936 my ($section) = @_;
937
938 my $section_ptr = $self->_lookup_section($section);
939 if (!defined $section_ptr) {
940 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
941 return;
942 }
943
944 $section_ptr->{'metadata'} = [];
945}
946
947sub set_metadata_element {
948 my $self = shift (@_);
949 my ($section, $field, $value) = @_;
950
951 $self->set_utf8_metadata_element ($section, $field,
952 &unicode::ascii2utf8(\$value));
953}
954
955# set_utf8_metadata_element assumes the text has already been
956# converted to the UTF-8 encoding.
957sub set_utf8_metadata_element {
958 my $self = shift (@_);
959 my ($section, $field, $value) = @_;
960
961 $self->delete_metadata ($section, $field);
962 $self->add_utf8_metadata ($section, $field, $value);
963}
964
965
966# add_metadata assumes the text is in (extended) ascii form. For
967# text which has already been converted to the UTF-8 format use
968# add_utf8_metadata.
969sub add_metadata {
970 my $self = shift (@_);
971 my ($section, $field, $value) = @_;
972
973 $self->add_utf8_metadata ($section, $field,
974 &unicode::ascii2utf8(\$value));
975}
976
977sub add_utf8_metadata {
978 my $self = shift (@_);
979 my ($section, $field, $value) = @_;
980
981# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
982# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
983# print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
984
985 my $section_ptr = $self->_lookup_section($section);
986 if (!defined $section_ptr) {
987 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
988 return;
989 }
990 if (!defined $value) {
991 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
992 return;
993 }
994 if (!defined $field) {
995 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
996 return;
997 }
998
999 #print STDERR "###$field=$value\n";
1000
1001 # For now, supress this check. Given that text data read in is now
1002 # Unicode aware, then the following block of code can (ironically enough)
1003 # cause our unicode compliant string to be re-encoded (leading to
1004 # a double-encoded UTF-8 string, which we definitely don't want!).
1005
1006
1007 # double check that the value is utf-8
1008# if (!&unicode::check_is_utf8($value)) {
1009# print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1010# &unicode::ensure_utf8(\$value);
1011# print STDERR " Tried converting to utf8: $value\n";
1012# }
1013
1014 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1015}
1016
1017
1018# methods for dealing with text
1019
1020# returns the text for a section
1021sub get_text {
1022 my $self = shift (@_);
1023 my ($section) = @_;
1024
1025 my $section_ptr = $self->_lookup_section($section);
1026 if (!defined $section_ptr) {
1027 print STDERR "doc::get_text couldn't find section " .
1028 "$section\n";
1029 return "";
1030 }
1031
1032 return $section_ptr->{'text'};
1033}
1034
1035# returns the (utf-8 encoded) length of the text for a section
1036sub get_text_length {
1037 my $self = shift (@_);
1038 my ($section) = @_;
1039
1040 my $section_ptr = $self->_lookup_section($section);
1041 if (!defined $section_ptr) {
1042 print STDERR "doc::get_text_length couldn't find section " .
1043 "$section\n";
1044 return 0;
1045 }
1046
1047 return length ($section_ptr->{'text'});
1048}
1049
1050# returns the total length for all the sections
1051sub get_total_text_length {
1052 my $self = shift (@_);
1053
1054 my $section = $self->get_top_section();
1055 my $length = 0;
1056 while (defined $section) {
1057 $length += $self->get_text_length($section);
1058 $section = $self->get_next_section($section);
1059 }
1060 return $length;
1061}
1062
1063sub delete_text {
1064 my $self = shift (@_);
1065 my ($section) = @_;
1066
1067 my $section_ptr = $self->_lookup_section($section);
1068 if (!defined $section_ptr) {
1069 print STDERR "doc::delete_text couldn't find section " .
1070 "$section\n";
1071 return;
1072 }
1073
1074 $section_ptr->{'text'} = "";
1075}
1076
1077# add_text assumes the text is in (extended) ascii form. For
1078# text which has been already converted to the UTF-8 format
1079# use add_utf8_text.
1080sub add_text {
1081 my $self = shift (@_);
1082 my ($section, $text) = @_;
1083
1084 # convert the text to UTF-8 encoded unicode characters
1085 # and add the text
1086 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1087}
1088
1089
1090# add_utf8_text assumes the text to be added has already
1091# been converted to the UTF-8 encoding. For ascii text use
1092# add_text
1093sub add_utf8_text {
1094 my $self = shift (@_);
1095 my ($section, $text) = @_;
1096
1097 my $section_ptr = $self->_lookup_section($section);
1098 if (!defined $section_ptr) {
1099 print STDERR "doc::add_utf8_text couldn't find section " .
1100 "$section\n";
1101 return;
1102 }
1103
1104 $section_ptr->{'text'} .= $text;
1105}
1106
1107# returns the Source meta, which is the utf8 filename generated.
1108# Added a separate method here for convenience
1109sub get_source {
1110 my $self = shift (@_);
1111 return $self->get_metadata_element ($self->get_top_section(), "Source");
1112}
1113
1114# returns the SourceFile meta, which is the url reference to the URL-encoded
1115# version of Source (the utf8 filename). Added a separate method here for convenience
1116sub get_sourcefile {
1117 my $self = shift (@_);
1118 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1119}
1120
1121# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1122# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1123sub get_assocfile_from_sourcefile {
1124 my $self = shift (@_);
1125
1126 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1127 my $top_section = $self->get_top_section();
1128 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1129
1130 # get the actual filename as it exists on the filesystem which this url refers to
1131 $source_file = &unicode::url_to_filename($source_file);
1132 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1133 return $assocfilename;
1134}
1135
1136# methods for dealing with associated files
1137
1138# a file is associated with a document, NOT a section.
1139# if section is defined it is noted in the data structure
1140# only so that files associated from a particular section
1141# may be removed later (using delete_section_assoc_files)
1142sub associate_file {
1143 my $self = shift (@_);
1144 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1145 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1146
1147 # remove all associated files with the same name
1148 $self->delete_assoc_file ($assoc_filename);
1149
1150 # Too harsh a requirement
1151 # Definitely get HTML docs, for example, with some missing
1152 # support files
1153# if (!&util::fd_exists($real_filename)) {
1154# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1155# exit -1;
1156# }
1157
1158# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1159# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1160## my $utf8_filename = Encode::encode("utf8",$filename);
1161
1162 push (@{$self->{'associated_files'}},
1163 [$real_filename, $assoc_filename, $mime_type, $section]);
1164}
1165
1166# returns a list of associated files in the form
1167# [[real_filename, assoc_filename, mimetype], ...]
1168sub get_assoc_files {
1169 my $self = shift (@_);
1170
1171 return $self->{'associated_files'};
1172}
1173
1174# the following two methods used to keep track of original associated files
1175# for incremental building. eg a txt file used by an item file does not end
1176# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1177# file for incremental build
1178sub associate_source_file {
1179 my $self = shift (@_);
1180 my ($full_filename) = @_;
1181
1182 push (@{$self->{'source_assoc_files'}}, $full_filename);
1183
1184}
1185
1186sub get_source_assoc_files {
1187 my $self = shift (@_);
1188
1189 return $self->{'source_assoc_files'};
1190
1191
1192}
1193sub metadata_file {
1194 my $self = shift (@_);
1195 my ($real_filename, $filename) = @_;
1196
1197 push (@{$self->{'metadata_files'}},
1198 [$real_filename, $filename]);
1199}
1200
1201# used for writing out the archiveinf-doc info database, to list all the metadata files
1202sub get_meta_files {
1203 my $self = shift (@_);
1204
1205 return $self->{'metadata_files'};
1206}
1207
1208sub delete_section_assoc_files {
1209 my $self = shift (@_);
1210 my ($section) = @_;
1211
1212 my $i=0;
1213 while ($i < scalar (@{$self->{'associated_files'}})) {
1214 if (defined $self->{'associated_files'}->[$i]->[3] &&
1215 $self->{'associated_files'}->[$i]->[3] eq $section) {
1216 splice (@{$self->{'associated_files'}}, $i, 1);
1217 } else {
1218 $i++;
1219 }
1220 }
1221}
1222
1223sub delete_assoc_file {
1224 my $self = shift (@_);
1225 my ($assoc_filename) = @_;
1226
1227 my $i=0;
1228 while ($i < scalar (@{$self->{'associated_files'}})) {
1229 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1230 splice (@{$self->{'associated_files'}}, $i, 1);
1231 } else {
1232 $i++;
1233 }
1234 }
1235}
1236
1237sub reset_nextsection_ptr {
1238 my $self = shift (@_);
1239 my ($section) = @_;
1240
1241 my $section_ptr = $self->_lookup_section($section);
1242 $section_ptr->{'next_subsection'} = 1;
1243}
1244
12451;
Note: See TracBrowser for help on using the repository browser.