source: main/trunk/greenstone2/perllib/doc.pm@ 23923

Last change on this file since 23923 was 23923, checked in by ak19, 13 years ago

Dr Bainbridge replaced a reference to an object.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 33.5 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47our $OIDcount = 0;
48
49# rename_method can be 'url', 'none', 'base64'
50sub new {
51 my $class = shift (@_);
52 my ($source_filename, $doc_type, $rename_method) = @_;
53
54
55 my $self = bless {'associated_files'=>[],
56 'subsection_order'=>[],
57 'next_subsection'=>1,
58 'subsections'=>{},
59 'metadata'=>[],
60 'text'=>"",
61 'OIDtype'=>"hash"}, $class;
62
63 # used to set lastmodified here, but this can screw up the HASH ids, so
64 # the docsave processor now calls set_lastmodified
65
66 $self->set_source_path($source_filename);
67
68 if (defined $source_filename) {
69 $source_filename = &util::filename_within_collection($source_filename);
70 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
71 $self->set_source_filename ($source_filename, $rename_method);
72 }
73
74 $self->set_doc_type ($doc_type) if defined $doc_type;
75
76 return $self;
77}
78
79
80sub set_source_path
81{
82 my $self = shift @_;
83 my ($source_filename) = @_;
84
85 if (defined $source_filename) {
86 # On Windows the source_filename can be in terse DOS format
87 # e.g. test~1.txt
88
89 $self->{'terse_source_path'} = $source_filename;
90
91 if (-e $source_filename) {
92 # See if we can do better for Windows with a filename
93 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
94 require Win32;
95 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
96 }
97 else {
98 # For Unix-based systems, there is no difference between the two
99 $self->{'source_path'} = $source_filename;
100 }
101 }
102 else {
103 print STDERR "Warning: In doc::set_source_path(), file\n";
104 print STDERR " $source_filename\n";
105 print STDERR " does not exist\n";
106
107 # (default) Set it to whatever we were given
108 $self->{'source_path'} = $source_filename;
109 }
110 }
111 else {
112 # Previous code for setting source_path allowed for
113 # it to be undefined, so continue this practice
114 $self->{'terse_source_path'} = undef;
115 $self->{'source_path'} = undef;
116 }
117}
118
119
120sub get_source_path
121{
122 my $self = shift @_;
123
124 return $self->{'terse_source_path'};
125}
126
127# set lastmodified for OAI purposes, added by GRB, moved by kjdon
128sub set_lastmodified {
129 my $self = shift (@_);
130
131 my $source_path = $self->{'terse_source_path'};
132
133 if (defined $source_path && (-e $source_path)) {
134
135 my $file_stat = stat($source_path);
136 my $mtime = $file_stat->mtime;
137 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
138 $wday, $yday, $isdst) = localtime($mtime);
139
140 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
141
142 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
143 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
144 }
145}
146
147# clone the $self object
148sub duplicate {
149 my $self = shift (@_);
150
151 my $newobj = {};
152
153 foreach my $k (keys %$self) {
154 $newobj->{$k} = &clone ($self->{$k});
155 }
156
157 bless $newobj, ref($self);
158 return $newobj;
159}
160
161sub clone {
162 my ($from) = @_;
163 my $type = ref ($from);
164
165 if ($type eq "HASH") {
166 my $to = {};
167 foreach my $key (keys %$from) {
168 $to->{$key} = &clone ($from->{$key});
169 }
170 return $to;
171 } elsif ($type eq "ARRAY") {
172 my $to = [];
173 foreach my $v (@$from) {
174 push (@$to, &clone ($v));
175 }
176 return $to;
177 } else {
178 return $from;
179 }
180}
181
182sub set_OIDtype {
183 my $self = shift (@_);
184 my ($type, $metadata) = @_;
185
186 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|dirname|assigned)$/) {
187 $self->{'OIDtype'} = $type;
188 } else {
189 $self->{'OIDtype'} = "hash";
190 }
191
192 if ($type =~ /^assigned$/) {
193 if (defined $metadata) {
194 $self->{'OIDmetadata'} = $metadata;
195 } else {
196 $self->{'OIDmetadata'} = "dc.Identifier";
197 }
198 }
199}
200
201# rename_method can be 'url', 'none', 'base64'
202sub set_source_filename {
203 my $self = shift (@_);
204 my ($source_filename, $rename_method) = @_;
205
206 # Since the gsdlsourcefilename element goes into the doc.xml it has
207 # to be utf8. However, it should also *represent* the source filename
208 # (in the import directory) which may not be utf8 at all.
209 # For instance, if this meta element (gsdlsourcefilename) will be used
210 # by other applications that parse doc.xml in order to locate
211 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
212 # encode the real filename as this is a binary-to-text encoding meaning
213 # that the resulting string is ASCII (utf8). Decoding will give the original.
214
215# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
216
217 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
218 $source_filename = $self->encode_filename($source_filename, $rename_method);
219# my ($srcfilename,$dirname,$suffix)
220# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
221# print STDERR "-> $srcfilename -> ";
222# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
223# $source_filename = &util::filename_cat($dirname, $srcfilename);
224# print STDERR "$source_filename\n";
225
226 $self->set_utf8_metadata_element ($self->get_top_section(),
227 "gsdlsourcefilename",
228 $source_filename);
229}
230
231sub encode_filename {
232 my $self = shift (@_);
233 my ($source_filename, $rename_method) = @_;
234
235 my ($srcfilename,$dirname,$suffix)
236 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
237# print STDERR "-> $srcfilename -> ";
238 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
239 $source_filename = &util::filename_cat($dirname, $srcfilename);
240
241 return $source_filename;
242}
243
244sub set_converted_filename {
245 my $self = shift (@_);
246 my ($converted_filename) = @_;
247
248 # we know the converted filename is utf8
249 $self->set_utf8_metadata_element ($self->get_top_section(),
250 "gsdlconvertedfilename",
251 $converted_filename);
252}
253
254# returns the source_filename as it was provided
255sub get_unmodified_source_filename {
256 my $self = shift (@_);
257
258 return $self->{'terse_source_path'};
259}
260
261# returns the source_filename with whatever rename_method was given
262sub get_source_filename {
263 my $self = shift (@_);
264
265 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
266}
267
268
269
270# returns converted filename if available else returns source filename
271sub get_filename_for_hashing {
272 my $self = shift (@_);
273
274 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
275
276 if (!defined $filename) {
277 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
278 # if NULPlug processed file, then don't give a filename
279 if (defined $plugin_name && $plugin_name eq "NULPlug") {
280 $filename = undef;
281 } else { # returns the URL encoded source filename!
282 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
283 }
284 }
285
286 if (!&util::filename_is_absolute($filename)) {
287 $filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename);
288 }
289
290 return $filename;
291}
292
293sub set_doc_type {
294 my $self = shift (@_);
295 my ($doc_type) = @_;
296
297 $self->set_metadata_element ($self->get_top_section(),
298 "gsdldoctype",
299 $doc_type);
300}
301
302# returns the gsdldoctype as it was provided
303# the default of "indexed_doc" is used if no document
304# type was provided
305sub get_doc_type {
306 my $self = shift (@_);
307
308 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
309 return $doc_type if (defined $doc_type);
310 return "indexed_doc";
311}
312
313
314# look up the reference to the a particular section
315sub _lookup_section {
316 my $self = shift (@_);
317 my ($section) = @_;
318
319 my ($num);
320 my $sectionref = $self;
321
322 while (defined $section && $section ne "") {
323
324 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
325
326 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
327
328 $section = "" unless defined $section;
329
330
331 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
332 $sectionref = $sectionref->{'subsections'}->{$num};
333 } else {
334 return undef;
335 }
336 }
337
338 return $sectionref;
339}
340
341# calculate OID by hashing the contents of the document
342sub _calc_OID {
343 my $self = shift (@_);
344 my ($filename) = @_;
345
346
347 my $osexe = &util::get_os_exe();
348
349 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
350 $ENV{'GSDLOS'},"hashfile$osexe");
351
352 my $result = "NULL";
353
354
355 if (-e "$hashfile_exe") {
356# $result = `\"$hashfile_exe\" \"$filename\"`;
357# $result = `hashfile$osexe \"$filename\" 2>&1`;
358 $result = `hashfile$osexe \"$filename\"`;
359
360 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
361 } else {
362 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
363 }
364 return "HASH$result";
365}
366
367# methods dealing with OID, not groups of them.
368
369# if $OID is not provided one is calculated
370sub set_OID {
371 my $self = shift (@_);
372 my ($OID) = @_;
373
374 my $use_hash_oid = 0;
375 # if an OID wasn't provided calculate one
376 if (!defined $OID) {
377 $OID = "NULL";
378 if ($self->{'OIDtype'} =~ /^hash/) {
379 $use_hash_oid = 1;
380 } elsif ($self->{'OIDtype'} eq "incremental") {
381 $OID = "D" . $OIDcount;
382 $OIDcount ++;
383
384 } elsif ($self->{'OIDtype'} eq "dirname") {
385 $OID = 'J';
386 my $filename = $self->get_source_filename();
387 if (defined($filename)) { # && -e $filename) {
388 $OID = &File::Basename::dirname($filename);
389 if (defined $OID) {
390 $OID = 'J'.&File::Basename::basename($OID);
391 $OID = &util::tidy_up_oid($OID);
392 } else {
393 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
394 $use_hash_oid = 1;
395 }
396 } else {
397 print STDERR "Failed to find filename, generating hash id\n";
398 $use_hash_oid = 1;
399 }
400
401 } elsif ($self->{'OIDtype'} eq "assigned") {
402 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
403 if (defined $identifier && $identifier ne "") {
404 $OID = $identifier;
405 $OID = &util::tidy_up_oid($OID);
406 } else {
407 # need a hash id
408 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
409 $use_hash_oid = 1;
410 }
411
412 } else {
413 $use_hash_oid = 1;
414 }
415
416 if ($use_hash_oid) {
417 my $hash_on_file = 1;
418 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
419 $hash_on_file = 0;
420 }
421 if ($hash_on_file) {
422 # "hash" OID - feed file to hashfile.exe
423 my $filename = $self->get_filename_for_hashing();
424
425 # -z: don't want to hash on the file if it is zero size
426 if (defined($filename) && -e $filename && !-z $filename) {
427 $OID = $self->_calc_OID ($filename);
428 } else {
429 $hash_on_file = 0;
430 }
431 }
432 if (!$hash_on_file) {
433 my $filename = &util::get_tmp_filename();
434 if (!open (OUTFILE, ">:utf8", $filename)) {
435 print STDERR "doc::set_OID could not write to $filename\n";
436 } else {
437 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
438 print OUTFILE $doc_text;
439 close (OUTFILE);
440 }
441 $OID = $self->_calc_OID ($filename);
442 &util::rm ($filename);
443 }
444 }
445 }
446 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
447}
448
449# this uses hashdoc (embedded c thingy) which is faster but still
450# needs a little work to be suffiently stable
451sub ___set_OID {
452 my $self = shift (@_);
453 my ($OID) = @_;
454
455 # if an OID wasn't provided then calculate hash value based on document
456 if (!defined $OID)
457 {
458 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
459 my $hash_len = length($hash_text);
460
461 $OID = &hashdoc::buffer($hash_text,$hash_len);
462 }
463
464 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
465}
466
467# returns the OID for this document
468sub get_OID {
469 my $self = shift (@_);
470 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
471 return $OID if (defined $OID);
472 return "NULL";
473}
474
475sub delete_OID {
476 my $self = shift (@_);
477
478 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
479}
480
481
482# methods for manipulating section names
483
484# returns the name of the top-most section (the top
485# level of the document
486sub get_top_section {
487 my $self = shift (@_);
488
489 return "";
490}
491
492# returns a section
493sub get_parent_section {
494 my $self = shift (@_);
495 my ($section) = @_;
496
497 $section =~ s/(^|\.)\d+$//;
498
499 return $section;
500}
501
502# returns the first child section (or the end child
503# if there isn't any)
504sub get_begin_child {
505 my $self = shift (@_);
506 my ($section) = @_;
507
508 my $section_ptr = $self->_lookup_section($section);
509 return "" unless defined $section_ptr;
510
511 if (defined $section_ptr->{'subsection_order'}->[0]) {
512 return "$section.$section_ptr->{'subsection_order'}->[0]";
513 }
514
515 return $self->get_end_child ($section);
516}
517
518# returns the next child of a parent section
519sub get_next_child {
520 my $self = shift (@_);
521 my ($section) = @_;
522
523 my $parent_section = $self->get_parent_section($section);
524 my $parent_section_ptr = $self->_lookup_section($parent_section);
525 return undef unless defined $parent_section_ptr;
526
527 my ($section_num) = $section =~ /(\d+)$/;
528 return undef unless defined $section_num;
529
530 my $i = 0;
531 my $section_order = $parent_section_ptr->{'subsection_order'};
532 while ($i < scalar(@$section_order)) {
533 last if $section_order->[$i] eq $section_num;
534 $i++;
535 }
536
537 $i++; # the next child
538 if ($i < scalar(@$section_order)) {
539 return $section_order->[$i] if $parent_section eq "";
540 return "$parent_section.$section_order->[$i]";
541 }
542
543 # no more sections in this level
544 return undef;
545}
546
547# returns a reference to a list of children
548sub get_children {
549 my $self = shift (@_);
550 my ($section) = @_;
551
552 my $section_ptr = $self->_lookup_section($section);
553 return [] unless defined $section_ptr;
554
555 my @children = @{$section_ptr->{'subsection_order'}};
556
557 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
558 return \@children;
559}
560
561# returns the child section one past the last one (which
562# is coded as "0")
563sub get_end_child {
564 my $self = shift (@_);
565 my ($section) = @_;
566
567 return $section . ".0" unless $section eq "";
568 return "0";
569}
570
571# returns the next section in book order
572sub get_next_section {
573 my $self = shift (@_);
574 my ($section) = @_;
575
576 return undef unless defined $section;
577
578 my $section_ptr = $self->_lookup_section($section);
579 return undef unless defined $section_ptr;
580
581 # first try to find first child
582 if (defined $section_ptr->{'subsection_order'}->[0]) {
583 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
584 return "$section.$section_ptr->{'subsection_order'}->[0]";
585 }
586
587 do {
588 # try to find sibling
589 my $next_child = $self->get_next_child ($section);
590 return $next_child if (defined $next_child);
591
592 # move up one level
593 $section = $self->get_parent_section ($section);
594 } while $section =~ /\d/;
595
596 return undef;
597}
598
599sub is_leaf_section {
600 my $self = shift (@_);
601 my ($section) = @_;
602
603 my $section_ptr = $self->_lookup_section($section);
604 return 1 unless defined $section_ptr;
605
606 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
607}
608
609# methods for dealing with sections
610
611# returns the name of the inserted section
612sub insert_section {
613 my $self = shift (@_);
614 my ($before_section) = @_;
615
616 # get the child to insert before and its parent section
617 my $parent_section = "";
618 my $before_child = "0";
619 my @before_section = split (/\./, $before_section);
620 if (scalar(@before_section) > 0) {
621 $before_child = pop (@before_section);
622 $parent_section = join (".", @before_section);
623 }
624
625 my $parent_section_ptr = $self->_lookup_section($parent_section);
626 if (!defined $parent_section_ptr) {
627 print STDERR "doc::insert_section couldn't find parent section " .
628 "$parent_section\n";
629 return;
630 }
631
632 # get the next section number
633 my $section_num = $parent_section_ptr->{'next_subsection'}++;
634
635 my $i = 0;
636 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
637 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
638 $i++;
639 }
640
641 # insert the section number into the order list
642 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
643
644 # add this section to the parent section
645 my $section_ptr = {'subsection_order'=>[],
646 'next_subsection'=>1,
647 'subsections'=>{},
648 'metadata'=>[],
649 'text'=>""};
650 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
651
652 # work out the full section number
653 my $section = $parent_section;
654 $section .= "." unless $section eq "";
655 $section .= $section_num;
656
657 return $section;
658}
659
660# creates a pre-named section
661sub create_named_section {
662 my $self = shift (@_);
663 my ($mastersection) = @_;
664
665 my ($num);
666 my $section = $mastersection;
667 my $sectionref = $self;
668
669 while ($section ne "") {
670 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
671 $num =~ s/^0+(\d)/$1/; # remove leading 0s
672 $section = "" unless defined $section;
673
674 if (defined $num) {
675 if (!defined $sectionref->{'subsections'}->{$num}) {
676 push (@{$sectionref->{'subsection_order'}}, $num);
677 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
678 'next_subsection'=>1,
679 'subsections'=>{},
680 'metadata'=>[],
681 'text'=>""};
682 if ($num >= $sectionref->{'next_subsection'}) {
683 $sectionref->{'next_subsection'} = $num + 1;
684 }
685 }
686 $sectionref = $sectionref->{'subsections'}->{$num};
687
688 } else {
689 print STDERR "doc::create_named_section couldn't create section ";
690 print STDERR "$mastersection\n";
691 last;
692 }
693 }
694}
695
696# returns a reference to a list of subsections
697sub list_subsections {
698 my $self = shift (@_);
699 my ($section) = @_;
700
701 my $section_ptr = $self->_lookup_section ($section);
702 if (!defined $section_ptr) {
703 print STDERR "doc::list_subsections couldn't find section $section\n";
704 return [];
705 }
706
707 return [@{$section_ptr->{'subsection_order'}}];
708}
709
710sub delete_section {
711 my $self = shift (@_);
712 my ($section) = @_;
713
714# my $section_ptr = {'subsection_order'=>[],
715# 'next_subsection'=>1,
716# 'subsections'=>{},
717# 'metadata'=>[],
718# 'text'=>""};
719
720 # if this is the top section reset everything
721 if ($section eq "") {
722 $self->{'subsection_order'} = [];
723 $self->{'subsections'} = {};
724 $self->{'metadata'} = [];
725 $self->{'text'} = "";
726 return;
727 }
728
729 # find the parent of the section to delete
730 my $parent_section = "";
731 my $child = "0";
732 my @section = split (/\./, $section);
733 if (scalar(@section) > 0) {
734 $child = pop (@section);
735 $parent_section = join (".", @section);
736 }
737
738 my $parent_section_ptr = $self->_lookup_section($parent_section);
739 if (!defined $parent_section_ptr) {
740 print STDERR "doc::delete_section couldn't find parent section " .
741 "$parent_section\n";
742 return;
743 }
744
745 # remove this section from the subsection_order list
746 my $i = 0;
747 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
748 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
749 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
750 last;
751 }
752 $i++;
753 }
754
755 # remove this section from the subsection hash
756 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
757 undef $parent_section_ptr->{'subsections'}->{$child};
758 }
759}
760
761#--
762# methods for dealing with metadata
763
764# set_metadata_element and get_metadata_element are for metadata
765# which should only have one value. add_meta_data and get_metadata
766# are for metadata which can have more than one value.
767
768# returns the first metadata value which matches field
769
770# This version of get metadata element works much like the one above,
771# except it allows for the namespace portion of a metadata element to
772# be ignored, thus if you are searching for dc.Title, the first piece
773# of matching metadata ending with the name Title (once any namespace
774# is removed) would be returned.
775# 28-11-2003 John Thompson
776sub get_metadata_element {
777 my $self = shift (@_);
778 my ($section, $field, $ignore_namespace) = @_;
779 my ($data);
780
781 $ignore_namespace = 0 unless defined $ignore_namespace;
782
783 my $section_ptr = $self->_lookup_section($section);
784 if (!defined $section_ptr) {
785 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
786 return;
787 }
788
789 # Remove the namespace if we are being told to ignore them
790 if($ignore_namespace) {
791 $field =~ s/^\w*\.//;
792 }
793
794 foreach $data (@{$section_ptr->{'metadata'}}) {
795
796 my $data_name = $data->[0];
797
798 # Remove the any namespace if we are being told to ignore them
799 if($ignore_namespace) {
800 $data_name =~ s/^\w*\.//;
801 }
802 $data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
803 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
804 }
805
806 return undef; # was not found
807}
808
809# returns a list of the form [value1, value2, ...]
810sub get_metadata {
811 my $self = shift (@_);
812 my ($section, $field, $ignore_namespace) = @_;
813 my ($data);
814
815 $ignore_namespace = 0 unless defined $ignore_namespace;
816
817 my $section_ptr = $self->_lookup_section($section);
818 if (!defined $section_ptr) {
819 print STDERR "doc::get_metadata couldn't find section ",
820 $section, "\n";
821 return;
822 }
823
824 # Remove the any namespace if we are being told to ignore them
825 if($ignore_namespace) {
826 $field =~ s/^\w*\.//;
827 }
828
829 my @metadata = ();
830 foreach $data (@{$section_ptr->{'metadata'}}) {
831
832 my $data_name = $data->[0];
833 # Remove the any namespace if we are being told to ignore them
834 if($ignore_namespace) {
835 $data_name =~ s/^\w*\.//;
836 }
837 $data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
838 push (@metadata, $data->[1]) if ($data_name eq $field);
839 }
840
841 return \@metadata;
842}
843
844sub get_metadata_hashmap {
845 my $self = shift (@_);
846 my ($section, $opt_namespace) = @_;
847
848 my $section_ptr = $self->_lookup_section($section);
849 if (!defined $section_ptr) {
850 print STDERR "doc::get_metadata couldn't find section ",
851 $section, "\n";
852 return;
853 }
854
855 my $metadata_hashmap = {};
856 foreach my $data (@{$section_ptr->{'metadata'}}) {
857 my $metaname = $data->[0];
858
859 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
860 if (!defined $metadata_hashmap->{$metaname}) {
861 $metadata_hashmap->{$metaname} = [];
862 }
863 my $metaval_list = $metadata_hashmap->{$metaname};
864 push(@$metaval_list, $data->[1]);
865 }
866 }
867
868 return $metadata_hashmap;
869}
870
871# returns a list of the form [[field,value],[field,value],...]
872sub get_all_metadata {
873 my $self = shift (@_);
874 my ($section) = @_;
875
876 my $section_ptr = $self->_lookup_section($section);
877 if (!defined $section_ptr) {
878 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
879 return;
880 }
881
882 return $section_ptr->{'metadata'};
883}
884
885# $value is optional
886sub delete_metadata {
887 my $self = shift (@_);
888 my ($section, $field, $value) = @_;
889
890 my $section_ptr = $self->_lookup_section($section);
891 if (!defined $section_ptr) {
892 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
893 return;
894 }
895
896 my $i = 0;
897 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
898 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
899 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
900 splice (@{$section_ptr->{'metadata'}}, $i, 1);
901 } else {
902 $i++;
903 }
904 }
905}
906
907sub delete_all_metadata {
908 my $self = shift (@_);
909 my ($section) = @_;
910
911 my $section_ptr = $self->_lookup_section($section);
912 if (!defined $section_ptr) {
913 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
914 return;
915 }
916
917 $section_ptr->{'metadata'} = [];
918}
919
920sub set_metadata_element {
921 my $self = shift (@_);
922 my ($section, $field, $value) = @_;
923
924 $self->set_utf8_metadata_element ($section, $field,
925 &unicode::ascii2utf8(\$value));
926}
927
928# set_utf8_metadata_element assumes the text has already been
929# converted to the UTF-8 encoding.
930sub set_utf8_metadata_element {
931 my $self = shift (@_);
932 my ($section, $field, $value) = @_;
933
934 $self->delete_metadata ($section, $field);
935 $self->add_utf8_metadata ($section, $field, $value);
936}
937
938
939# add_metadata assumes the text is in (extended) ascii form. For
940# text which has already been converted to the UTF-8 format use
941# add_utf8_metadata.
942sub add_metadata {
943 my $self = shift (@_);
944 my ($section, $field, $value) = @_;
945
946 $self->add_utf8_metadata ($section, $field,
947 &unicode::ascii2utf8(\$value));
948}
949
950sub add_utf8_metadata {
951 my $self = shift (@_);
952 my ($section, $field, $value) = @_;
953
954# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
955# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
956# print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
957
958 my $section_ptr = $self->_lookup_section($section);
959 if (!defined $section_ptr) {
960 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
961 return;
962 }
963 if (!defined $value) {
964 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
965 return;
966 }
967 if (!defined $field) {
968 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
969 return;
970 }
971
972 #print STDERR "###$field=$value\n";
973
974 # For now, supress this check. Given that text data read in is now
975 # Unicode aware, then the following block of code can (ironically enough)
976 # cause our unicode compliant string to be re-encoded (leading to
977 # a double-encoded UTF-8 string, which we definitely don't want!).
978
979
980 # double check that the value is utf-8
981# if (!&unicode::check_is_utf8($value)) {
982# print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
983# &unicode::ensure_utf8(\$value);
984# print STDERR " Tried converting to utf8: $value\n";
985# }
986
987 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
988}
989
990
991# methods for dealing with text
992
993# returns the text for a section
994sub get_text {
995 my $self = shift (@_);
996 my ($section) = @_;
997
998 my $section_ptr = $self->_lookup_section($section);
999 if (!defined $section_ptr) {
1000 print STDERR "doc::get_text couldn't find section " .
1001 "$section\n";
1002 return "";
1003 }
1004
1005 return $section_ptr->{'text'};
1006}
1007
1008# returns the (utf-8 encoded) length of the text for a section
1009sub get_text_length {
1010 my $self = shift (@_);
1011 my ($section) = @_;
1012
1013 my $section_ptr = $self->_lookup_section($section);
1014 if (!defined $section_ptr) {
1015 print STDERR "doc::get_text_length couldn't find section " .
1016 "$section\n";
1017 return 0;
1018 }
1019
1020 return length ($section_ptr->{'text'});
1021}
1022
1023# returns the total length for all the sections
1024sub get_total_text_length {
1025 my $self = shift (@_);
1026
1027 my $section = $self->get_top_section();
1028 my $length = 0;
1029 while (defined $section) {
1030 $length += $self->get_text_length($section);
1031 $section = $self->get_next_section($section);
1032 }
1033 return $length;
1034}
1035
1036sub delete_text {
1037 my $self = shift (@_);
1038 my ($section) = @_;
1039
1040 my $section_ptr = $self->_lookup_section($section);
1041 if (!defined $section_ptr) {
1042 print STDERR "doc::delete_text couldn't find section " .
1043 "$section\n";
1044 return;
1045 }
1046
1047 $section_ptr->{'text'} = "";
1048}
1049
1050# add_text assumes the text is in (extended) ascii form. For
1051# text which has been already converted to the UTF-8 format
1052# use add_utf8_text.
1053sub add_text {
1054 my $self = shift (@_);
1055 my ($section, $text) = @_;
1056
1057 # convert the text to UTF-8 encoded unicode characters
1058 # and add the text
1059 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1060}
1061
1062
1063# add_utf8_text assumes the text to be added has already
1064# been converted to the UTF-8 encoding. For ascii text use
1065# add_text
1066sub add_utf8_text {
1067 my $self = shift (@_);
1068 my ($section, $text) = @_;
1069
1070 my $section_ptr = $self->_lookup_section($section);
1071 if (!defined $section_ptr) {
1072 print STDERR "doc::add_utf8_text couldn't find section " .
1073 "$section\n";
1074 return;
1075 }
1076
1077 $section_ptr->{'text'} .= $text;
1078}
1079
1080# returns the Source meta, which is the utf8 filename generated.
1081# Added a separate method here for convenience
1082sub get_source {
1083 my $self = shift (@_);
1084 return $self->get_metadata_element ($self->get_top_section(), "Source");
1085}
1086
1087# returns the SourceFile meta, which is the url reference to the URL-encoded
1088# version of Source (the utf8 filename). Added a separate method here for convenience
1089sub get_sourcefile {
1090 my $self = shift (@_);
1091 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1092}
1093
1094# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1095# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1096sub get_assocfile_from_sourcefile {
1097 my $self = shift (@_);
1098
1099 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1100 my $top_section = $self->get_top_section();
1101 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1102
1103 # get the actual filename as it exists on the filesystem which this url refers to
1104 $source_file = &unicode::url_to_filename($source_file);
1105 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1106 return $assocfilename;
1107}
1108
1109# methods for dealing with associated files
1110
1111# a file is associated with a document, NOT a section.
1112# if section is defined it is noted in the data structure
1113# only so that files associated from a particular section
1114# may be removed later (using delete_section_assoc_files)
1115sub associate_file {
1116 my $self = shift (@_);
1117 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1118 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1119
1120 # remove all associated files with the same name
1121 $self->delete_assoc_file ($assoc_filename);
1122
1123 # Too harsh a requirement
1124 # Definitely get HTML docs, for example, with some missing
1125 # support files
1126# if (!&util::fd_exists($real_filename)) {
1127# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1128# exit -1;
1129# }
1130
1131# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1132# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1133## my $utf8_filename = Encode::encode("utf8",$filename);
1134
1135 push (@{$self->{'associated_files'}},
1136 [$real_filename, $assoc_filename, $mime_type, $section]);
1137}
1138
1139# returns a list of associated files in the form
1140# [[real_filename, assoc_filename, mimetype], ...]
1141sub get_assoc_files {
1142 my $self = shift (@_);
1143
1144 return $self->{'associated_files'};
1145}
1146
1147# the following two methods used to keep track of original associated files
1148# for incremental building. eg a txt file used by an item file does not end
1149# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1150# file for incremental build
1151sub associate_source_file {
1152 my $self = shift (@_);
1153 my ($full_filename) = @_;
1154
1155 push (@{$self->{'source_assoc_files'}}, $full_filename);
1156
1157}
1158
1159sub get_source_assoc_files {
1160 my $self = shift (@_);
1161
1162 return $self->{'source_assoc_files'};
1163
1164
1165}
1166sub metadata_file {
1167 my $self = shift (@_);
1168 my ($real_filename, $filename) = @_;
1169
1170 push (@{$self->{'metadata_files'}},
1171 [$real_filename, $filename]);
1172}
1173
1174# used for writing out the archiveinf-doc info database, to list all the metadata files
1175sub get_meta_files {
1176 my $self = shift (@_);
1177
1178 return $self->{'metadata_files'};
1179}
1180
1181sub delete_section_assoc_files {
1182 my $self = shift (@_);
1183 my ($section) = @_;
1184
1185 my $i=0;
1186 while ($i < scalar (@{$self->{'associated_files'}})) {
1187 if (defined $self->{'associated_files'}->[$i]->[3] &&
1188 $self->{'associated_files'}->[$i]->[3] eq $section) {
1189 splice (@{$self->{'associated_files'}}, $i, 1);
1190 } else {
1191 $i++;
1192 }
1193 }
1194}
1195
1196sub delete_assoc_file {
1197 my $self = shift (@_);
1198 my ($assoc_filename) = @_;
1199
1200 my $i=0;
1201 while ($i < scalar (@{$self->{'associated_files'}})) {
1202 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1203 splice (@{$self->{'associated_files'}}, $i, 1);
1204 } else {
1205 $i++;
1206 }
1207 }
1208}
1209
1210sub reset_nextsection_ptr {
1211 my $self = shift (@_);
1212 my ($section) = @_;
1213
1214 my $section_ptr = $self->_lookup_section($section);
1215 $section_ptr->{'next_subsection'} = 1;
1216}
1217
12181;
Note: See TracBrowser for help on using the repository browser.