source: main/trunk/greenstone2/perllib/doc.pm@ 21862

Last change on this file since 21862 was 21862, checked in by davidb, 14 years ago

Documents now have 'lastmodifieddate' added as metadata in yyyymmdd format

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 29.3 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47our $OIDcount = 0;
48
49# rename_method can be 'url', 'none', 'base64'
50sub new {
51 my $class = shift (@_);
52 my ($source_filename, $doc_type, $rename_method) = @_;
53
54
55 my $self = bless {'associated_files'=>[],
56 'subsection_order'=>[],
57 'next_subsection'=>1,
58 'subsections'=>{},
59 'metadata'=>[],
60 'text'=>"",
61 'OIDtype'=>"hash"}, $class;
62
63 # used to set lastmodified here, but this can screw up the HASH ids, so
64 # the docsave processor now calls set_lastmodified
65
66 $self->{'source_path'} = $source_filename;
67
68 if (defined $source_filename) {
69 $source_filename = &util::filename_within_collection($source_filename);
70 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
71 $self->set_source_filename ($source_filename, $rename_method);
72 }
73
74 $self->set_doc_type ($doc_type) if defined $doc_type;
75
76 return $self;
77}
78# set lastmodified for OAI purposes, added by GRB, moved by kjdon
79sub set_lastmodified {
80 my $self = shift (@_);
81
82 my $source_path = $self->{'source_path'};
83
84 if (defined $source_path && (-e $source_path)) {
85
86 my $file_stat = stat($source_path);
87 my $mtime = $file_stat->mtime;
88 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
89 $wday, $yday, $isdst) = localtime($mtime);
90
91 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
92
93 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
94 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
95 }
96}
97
98# clone the $self object
99sub duplicate {
100 my $self = shift (@_);
101
102 my $newobj = {};
103
104 foreach my $k (keys %$self) {
105 $newobj->{$k} = &clone ($self->{$k});
106 }
107
108 bless $newobj, ref($self);
109 return $newobj;
110}
111
112sub clone {
113 my ($from) = @_;
114 my $type = ref ($from);
115
116 if ($type eq "HASH") {
117 my $to = {};
118 foreach my $key (keys %$from) {
119 $to->{$key} = &clone ($from->{$key});
120 }
121 return $to;
122 } elsif ($type eq "ARRAY") {
123 my $to = [];
124 foreach my $v (@$from) {
125 push (@$to, &clone ($v));
126 }
127 return $to;
128 } else {
129 return $from;
130 }
131}
132
133sub set_OIDtype {
134 my $self = shift (@_);
135 my ($type, $metadata) = @_;
136
137 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|dirname|assigned)$/) {
138 $self->{'OIDtype'} = $type;
139 } else {
140 $self->{'OIDtype'} = "hash";
141 }
142
143 if ($type =~ /^assigned$/) {
144 if (defined $metadata) {
145 $self->{'OIDmetadata'} = $metadata;
146 } else {
147 $self->{'OIDmetadata'} = "dc.Identifier";
148 }
149 }
150}
151
152# rename_method can be 'url', 'none', 'base64'
153sub set_source_filename {
154 my $self = shift (@_);
155 my ($source_filename, $rename_method) = @_;
156
157 # Since the gsdlsourcefilename element goes into the doc.xml it has
158 # to be utf8. However, it should also *represent* the source filename
159 # (in the import directory) which may not be utf8 at all.
160 # For instance, if this meta element (gsdlsourcefilename) will be used
161 # by other applications that parse doc.xml in order to locate
162 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
163 # encode the real filename as this is a binary-to-text encoding meaning
164 # that the resulting string is ASCII (utf8). Decoding will give the original.
165
166# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
167
168 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
169 my ($srcfilename,$dirname,$suffix)
170 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
171# print STDERR "-> $srcfilename -> ";
172 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
173 $source_filename = &util::filename_cat($dirname, $srcfilename);
174# print STDERR "$source_filename\n";
175
176 $self->set_utf8_metadata_element ($self->get_top_section(),
177 "gsdlsourcefilename",
178 $source_filename);
179}
180
181sub set_converted_filename {
182 my $self = shift (@_);
183 my ($converted_filename) = @_;
184
185 # we know the converted filename is utf8
186 $self->set_utf8_metadata_element ($self->get_top_section(),
187 "gsdlconvertedfilename",
188 $converted_filename);
189}
190
191# returns the source_filename as it was provided
192sub get_unmodified_source_filename {
193 my $self = shift (@_);
194
195 return $self->{'source_path'};
196}
197
198# returns the source_filename with whatever rename_method was given
199sub get_source_filename {
200 my $self = shift (@_);
201
202 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
203}
204
205
206
207# returns converted filename if available else returns source filename
208sub get_filename_for_hashing {
209 my $self = shift (@_);
210
211 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
212
213 if (!defined $filename) {
214 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
215 # if NULPlug processed file, then don't give a filename
216 if (defined $plugin_name && $plugin_name eq "NULPlug") {
217 $filename = undef;
218 } else { # returns the URL encoded source filename!
219 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
220 }
221 }
222 return $filename;
223}
224
225sub set_doc_type {
226 my $self = shift (@_);
227 my ($doc_type) = @_;
228
229 $self->set_metadata_element ($self->get_top_section(),
230 "gsdldoctype",
231 $doc_type);
232}
233
234# returns the gsdldoctype as it was provided
235# the default of "indexed_doc" is used if no document
236# type was provided
237sub get_doc_type {
238 my $self = shift (@_);
239
240 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
241 return $doc_type if (defined $doc_type);
242 return "indexed_doc";
243}
244
245
246# look up the reference to the a particular section
247sub _lookup_section {
248 my $self = shift (@_);
249 my ($section) = @_;
250
251 my ($num);
252 my $sectionref = $self;
253
254 while (defined $section && $section ne "") {
255
256 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
257
258 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
259
260 $section = "" unless defined $section;
261
262
263 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
264 $sectionref = $sectionref->{'subsections'}->{$num};
265 } else {
266 return undef;
267 }
268 }
269
270 return $sectionref;
271}
272
273# calculate OID by hashing the contents of the document
274sub _calc_OID {
275 my $self = shift (@_);
276 my ($filename) = @_;
277
278 my $osexe = &util::get_os_exe();
279
280 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
281 $ENV{'GSDLOS'},"hashfile$osexe");
282
283 my $result = "NULL";
284
285 if (-e "$hashfile_exe") {
286# $result = `\"$hashfile_exe\" \"$filename\"`;
287# $result = `hashfile$osexe \"$filename\" 2>&1`;
288 $result = `hashfile$osexe \"$filename\"`;
289
290 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
291 } else {
292 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
293 }
294 return "HASH$result";
295}
296
297# methods dealing with OID, not groups of them.
298
299# if $OID is not provided one is calculated
300sub set_OID {
301 my $self = shift (@_);
302 my ($OID) = @_;
303
304 my $use_hash_oid = 0;
305 # if an OID wasn't provided calculate one
306 if (!defined $OID) {
307 $OID = "NULL";
308 if ($self->{'OIDtype'} =~ /^hash/) {
309 $use_hash_oid = 1;
310 } elsif ($self->{'OIDtype'} eq "incremental") {
311 $OID = "D" . $OIDcount;
312 $OIDcount ++;
313
314 } elsif ($self->{'OIDtype'} eq "dirname") {
315 $OID = 'J';
316 my $filename = $self->get_source_filename();
317 if (defined($filename)) { # && -e $filename) {
318 $OID = &File::Basename::dirname($filename);
319 if (defined $OID) {
320 $OID = 'J'.&File::Basename::basename($OID);
321 $OID = &util::tidy_up_oid($OID);
322 } else {
323 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
324 $use_hash_oid = 1;
325 }
326 } else {
327 print STDERR "Failed to find filename, generating hash id\n";
328 $use_hash_oid = 1;
329 }
330
331 } elsif ($self->{'OIDtype'} eq "assigned") {
332 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
333 if (defined $identifier && $identifier ne "") {
334 $OID = $identifier;
335 $OID = &util::tidy_up_oid($OID);
336 } else {
337 # need a hash id
338 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
339 $use_hash_oid = 1;
340 }
341
342 } else {
343 $use_hash_oid = 1;
344 }
345
346 if ($use_hash_oid) {
347 my $hash_on_file = 1;
348 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
349 $hash_on_file = 0;
350 }
351 if ($hash_on_file) {
352 # "hash" OID - feed file to hashfile.exe
353 my $filename = $self->get_filename_for_hashing();
354 # -z: don't want to hash on the file if it is zero size
355 if (defined($filename) && -e $filename && !-z $filename) {
356 $OID = $self->_calc_OID ($filename);
357 } else {
358 $hash_on_file = 0;
359 }
360 }
361 if (!$hash_on_file) {
362 my $filename = &util::get_tmp_filename();
363 if (!open (OUTFILE, ">$filename")) {
364 print STDERR "doc::set_OID could not write to $filename\n";
365 } else {
366 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
367 print OUTFILE $doc_text;
368 close (OUTFILE);
369 }
370 $OID = $self->_calc_OID ($filename);
371 &util::rm ($filename);
372 }
373 }
374 }
375 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
376}
377
378# this uses hashdoc (embedded c thingy) which is faster but still
379# needs a little work to be suffiently stable
380sub ___set_OID {
381 my $self = shift (@_);
382 my ($OID) = @_;
383
384 # if an OID wasn't provided then calculate hash value based on document
385 if (!defined $OID)
386 {
387 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
388 my $hash_len = length($hash_text);
389
390 $OID = &hashdoc::buffer($hash_text,$hash_len);
391 }
392
393 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
394}
395
396# returns the OID for this document
397sub get_OID {
398 my $self = shift (@_);
399 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
400 return $OID if (defined $OID);
401 return "NULL";
402}
403
404sub delete_OID {
405 my $self = shift (@_);
406
407 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
408}
409
410
411# methods for manipulating section names
412
413# returns the name of the top-most section (the top
414# level of the document
415sub get_top_section {
416 my $self = shift (@_);
417
418 return "";
419}
420
421# returns a section
422sub get_parent_section {
423 my $self = shift (@_);
424 my ($section) = @_;
425
426 $section =~ s/(^|\.)\d+$//;
427
428 return $section;
429}
430
431# returns the first child section (or the end child
432# if there isn't any)
433sub get_begin_child {
434 my $self = shift (@_);
435 my ($section) = @_;
436
437 my $section_ptr = $self->_lookup_section($section);
438 return "" unless defined $section_ptr;
439
440 if (defined $section_ptr->{'subsection_order'}->[0]) {
441 return "$section.$section_ptr->{'subsection_order'}->[0]";
442 }
443
444 return $self->get_end_child ($section);
445}
446
447# returns the next child of a parent section
448sub get_next_child {
449 my $self = shift (@_);
450 my ($section) = @_;
451
452 my $parent_section = $self->get_parent_section($section);
453 my $parent_section_ptr = $self->_lookup_section($parent_section);
454 return undef unless defined $parent_section_ptr;
455
456 my ($section_num) = $section =~ /(\d+)$/;
457 return undef unless defined $section_num;
458
459 my $i = 0;
460 my $section_order = $parent_section_ptr->{'subsection_order'};
461 while ($i < scalar(@$section_order)) {
462 last if $section_order->[$i] eq $section_num;
463 $i++;
464 }
465
466 $i++; # the next child
467 if ($i < scalar(@$section_order)) {
468 return $section_order->[$i] if $parent_section eq "";
469 return "$parent_section.$section_order->[$i]";
470 }
471
472 # no more sections in this level
473 return undef;
474}
475
476# returns a reference to a list of children
477sub get_children {
478 my $self = shift (@_);
479 my ($section) = @_;
480
481 my $section_ptr = $self->_lookup_section($section);
482 return [] unless defined $section_ptr;
483
484 my @children = @{$section_ptr->{'subsection_order'}};
485
486 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
487 return \@children;
488}
489
490# returns the child section one past the last one (which
491# is coded as "0")
492sub get_end_child {
493 my $self = shift (@_);
494 my ($section) = @_;
495
496 return $section . ".0" unless $section eq "";
497 return "0";
498}
499
500# returns the next section in book order
501sub get_next_section {
502 my $self = shift (@_);
503 my ($section) = @_;
504
505 return undef unless defined $section;
506
507 my $section_ptr = $self->_lookup_section($section);
508 return undef unless defined $section_ptr;
509
510 # first try to find first child
511 if (defined $section_ptr->{'subsection_order'}->[0]) {
512 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
513 return "$section.$section_ptr->{'subsection_order'}->[0]";
514 }
515
516 do {
517 # try to find sibling
518 my $next_child = $self->get_next_child ($section);
519 return $next_child if (defined $next_child);
520
521 # move up one level
522 $section = $self->get_parent_section ($section);
523 } while $section =~ /\d/;
524
525 return undef;
526}
527
528sub is_leaf_section {
529 my $self = shift (@_);
530 my ($section) = @_;
531
532 my $section_ptr = $self->_lookup_section($section);
533 return 1 unless defined $section_ptr;
534
535 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
536}
537
538# methods for dealing with sections
539
540# returns the name of the inserted section
541sub insert_section {
542 my $self = shift (@_);
543 my ($before_section) = @_;
544
545 # get the child to insert before and its parent section
546 my $parent_section = "";
547 my $before_child = "0";
548 my @before_section = split (/\./, $before_section);
549 if (scalar(@before_section) > 0) {
550 $before_child = pop (@before_section);
551 $parent_section = join (".", @before_section);
552 }
553
554 my $parent_section_ptr = $self->_lookup_section($parent_section);
555 if (!defined $parent_section_ptr) {
556 print STDERR "doc::insert_section couldn't find parent section " .
557 "$parent_section\n";
558 return;
559 }
560
561 # get the next section number
562 my $section_num = $parent_section_ptr->{'next_subsection'}++;
563
564 my $i = 0;
565 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
566 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
567 $i++;
568 }
569
570 # insert the section number into the order list
571 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
572
573 # add this section to the parent section
574 my $section_ptr = {'subsection_order'=>[],
575 'next_subsection'=>1,
576 'subsections'=>{},
577 'metadata'=>[],
578 'text'=>""};
579 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
580
581 # work out the full section number
582 my $section = $parent_section;
583 $section .= "." unless $section eq "";
584 $section .= $section_num;
585
586 return $section;
587}
588
589# creates a pre-named section
590sub create_named_section {
591 my $self = shift (@_);
592 my ($mastersection) = @_;
593
594 my ($num);
595 my $section = $mastersection;
596 my $sectionref = $self;
597
598 while ($section ne "") {
599 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
600 $num =~ s/^0+(\d)/$1/; # remove leading 0s
601 $section = "" unless defined $section;
602
603 if (defined $num) {
604 if (!defined $sectionref->{'subsections'}->{$num}) {
605 push (@{$sectionref->{'subsection_order'}}, $num);
606 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
607 'next_subsection'=>1,
608 'subsections'=>{},
609 'metadata'=>[],
610 'text'=>""};
611 if ($num >= $sectionref->{'next_subsection'}) {
612 $sectionref->{'next_subsection'} = $num + 1;
613 }
614 }
615 $sectionref = $sectionref->{'subsections'}->{$num};
616
617 } else {
618 print STDERR "doc::create_named_section couldn't create section ";
619 print STDERR "$mastersection\n";
620 last;
621 }
622 }
623}
624
625# returns a reference to a list of subsections
626sub list_subsections {
627 my $self = shift (@_);
628 my ($section) = @_;
629
630 my $section_ptr = $self->_lookup_section ($section);
631 if (!defined $section_ptr) {
632 print STDERR "doc::list_subsections couldn't find section $section\n";
633 return [];
634 }
635
636 return [@{$section_ptr->{'subsection_order'}}];
637}
638
639sub delete_section {
640 my $self = shift (@_);
641 my ($section) = @_;
642
643# my $section_ptr = {'subsection_order'=>[],
644# 'next_subsection'=>1,
645# 'subsections'=>{},
646# 'metadata'=>[],
647# 'text'=>""};
648
649 # if this is the top section reset everything
650 if ($section eq "") {
651 $self->{'subsection_order'} = [];
652 $self->{'subsections'} = {};
653 $self->{'metadata'} = [];
654 $self->{'text'} = "";
655 return;
656 }
657
658 # find the parent of the section to delete
659 my $parent_section = "";
660 my $child = "0";
661 my @section = split (/\./, $section);
662 if (scalar(@section) > 0) {
663 $child = pop (@section);
664 $parent_section = join (".", @section);
665 }
666
667 my $parent_section_ptr = $self->_lookup_section($parent_section);
668 if (!defined $parent_section_ptr) {
669 print STDERR "doc::delete_section couldn't find parent section " .
670 "$parent_section\n";
671 return;
672 }
673
674 # remove this section from the subsection_order list
675 my $i = 0;
676 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
677 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
678 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
679 last;
680 }
681 $i++;
682 }
683
684 # remove this section from the subsection hash
685 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
686 undef $parent_section_ptr->{'subsections'}->{$child};
687 }
688}
689
690#--
691# methods for dealing with metadata
692
693# set_metadata_element and get_metadata_element are for metadata
694# which should only have one value. add_meta_data and get_metadata
695# are for metadata which can have more than one value.
696
697# returns the first metadata value which matches field
698
699# This version of get metadata element works much like the one above,
700# except it allows for the namespace portion of a metadata element to
701# be ignored, thus if you are searching for dc.Title, the first piece
702# of matching metadata ending with the name Title (once any namespace
703# is removed) would be returned.
704# 28-11-2003 John Thompson
705sub get_metadata_element {
706 my $self = shift (@_);
707 my ($section, $field, $ignore_namespace) = @_;
708 my ($data);
709
710 $ignore_namespace = 0 unless defined $ignore_namespace;
711
712 my $section_ptr = $self->_lookup_section($section);
713 if (!defined $section_ptr) {
714 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
715 return;
716 }
717
718 # Remove the namespace if we are being told to ignore them
719 if($ignore_namespace) {
720 $field =~ s/^\w*\.//;
721 }
722
723 foreach $data (@{$section_ptr->{'metadata'}}) {
724
725 my $data_name = $data->[0];
726
727 # Remove the any namespace if we are being told to ignore them
728 if($ignore_namespace) {
729 $data_name =~ s/^\w*\.//;
730 }
731
732 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
733 }
734
735 return undef; # was not found
736}
737
738# returns a list of the form [value1, value2, ...]
739sub get_metadata {
740 my $self = shift (@_);
741 my ($section, $field, $ignore_namespace) = @_;
742 my ($data);
743
744 $ignore_namespace = 0 unless defined $ignore_namespace;
745
746 my $section_ptr = $self->_lookup_section($section);
747 if (!defined $section_ptr) {
748 print STDERR "doc::get_metadata couldn't find section ",
749 $section, "\n";
750 return;
751 }
752
753 # Remove the any namespace if we are being told to ignore them
754 if($ignore_namespace) {
755 $field =~ s/^\w*\.//;
756 }
757
758 my @metadata = ();
759 foreach $data (@{$section_ptr->{'metadata'}}) {
760
761 my $data_name = $data->[0];
762 # Remove the any namespace if we are being told to ignore them
763 if($ignore_namespace) {
764 $data_name =~ s/^\w*\.//;
765 }
766
767 push (@metadata, $data->[1]) if ($data_name eq $field);
768 }
769
770 return \@metadata;
771}
772
773# returns a list of the form [[field,value],[field,value],...]
774sub get_all_metadata {
775 my $self = shift (@_);
776 my ($section) = @_;
777
778 my $section_ptr = $self->_lookup_section($section);
779 if (!defined $section_ptr) {
780 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
781 return;
782 }
783
784 return $section_ptr->{'metadata'};
785}
786
787# $value is optional
788sub delete_metadata {
789 my $self = shift (@_);
790 my ($section, $field, $value) = @_;
791
792 my $section_ptr = $self->_lookup_section($section);
793 if (!defined $section_ptr) {
794 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
795 return;
796 }
797
798 my $i = 0;
799 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
800 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
801 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
802 splice (@{$section_ptr->{'metadata'}}, $i, 1);
803 } else {
804 $i++;
805 }
806 }
807}
808
809sub delete_all_metadata {
810 my $self = shift (@_);
811 my ($section) = @_;
812
813 my $section_ptr = $self->_lookup_section($section);
814 if (!defined $section_ptr) {
815 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
816 return;
817 }
818
819 $section_ptr->{'metadata'} = [];
820}
821
822sub set_metadata_element {
823 my $self = shift (@_);
824 my ($section, $field, $value) = @_;
825
826 $self->set_utf8_metadata_element ($section, $field,
827 &unicode::ascii2utf8(\$value));
828}
829
830# set_utf8_metadata_element assumes the text has already been
831# converted to the UTF-8 encoding.
832sub set_utf8_metadata_element {
833 my $self = shift (@_);
834 my ($section, $field, $value) = @_;
835
836 $self->delete_metadata ($section, $field);
837 $self->add_utf8_metadata ($section, $field, $value);
838}
839
840
841# add_metadata assumes the text is in (extended) ascii form. For
842# text which has already been converted to the UTF-8 format use
843# add_utf8_metadata.
844sub add_metadata {
845 my $self = shift (@_);
846 my ($section, $field, $value) = @_;
847
848 $self->add_utf8_metadata ($section, $field,
849 &unicode::ascii2utf8(\$value));
850}
851
852sub add_utf8_metadata {
853 my $self = shift (@_);
854 my ($section, $field, $value) = @_;
855
856 my $section_ptr = $self->_lookup_section($section);
857 if (!defined $section_ptr) {
858 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
859 return;
860 }
861 if (!defined $value) {
862 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
863 return;
864 }
865 if (!defined $field) {
866 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
867 return;
868 }
869
870 #print STDERR "###$field=$value\n";
871 # double check that the value is utf-8
872 if (!&unicode::check_is_utf8($value)) {
873 print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
874 &unicode::ensure_utf8(\$value);
875 print STDERR " Tried converting to utf8: $value\n";
876 }
877
878 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
879}
880
881
882# methods for dealing with text
883
884# returns the text for a section
885sub get_text {
886 my $self = shift (@_);
887 my ($section) = @_;
888
889 my $section_ptr = $self->_lookup_section($section);
890 if (!defined $section_ptr) {
891 print STDERR "doc::get_text couldn't find section " .
892 "$section\n";
893 return "";
894 }
895
896 return $section_ptr->{'text'};
897}
898
899# returns the (utf-8 encoded) length of the text for a section
900sub get_text_length {
901 my $self = shift (@_);
902 my ($section) = @_;
903
904 my $section_ptr = $self->_lookup_section($section);
905 if (!defined $section_ptr) {
906 print STDERR "doc::get_text_length couldn't find section " .
907 "$section\n";
908 return 0;
909 }
910
911 return length ($section_ptr->{'text'});
912}
913
914sub delete_text {
915 my $self = shift (@_);
916 my ($section) = @_;
917
918 my $section_ptr = $self->_lookup_section($section);
919 if (!defined $section_ptr) {
920 print STDERR "doc::delete_text couldn't find section " .
921 "$section\n";
922 return;
923 }
924
925 $section_ptr->{'text'} = "";
926}
927
928# add_text assumes the text is in (extended) ascii form. For
929# text which has been already converted to the UTF-8 format
930# use add_utf8_text.
931sub add_text {
932 my $self = shift (@_);
933 my ($section, $text) = @_;
934
935 # convert the text to UTF-8 encoded unicode characters
936 # and add the text
937 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
938}
939
940
941# add_utf8_text assumes the text to be added has already
942# been converted to the UTF-8 encoding. For ascii text use
943# add_text
944sub add_utf8_text {
945 my $self = shift (@_);
946 my ($section, $text) = @_;
947
948 my $section_ptr = $self->_lookup_section($section);
949 if (!defined $section_ptr) {
950 print STDERR "doc::add_utf8_text couldn't find section " .
951 "$section\n";
952 return;
953 }
954
955 $section_ptr->{'text'} .= $text;
956}
957
958# returns the Source meta, which is the utf8 filename generated.
959# Added a separate method here for convenience
960sub get_source {
961 my $self = shift (@_);
962 return $self->get_metadata_element ($self->get_top_section(), "Source");
963}
964
965# returns the SourceFile meta, which is the url reference to the URL-encoded
966# version of Source (the utf8 filename). Added a separate method here for convenience
967sub get_sourcefile {
968 my $self = shift (@_);
969 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
970}
971
972# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
973# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
974sub get_assocfile_from_sourcefile {
975 my $self = shift (@_);
976
977 # get the SourceFile meta, which is a *URL* to a file on the filesystem
978 my $top_section = $self->get_top_section();
979 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
980
981 # get the actual filename as it exists on the filesystem which this url refers to
982 $source_file = &unicode::url_to_filename($source_file);
983 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
984 return $assocfilename;
985}
986
987# methods for dealing with associated files
988
989# a file is associated with a document, NOT a section.
990# if section is defined it is noted in the data structure
991# only so that files associated from a particular section
992# may be removed later (using delete_section_assoc_files)
993sub associate_file {
994 my $self = shift (@_);
995 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
996 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
997
998 # remove all associated files with the same name
999 $self->delete_assoc_file ($assoc_filename);
1000
1001 push (@{$self->{'associated_files'}},
1002 [$real_filename, $assoc_filename, $mime_type, $section]);
1003}
1004
1005# returns a list of associated files in the form
1006# [[real_filename, assoc_filename, mimetype], ...]
1007sub get_assoc_files {
1008 my $self = shift (@_);
1009
1010 return $self->{'associated_files'};
1011}
1012
1013# the following two methods used to keep track of original associated files
1014# for incremental building. eg a txt file used by an item file does not end
1015# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1016# file for incremental build
1017sub associate_source_file {
1018 my $self = shift (@_);
1019 my ($full_filename) = @_;
1020
1021 push (@{$self->{'source_assoc_files'}}, $full_filename);
1022
1023}
1024
1025sub get_source_assoc_files {
1026 my $self = shift (@_);
1027
1028 return $self->{'source_assoc_files'};
1029
1030
1031}
1032sub metadata_file {
1033 my $self = shift (@_);
1034 my ($real_filename, $filename) = @_;
1035
1036 push (@{$self->{'metadata_files'}},
1037 [$real_filename, $filename]);
1038}
1039
1040# used for writing out the archiveinf-doc info database, to list all the metadata files
1041sub get_meta_files {
1042 my $self = shift (@_);
1043
1044 return $self->{'metadata_files'};
1045}
1046
1047sub delete_section_assoc_files {
1048 my $self = shift (@_);
1049 my ($section) = @_;
1050
1051 my $i=0;
1052 while ($i < scalar (@{$self->{'associated_files'}})) {
1053 if (defined $self->{'associated_files'}->[$i]->[3] &&
1054 $self->{'associated_files'}->[$i]->[3] eq $section) {
1055 splice (@{$self->{'associated_files'}}, $i, 1);
1056 } else {
1057 $i++;
1058 }
1059 }
1060}
1061
1062sub delete_assoc_file {
1063 my $self = shift (@_);
1064 my ($assoc_filename) = @_;
1065
1066 my $i=0;
1067 while ($i < scalar (@{$self->{'associated_files'}})) {
1068 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1069 splice (@{$self->{'associated_files'}}, $i, 1);
1070 } else {
1071 $i++;
1072 }
1073 }
1074}
1075
1076sub reset_nextsection_ptr {
1077 my $self = shift (@_);
1078 my ($section) = @_;
1079
1080 my $section_ptr = $self->_lookup_section($section);
1081 $section_ptr->{'next_subsection'} = 1;
1082}
1083
10841;
Note: See TracBrowser for help on using the repository browser.