source: main/trunk/greenstone2/perllib/doc.pm@ 22294

Last change on this file since 22294 was 22294, checked in by kjdon, 14 years ago

when asking for a piece of metadata, want to remove ex. from the ones we are searching through. Some metadata will be stored internally as ex., eg ex.File.MIMEType from EmbeddedMetadataPlugin. But this is equivalent to File.MIMEType

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 29.6 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47our $OIDcount = 0;
48
49# rename_method can be 'url', 'none', 'base64'
50sub new {
51 my $class = shift (@_);
52 my ($source_filename, $doc_type, $rename_method) = @_;
53
54
55 my $self = bless {'associated_files'=>[],
56 'subsection_order'=>[],
57 'next_subsection'=>1,
58 'subsections'=>{},
59 'metadata'=>[],
60 'text'=>"",
61 'OIDtype'=>"hash"}, $class;
62
63 # used to set lastmodified here, but this can screw up the HASH ids, so
64 # the docsave processor now calls set_lastmodified
65
66 $self->{'source_path'} = $source_filename;
67
68 if (defined $source_filename) {
69 $source_filename = &util::filename_within_collection($source_filename);
70 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
71 $self->set_source_filename ($source_filename, $rename_method);
72 }
73
74 $self->set_doc_type ($doc_type) if defined $doc_type;
75
76 return $self;
77}
78# set lastmodified for OAI purposes, added by GRB, moved by kjdon
79sub set_lastmodified {
80 my $self = shift (@_);
81
82 my $source_path = $self->{'source_path'};
83
84 if (defined $source_path && (-e $source_path)) {
85
86 my $file_stat = stat($source_path);
87 my $mtime = $file_stat->mtime;
88 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
89 $wday, $yday, $isdst) = localtime($mtime);
90
91 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
92
93 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
94 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
95 }
96}
97
98# clone the $self object
99sub duplicate {
100 my $self = shift (@_);
101
102 my $newobj = {};
103
104 foreach my $k (keys %$self) {
105 $newobj->{$k} = &clone ($self->{$k});
106 }
107
108 bless $newobj, ref($self);
109 return $newobj;
110}
111
112sub clone {
113 my ($from) = @_;
114 my $type = ref ($from);
115
116 if ($type eq "HASH") {
117 my $to = {};
118 foreach my $key (keys %$from) {
119 $to->{$key} = &clone ($from->{$key});
120 }
121 return $to;
122 } elsif ($type eq "ARRAY") {
123 my $to = [];
124 foreach my $v (@$from) {
125 push (@$to, &clone ($v));
126 }
127 return $to;
128 } else {
129 return $from;
130 }
131}
132
133sub set_OIDtype {
134 my $self = shift (@_);
135 my ($type, $metadata) = @_;
136
137 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|dirname|assigned)$/) {
138 $self->{'OIDtype'} = $type;
139 } else {
140 $self->{'OIDtype'} = "hash";
141 }
142
143 if ($type =~ /^assigned$/) {
144 if (defined $metadata) {
145 $self->{'OIDmetadata'} = $metadata;
146 } else {
147 $self->{'OIDmetadata'} = "dc.Identifier";
148 }
149 }
150}
151
152# rename_method can be 'url', 'none', 'base64'
153sub set_source_filename {
154 my $self = shift (@_);
155 my ($source_filename, $rename_method) = @_;
156
157 # Since the gsdlsourcefilename element goes into the doc.xml it has
158 # to be utf8. However, it should also *represent* the source filename
159 # (in the import directory) which may not be utf8 at all.
160 # For instance, if this meta element (gsdlsourcefilename) will be used
161 # by other applications that parse doc.xml in order to locate
162 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
163 # encode the real filename as this is a binary-to-text encoding meaning
164 # that the resulting string is ASCII (utf8). Decoding will give the original.
165
166# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
167
168 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
169 my ($srcfilename,$dirname,$suffix)
170 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
171# print STDERR "-> $srcfilename -> ";
172 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
173 $source_filename = &util::filename_cat($dirname, $srcfilename);
174# print STDERR "$source_filename\n";
175
176 $self->set_utf8_metadata_element ($self->get_top_section(),
177 "gsdlsourcefilename",
178 $source_filename);
179}
180
181sub set_converted_filename {
182 my $self = shift (@_);
183 my ($converted_filename) = @_;
184
185 # we know the converted filename is utf8
186 $self->set_utf8_metadata_element ($self->get_top_section(),
187 "gsdlconvertedfilename",
188 $converted_filename);
189}
190
191# returns the source_filename as it was provided
192sub get_unmodified_source_filename {
193 my $self = shift (@_);
194
195 return $self->{'source_path'};
196}
197
198# returns the source_filename with whatever rename_method was given
199sub get_source_filename {
200 my $self = shift (@_);
201
202 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
203}
204
205
206
207# returns converted filename if available else returns source filename
208sub get_filename_for_hashing {
209 my $self = shift (@_);
210
211 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
212
213 if (!defined $filename) {
214 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
215 # if NULPlug processed file, then don't give a filename
216 if (defined $plugin_name && $plugin_name eq "NULPlug") {
217 $filename = undef;
218 } else { # returns the URL encoded source filename!
219 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
220 }
221 }
222 return $filename;
223}
224
225sub set_doc_type {
226 my $self = shift (@_);
227 my ($doc_type) = @_;
228
229 $self->set_metadata_element ($self->get_top_section(),
230 "gsdldoctype",
231 $doc_type);
232}
233
234# returns the gsdldoctype as it was provided
235# the default of "indexed_doc" is used if no document
236# type was provided
237sub get_doc_type {
238 my $self = shift (@_);
239
240 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
241 return $doc_type if (defined $doc_type);
242 return "indexed_doc";
243}
244
245
246# look up the reference to the a particular section
247sub _lookup_section {
248 my $self = shift (@_);
249 my ($section) = @_;
250
251 my ($num);
252 my $sectionref = $self;
253
254 while (defined $section && $section ne "") {
255
256 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
257
258 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
259
260 $section = "" unless defined $section;
261
262
263 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
264 $sectionref = $sectionref->{'subsections'}->{$num};
265 } else {
266 return undef;
267 }
268 }
269
270 return $sectionref;
271}
272
273# calculate OID by hashing the contents of the document
274sub _calc_OID {
275 my $self = shift (@_);
276 my ($filename) = @_;
277
278 my $osexe = &util::get_os_exe();
279
280 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
281 $ENV{'GSDLOS'},"hashfile$osexe");
282
283 my $result = "NULL";
284
285 if (-e "$hashfile_exe") {
286# $result = `\"$hashfile_exe\" \"$filename\"`;
287# $result = `hashfile$osexe \"$filename\" 2>&1`;
288 $result = `hashfile$osexe \"$filename\"`;
289
290 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
291 } else {
292 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
293 }
294 return "HASH$result";
295}
296
297# methods dealing with OID, not groups of them.
298
299# if $OID is not provided one is calculated
300sub set_OID {
301 my $self = shift (@_);
302 my ($OID) = @_;
303
304 my $use_hash_oid = 0;
305 # if an OID wasn't provided calculate one
306 if (!defined $OID) {
307 $OID = "NULL";
308 if ($self->{'OIDtype'} =~ /^hash/) {
309 $use_hash_oid = 1;
310 } elsif ($self->{'OIDtype'} eq "incremental") {
311 $OID = "D" . $OIDcount;
312 $OIDcount ++;
313
314 } elsif ($self->{'OIDtype'} eq "dirname") {
315 $OID = 'J';
316 my $filename = $self->get_source_filename();
317 if (defined($filename)) { # && -e $filename) {
318 $OID = &File::Basename::dirname($filename);
319 if (defined $OID) {
320 $OID = 'J'.&File::Basename::basename($OID);
321 $OID = &util::tidy_up_oid($OID);
322 } else {
323 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
324 $use_hash_oid = 1;
325 }
326 } else {
327 print STDERR "Failed to find filename, generating hash id\n";
328 $use_hash_oid = 1;
329 }
330
331 } elsif ($self->{'OIDtype'} eq "assigned") {
332 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
333 if (defined $identifier && $identifier ne "") {
334 $OID = $identifier;
335 $OID = &util::tidy_up_oid($OID);
336 } else {
337 # need a hash id
338 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
339 $use_hash_oid = 1;
340 }
341
342 } else {
343 $use_hash_oid = 1;
344 }
345
346 if ($use_hash_oid) {
347 my $hash_on_file = 1;
348 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
349 $hash_on_file = 0;
350 }
351 if ($hash_on_file) {
352 # "hash" OID - feed file to hashfile.exe
353 my $filename = $self->get_filename_for_hashing();
354 # -z: don't want to hash on the file if it is zero size
355 if (defined($filename) && -e $filename && !-z $filename) {
356 $OID = $self->_calc_OID ($filename);
357 } else {
358 $hash_on_file = 0;
359 }
360 }
361 if (!$hash_on_file) {
362 my $filename = &util::get_tmp_filename();
363 if (!open (OUTFILE, ">$filename")) {
364 print STDERR "doc::set_OID could not write to $filename\n";
365 } else {
366 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
367 print OUTFILE $doc_text;
368 close (OUTFILE);
369 }
370 $OID = $self->_calc_OID ($filename);
371 &util::rm ($filename);
372 }
373 }
374 }
375 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
376}
377
378# this uses hashdoc (embedded c thingy) which is faster but still
379# needs a little work to be suffiently stable
380sub ___set_OID {
381 my $self = shift (@_);
382 my ($OID) = @_;
383
384 # if an OID wasn't provided then calculate hash value based on document
385 if (!defined $OID)
386 {
387 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
388 my $hash_len = length($hash_text);
389
390 $OID = &hashdoc::buffer($hash_text,$hash_len);
391 }
392
393 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
394}
395
396# returns the OID for this document
397sub get_OID {
398 my $self = shift (@_);
399 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
400 return $OID if (defined $OID);
401 return "NULL";
402}
403
404sub delete_OID {
405 my $self = shift (@_);
406
407 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
408}
409
410
411# methods for manipulating section names
412
413# returns the name of the top-most section (the top
414# level of the document
415sub get_top_section {
416 my $self = shift (@_);
417
418 return "";
419}
420
421# returns a section
422sub get_parent_section {
423 my $self = shift (@_);
424 my ($section) = @_;
425
426 $section =~ s/(^|\.)\d+$//;
427
428 return $section;
429}
430
431# returns the first child section (or the end child
432# if there isn't any)
433sub get_begin_child {
434 my $self = shift (@_);
435 my ($section) = @_;
436
437 my $section_ptr = $self->_lookup_section($section);
438 return "" unless defined $section_ptr;
439
440 if (defined $section_ptr->{'subsection_order'}->[0]) {
441 return "$section.$section_ptr->{'subsection_order'}->[0]";
442 }
443
444 return $self->get_end_child ($section);
445}
446
447# returns the next child of a parent section
448sub get_next_child {
449 my $self = shift (@_);
450 my ($section) = @_;
451
452 my $parent_section = $self->get_parent_section($section);
453 my $parent_section_ptr = $self->_lookup_section($parent_section);
454 return undef unless defined $parent_section_ptr;
455
456 my ($section_num) = $section =~ /(\d+)$/;
457 return undef unless defined $section_num;
458
459 my $i = 0;
460 my $section_order = $parent_section_ptr->{'subsection_order'};
461 while ($i < scalar(@$section_order)) {
462 last if $section_order->[$i] eq $section_num;
463 $i++;
464 }
465
466 $i++; # the next child
467 if ($i < scalar(@$section_order)) {
468 return $section_order->[$i] if $parent_section eq "";
469 return "$parent_section.$section_order->[$i]";
470 }
471
472 # no more sections in this level
473 return undef;
474}
475
476# returns a reference to a list of children
477sub get_children {
478 my $self = shift (@_);
479 my ($section) = @_;
480
481 my $section_ptr = $self->_lookup_section($section);
482 return [] unless defined $section_ptr;
483
484 my @children = @{$section_ptr->{'subsection_order'}};
485
486 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
487 return \@children;
488}
489
490# returns the child section one past the last one (which
491# is coded as "0")
492sub get_end_child {
493 my $self = shift (@_);
494 my ($section) = @_;
495
496 return $section . ".0" unless $section eq "";
497 return "0";
498}
499
500# returns the next section in book order
501sub get_next_section {
502 my $self = shift (@_);
503 my ($section) = @_;
504
505 return undef unless defined $section;
506
507 my $section_ptr = $self->_lookup_section($section);
508 return undef unless defined $section_ptr;
509
510 # first try to find first child
511 if (defined $section_ptr->{'subsection_order'}->[0]) {
512 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
513 return "$section.$section_ptr->{'subsection_order'}->[0]";
514 }
515
516 do {
517 # try to find sibling
518 my $next_child = $self->get_next_child ($section);
519 return $next_child if (defined $next_child);
520
521 # move up one level
522 $section = $self->get_parent_section ($section);
523 } while $section =~ /\d/;
524
525 return undef;
526}
527
528sub is_leaf_section {
529 my $self = shift (@_);
530 my ($section) = @_;
531
532 my $section_ptr = $self->_lookup_section($section);
533 return 1 unless defined $section_ptr;
534
535 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
536}
537
538# methods for dealing with sections
539
540# returns the name of the inserted section
541sub insert_section {
542 my $self = shift (@_);
543 my ($before_section) = @_;
544
545 # get the child to insert before and its parent section
546 my $parent_section = "";
547 my $before_child = "0";
548 my @before_section = split (/\./, $before_section);
549 if (scalar(@before_section) > 0) {
550 $before_child = pop (@before_section);
551 $parent_section = join (".", @before_section);
552 }
553
554 my $parent_section_ptr = $self->_lookup_section($parent_section);
555 if (!defined $parent_section_ptr) {
556 print STDERR "doc::insert_section couldn't find parent section " .
557 "$parent_section\n";
558 return;
559 }
560
561 # get the next section number
562 my $section_num = $parent_section_ptr->{'next_subsection'}++;
563
564 my $i = 0;
565 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
566 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
567 $i++;
568 }
569
570 # insert the section number into the order list
571 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
572
573 # add this section to the parent section
574 my $section_ptr = {'subsection_order'=>[],
575 'next_subsection'=>1,
576 'subsections'=>{},
577 'metadata'=>[],
578 'text'=>""};
579 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
580
581 # work out the full section number
582 my $section = $parent_section;
583 $section .= "." unless $section eq "";
584 $section .= $section_num;
585
586 return $section;
587}
588
589# creates a pre-named section
590sub create_named_section {
591 my $self = shift (@_);
592 my ($mastersection) = @_;
593
594 my ($num);
595 my $section = $mastersection;
596 my $sectionref = $self;
597
598 while ($section ne "") {
599 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
600 $num =~ s/^0+(\d)/$1/; # remove leading 0s
601 $section = "" unless defined $section;
602
603 if (defined $num) {
604 if (!defined $sectionref->{'subsections'}->{$num}) {
605 push (@{$sectionref->{'subsection_order'}}, $num);
606 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
607 'next_subsection'=>1,
608 'subsections'=>{},
609 'metadata'=>[],
610 'text'=>""};
611 if ($num >= $sectionref->{'next_subsection'}) {
612 $sectionref->{'next_subsection'} = $num + 1;
613 }
614 }
615 $sectionref = $sectionref->{'subsections'}->{$num};
616
617 } else {
618 print STDERR "doc::create_named_section couldn't create section ";
619 print STDERR "$mastersection\n";
620 last;
621 }
622 }
623}
624
625# returns a reference to a list of subsections
626sub list_subsections {
627 my $self = shift (@_);
628 my ($section) = @_;
629
630 my $section_ptr = $self->_lookup_section ($section);
631 if (!defined $section_ptr) {
632 print STDERR "doc::list_subsections couldn't find section $section\n";
633 return [];
634 }
635
636 return [@{$section_ptr->{'subsection_order'}}];
637}
638
639sub delete_section {
640 my $self = shift (@_);
641 my ($section) = @_;
642
643# my $section_ptr = {'subsection_order'=>[],
644# 'next_subsection'=>1,
645# 'subsections'=>{},
646# 'metadata'=>[],
647# 'text'=>""};
648
649 # if this is the top section reset everything
650 if ($section eq "") {
651 $self->{'subsection_order'} = [];
652 $self->{'subsections'} = {};
653 $self->{'metadata'} = [];
654 $self->{'text'} = "";
655 return;
656 }
657
658 # find the parent of the section to delete
659 my $parent_section = "";
660 my $child = "0";
661 my @section = split (/\./, $section);
662 if (scalar(@section) > 0) {
663 $child = pop (@section);
664 $parent_section = join (".", @section);
665 }
666
667 my $parent_section_ptr = $self->_lookup_section($parent_section);
668 if (!defined $parent_section_ptr) {
669 print STDERR "doc::delete_section couldn't find parent section " .
670 "$parent_section\n";
671 return;
672 }
673
674 # remove this section from the subsection_order list
675 my $i = 0;
676 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
677 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
678 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
679 last;
680 }
681 $i++;
682 }
683
684 # remove this section from the subsection hash
685 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
686 undef $parent_section_ptr->{'subsections'}->{$child};
687 }
688}
689
690#--
691# methods for dealing with metadata
692
693# set_metadata_element and get_metadata_element are for metadata
694# which should only have one value. add_meta_data and get_metadata
695# are for metadata which can have more than one value.
696
697# returns the first metadata value which matches field
698
699# This version of get metadata element works much like the one above,
700# except it allows for the namespace portion of a metadata element to
701# be ignored, thus if you are searching for dc.Title, the first piece
702# of matching metadata ending with the name Title (once any namespace
703# is removed) would be returned.
704# 28-11-2003 John Thompson
705sub get_metadata_element {
706 my $self = shift (@_);
707 my ($section, $field, $ignore_namespace) = @_;
708 my ($data);
709
710 $ignore_namespace = 0 unless defined $ignore_namespace;
711
712 my $section_ptr = $self->_lookup_section($section);
713 if (!defined $section_ptr) {
714 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
715 return;
716 }
717
718 # Remove the namespace if we are being told to ignore them
719 if($ignore_namespace) {
720 $field =~ s/^\w*\.//;
721 }
722
723 foreach $data (@{$section_ptr->{'metadata'}}) {
724
725 my $data_name = $data->[0];
726
727 # Remove the any namespace if we are being told to ignore them
728 if($ignore_namespace) {
729 $data_name =~ s/^\w*\.//;
730 }
731 $data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
732 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
733 }
734
735 return undef; # was not found
736}
737
738# returns a list of the form [value1, value2, ...]
739sub get_metadata {
740 my $self = shift (@_);
741 my ($section, $field, $ignore_namespace) = @_;
742 my ($data);
743
744 $ignore_namespace = 0 unless defined $ignore_namespace;
745
746 my $section_ptr = $self->_lookup_section($section);
747 if (!defined $section_ptr) {
748 print STDERR "doc::get_metadata couldn't find section ",
749 $section, "\n";
750 return;
751 }
752
753 # Remove the any namespace if we are being told to ignore them
754 if($ignore_namespace) {
755 $field =~ s/^\w*\.//;
756 }
757
758 my @metadata = ();
759 foreach $data (@{$section_ptr->{'metadata'}}) {
760
761 my $data_name = $data->[0];
762 # Remove the any namespace if we are being told to ignore them
763 if($ignore_namespace) {
764 $data_name =~ s/^\w*\.//;
765 }
766 $data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
767 push (@metadata, $data->[1]) if ($data_name eq $field);
768 }
769
770 return \@metadata;
771}
772
773# returns a list of the form [[field,value],[field,value],...]
774sub get_all_metadata {
775 my $self = shift (@_);
776 my ($section) = @_;
777
778 my $section_ptr = $self->_lookup_section($section);
779 if (!defined $section_ptr) {
780 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
781 return;
782 }
783
784 return $section_ptr->{'metadata'};
785}
786
787# $value is optional
788sub delete_metadata {
789 my $self = shift (@_);
790 my ($section, $field, $value) = @_;
791
792 my $section_ptr = $self->_lookup_section($section);
793 if (!defined $section_ptr) {
794 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
795 return;
796 }
797
798 my $i = 0;
799 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
800 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
801 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
802 splice (@{$section_ptr->{'metadata'}}, $i, 1);
803 } else {
804 $i++;
805 }
806 }
807}
808
809sub delete_all_metadata {
810 my $self = shift (@_);
811 my ($section) = @_;
812
813 my $section_ptr = $self->_lookup_section($section);
814 if (!defined $section_ptr) {
815 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
816 return;
817 }
818
819 $section_ptr->{'metadata'} = [];
820}
821
822sub set_metadata_element {
823 my $self = shift (@_);
824 my ($section, $field, $value) = @_;
825
826 $self->set_utf8_metadata_element ($section, $field,
827 &unicode::ascii2utf8(\$value));
828}
829
830# set_utf8_metadata_element assumes the text has already been
831# converted to the UTF-8 encoding.
832sub set_utf8_metadata_element {
833 my $self = shift (@_);
834 my ($section, $field, $value) = @_;
835
836 $self->delete_metadata ($section, $field);
837 $self->add_utf8_metadata ($section, $field, $value);
838}
839
840
841# add_metadata assumes the text is in (extended) ascii form. For
842# text which has already been converted to the UTF-8 format use
843# add_utf8_metadata.
844sub add_metadata {
845 my $self = shift (@_);
846 my ($section, $field, $value) = @_;
847
848 $self->add_utf8_metadata ($section, $field,
849 &unicode::ascii2utf8(\$value));
850}
851
852sub add_utf8_metadata {
853 my $self = shift (@_);
854 my ($section, $field, $value) = @_;
855
856 my $section_ptr = $self->_lookup_section($section);
857 if (!defined $section_ptr) {
858 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
859 return;
860 }
861 if (!defined $value) {
862 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
863 return;
864 }
865 if (!defined $field) {
866 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
867 return;
868 }
869
870 #print STDERR "###$field=$value\n";
871 # double check that the value is utf-8
872 if (!&unicode::check_is_utf8($value)) {
873 print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
874 &unicode::ensure_utf8(\$value);
875 print STDERR " Tried converting to utf8: $value\n";
876 }
877
878 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
879}
880
881
882# methods for dealing with text
883
884# returns the text for a section
885sub get_text {
886 my $self = shift (@_);
887 my ($section) = @_;
888
889 my $section_ptr = $self->_lookup_section($section);
890 if (!defined $section_ptr) {
891 print STDERR "doc::get_text couldn't find section " .
892 "$section\n";
893 return "";
894 }
895
896 return $section_ptr->{'text'};
897}
898
899# returns the (utf-8 encoded) length of the text for a section
900sub get_text_length {
901 my $self = shift (@_);
902 my ($section) = @_;
903
904 my $section_ptr = $self->_lookup_section($section);
905 if (!defined $section_ptr) {
906 print STDERR "doc::get_text_length couldn't find section " .
907 "$section\n";
908 return 0;
909 }
910
911 return length ($section_ptr->{'text'});
912}
913
914sub delete_text {
915 my $self = shift (@_);
916 my ($section) = @_;
917
918 my $section_ptr = $self->_lookup_section($section);
919 if (!defined $section_ptr) {
920 print STDERR "doc::delete_text couldn't find section " .
921 "$section\n";
922 return;
923 }
924
925 $section_ptr->{'text'} = "";
926}
927
928# add_text assumes the text is in (extended) ascii form. For
929# text which has been already converted to the UTF-8 format
930# use add_utf8_text.
931sub add_text {
932 my $self = shift (@_);
933 my ($section, $text) = @_;
934
935 # convert the text to UTF-8 encoded unicode characters
936 # and add the text
937 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
938}
939
940
941# add_utf8_text assumes the text to be added has already
942# been converted to the UTF-8 encoding. For ascii text use
943# add_text
944sub add_utf8_text {
945 my $self = shift (@_);
946 my ($section, $text) = @_;
947
948 my $section_ptr = $self->_lookup_section($section);
949 if (!defined $section_ptr) {
950 print STDERR "doc::add_utf8_text couldn't find section " .
951 "$section\n";
952 return;
953 }
954
955 $section_ptr->{'text'} .= $text;
956}
957
958# returns the Source meta, which is the utf8 filename generated.
959# Added a separate method here for convenience
960sub get_source {
961 my $self = shift (@_);
962 return $self->get_metadata_element ($self->get_top_section(), "Source");
963}
964
965# returns the SourceFile meta, which is the url reference to the URL-encoded
966# version of Source (the utf8 filename). Added a separate method here for convenience
967sub get_sourcefile {
968 my $self = shift (@_);
969 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
970}
971
972# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
973# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
974sub get_assocfile_from_sourcefile {
975 my $self = shift (@_);
976
977 # get the SourceFile meta, which is a *URL* to a file on the filesystem
978 my $top_section = $self->get_top_section();
979 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
980
981 # get the actual filename as it exists on the filesystem which this url refers to
982 $source_file = &unicode::url_to_filename($source_file);
983 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
984 return $assocfilename;
985}
986
987# methods for dealing with associated files
988
989# a file is associated with a document, NOT a section.
990# if section is defined it is noted in the data structure
991# only so that files associated from a particular section
992# may be removed later (using delete_section_assoc_files)
993sub associate_file {
994 my $self = shift (@_);
995 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
996 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
997
998 # remove all associated files with the same name
999 $self->delete_assoc_file ($assoc_filename);
1000
1001 push (@{$self->{'associated_files'}},
1002 [$real_filename, $assoc_filename, $mime_type, $section]);
1003}
1004
1005# returns a list of associated files in the form
1006# [[real_filename, assoc_filename, mimetype], ...]
1007sub get_assoc_files {
1008 my $self = shift (@_);
1009
1010 return $self->{'associated_files'};
1011}
1012
1013# the following two methods used to keep track of original associated files
1014# for incremental building. eg a txt file used by an item file does not end
1015# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1016# file for incremental build
1017sub associate_source_file {
1018 my $self = shift (@_);
1019 my ($full_filename) = @_;
1020
1021 push (@{$self->{'source_assoc_files'}}, $full_filename);
1022
1023}
1024
1025sub get_source_assoc_files {
1026 my $self = shift (@_);
1027
1028 return $self->{'source_assoc_files'};
1029
1030
1031}
1032sub metadata_file {
1033 my $self = shift (@_);
1034 my ($real_filename, $filename) = @_;
1035
1036 push (@{$self->{'metadata_files'}},
1037 [$real_filename, $filename]);
1038}
1039
1040# used for writing out the archiveinf-doc info database, to list all the metadata files
1041sub get_meta_files {
1042 my $self = shift (@_);
1043
1044 return $self->{'metadata_files'};
1045}
1046
1047sub delete_section_assoc_files {
1048 my $self = shift (@_);
1049 my ($section) = @_;
1050
1051 my $i=0;
1052 while ($i < scalar (@{$self->{'associated_files'}})) {
1053 if (defined $self->{'associated_files'}->[$i]->[3] &&
1054 $self->{'associated_files'}->[$i]->[3] eq $section) {
1055 splice (@{$self->{'associated_files'}}, $i, 1);
1056 } else {
1057 $i++;
1058 }
1059 }
1060}
1061
1062sub delete_assoc_file {
1063 my $self = shift (@_);
1064 my ($assoc_filename) = @_;
1065
1066 my $i=0;
1067 while ($i < scalar (@{$self->{'associated_files'}})) {
1068 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1069 splice (@{$self->{'associated_files'}}, $i, 1);
1070 } else {
1071 $i++;
1072 }
1073 }
1074}
1075
1076sub reset_nextsection_ptr {
1077 my $self = shift (@_);
1078 my ($section) = @_;
1079
1080 my $section_ptr = $self->_lookup_section($section);
1081 $section_ptr->{'next_subsection'} = 1;
1082}
1083
10841;
Note: See TracBrowser for help on using the repository browser.