source: main/trunk/greenstone2/perllib/doc.pm@ 22950

Last change on this file since 22950 was 22950, checked in by davidb, 14 years ago

Old routine used to work on raw binary strings that just happened to be UTF-8 compliant. Now strings are (in the Perl sense) Unicode aware, we *don't* want to check for them being valid utf8. In fact, things can start to go wrong if we do

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 30.1 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47our $OIDcount = 0;
48
49# rename_method can be 'url', 'none', 'base64'
50sub new {
51 my $class = shift (@_);
52 my ($source_filename, $doc_type, $rename_method) = @_;
53
54
55 my $self = bless {'associated_files'=>[],
56 'subsection_order'=>[],
57 'next_subsection'=>1,
58 'subsections'=>{},
59 'metadata'=>[],
60 'text'=>"",
61 'OIDtype'=>"hash"}, $class;
62
63 # used to set lastmodified here, but this can screw up the HASH ids, so
64 # the docsave processor now calls set_lastmodified
65
66 $self->{'source_path'} = $source_filename;
67
68 if (defined $source_filename) {
69 $source_filename = &util::filename_within_collection($source_filename);
70 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
71 $self->set_source_filename ($source_filename, $rename_method);
72 }
73
74 $self->set_doc_type ($doc_type) if defined $doc_type;
75
76 return $self;
77}
78# set lastmodified for OAI purposes, added by GRB, moved by kjdon
79sub set_lastmodified {
80 my $self = shift (@_);
81
82 my $source_path = $self->{'source_path'};
83
84 if (defined $source_path && (-e $source_path)) {
85
86 my $file_stat = stat($source_path);
87 my $mtime = $file_stat->mtime;
88 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
89 $wday, $yday, $isdst) = localtime($mtime);
90
91 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
92
93 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
94 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
95 }
96}
97
98# clone the $self object
99sub duplicate {
100 my $self = shift (@_);
101
102 my $newobj = {};
103
104 foreach my $k (keys %$self) {
105 $newobj->{$k} = &clone ($self->{$k});
106 }
107
108 bless $newobj, ref($self);
109 return $newobj;
110}
111
112sub clone {
113 my ($from) = @_;
114 my $type = ref ($from);
115
116 if ($type eq "HASH") {
117 my $to = {};
118 foreach my $key (keys %$from) {
119 $to->{$key} = &clone ($from->{$key});
120 }
121 return $to;
122 } elsif ($type eq "ARRAY") {
123 my $to = [];
124 foreach my $v (@$from) {
125 push (@$to, &clone ($v));
126 }
127 return $to;
128 } else {
129 return $from;
130 }
131}
132
133sub set_OIDtype {
134 my $self = shift (@_);
135 my ($type, $metadata) = @_;
136
137 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|dirname|assigned)$/) {
138 $self->{'OIDtype'} = $type;
139 } else {
140 $self->{'OIDtype'} = "hash";
141 }
142
143 if ($type =~ /^assigned$/) {
144 if (defined $metadata) {
145 $self->{'OIDmetadata'} = $metadata;
146 } else {
147 $self->{'OIDmetadata'} = "dc.Identifier";
148 }
149 }
150}
151
152# rename_method can be 'url', 'none', 'base64'
153sub set_source_filename {
154 my $self = shift (@_);
155 my ($source_filename, $rename_method) = @_;
156
157 # Since the gsdlsourcefilename element goes into the doc.xml it has
158 # to be utf8. However, it should also *represent* the source filename
159 # (in the import directory) which may not be utf8 at all.
160 # For instance, if this meta element (gsdlsourcefilename) will be used
161 # by other applications that parse doc.xml in order to locate
162 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
163 # encode the real filename as this is a binary-to-text encoding meaning
164 # that the resulting string is ASCII (utf8). Decoding will give the original.
165
166# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
167
168 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
169 my ($srcfilename,$dirname,$suffix)
170 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
171# print STDERR "-> $srcfilename -> ";
172 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
173 $source_filename = &util::filename_cat($dirname, $srcfilename);
174# print STDERR "$source_filename\n";
175
176 $self->set_utf8_metadata_element ($self->get_top_section(),
177 "gsdlsourcefilename",
178 $source_filename);
179}
180
181sub set_converted_filename {
182 my $self = shift (@_);
183 my ($converted_filename) = @_;
184
185 # we know the converted filename is utf8
186 $self->set_utf8_metadata_element ($self->get_top_section(),
187 "gsdlconvertedfilename",
188 $converted_filename);
189}
190
191# returns the source_filename as it was provided
192sub get_unmodified_source_filename {
193 my $self = shift (@_);
194
195 return $self->{'source_path'};
196}
197
198# returns the source_filename with whatever rename_method was given
199sub get_source_filename {
200 my $self = shift (@_);
201
202 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
203}
204
205
206
207# returns converted filename if available else returns source filename
208sub get_filename_for_hashing {
209 my $self = shift (@_);
210
211 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
212
213 if (!defined $filename) {
214 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
215 # if NULPlug processed file, then don't give a filename
216 if (defined $plugin_name && $plugin_name eq "NULPlug") {
217 $filename = undef;
218 } else { # returns the URL encoded source filename!
219 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
220 }
221 }
222 return $filename;
223}
224
225sub set_doc_type {
226 my $self = shift (@_);
227 my ($doc_type) = @_;
228
229 $self->set_metadata_element ($self->get_top_section(),
230 "gsdldoctype",
231 $doc_type);
232}
233
234# returns the gsdldoctype as it was provided
235# the default of "indexed_doc" is used if no document
236# type was provided
237sub get_doc_type {
238 my $self = shift (@_);
239
240 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
241 return $doc_type if (defined $doc_type);
242 return "indexed_doc";
243}
244
245
246# look up the reference to the a particular section
247sub _lookup_section {
248 my $self = shift (@_);
249 my ($section) = @_;
250
251 my ($num);
252 my $sectionref = $self;
253
254 while (defined $section && $section ne "") {
255
256 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
257
258 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
259
260 $section = "" unless defined $section;
261
262
263 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
264 $sectionref = $sectionref->{'subsections'}->{$num};
265 } else {
266 return undef;
267 }
268 }
269
270 return $sectionref;
271}
272
273# calculate OID by hashing the contents of the document
274sub _calc_OID {
275 my $self = shift (@_);
276 my ($filename) = @_;
277
278
279 my $osexe = &util::get_os_exe();
280
281 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
282 $ENV{'GSDLOS'},"hashfile$osexe");
283
284 my $result = "NULL";
285
286
287 if (-e "$hashfile_exe") {
288# $result = `\"$hashfile_exe\" \"$filename\"`;
289# $result = `hashfile$osexe \"$filename\" 2>&1`;
290 $result = `hashfile$osexe \"$filename\"`;
291
292 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
293 } else {
294 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
295 }
296 return "HASH$result";
297}
298
299# methods dealing with OID, not groups of them.
300
301# if $OID is not provided one is calculated
302sub set_OID {
303 my $self = shift (@_);
304 my ($OID) = @_;
305
306 my $use_hash_oid = 0;
307 # if an OID wasn't provided calculate one
308 if (!defined $OID) {
309 $OID = "NULL";
310 if ($self->{'OIDtype'} =~ /^hash/) {
311 $use_hash_oid = 1;
312 } elsif ($self->{'OIDtype'} eq "incremental") {
313 $OID = "D" . $OIDcount;
314 $OIDcount ++;
315
316 } elsif ($self->{'OIDtype'} eq "dirname") {
317 $OID = 'J';
318 my $filename = $self->get_source_filename();
319 if (defined($filename)) { # && -e $filename) {
320 $OID = &File::Basename::dirname($filename);
321 if (defined $OID) {
322 $OID = 'J'.&File::Basename::basename($OID);
323 $OID = &util::tidy_up_oid($OID);
324 } else {
325 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
326 $use_hash_oid = 1;
327 }
328 } else {
329 print STDERR "Failed to find filename, generating hash id\n";
330 $use_hash_oid = 1;
331 }
332
333 } elsif ($self->{'OIDtype'} eq "assigned") {
334 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
335 if (defined $identifier && $identifier ne "") {
336 $OID = $identifier;
337 $OID = &util::tidy_up_oid($OID);
338 } else {
339 # need a hash id
340 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
341 $use_hash_oid = 1;
342 }
343
344 } else {
345 $use_hash_oid = 1;
346 }
347
348 if ($use_hash_oid) {
349 my $hash_on_file = 1;
350 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
351 $hash_on_file = 0;
352 }
353 if ($hash_on_file) {
354 # "hash" OID - feed file to hashfile.exe
355 my $filename = $self->get_filename_for_hashing();
356
357 # -z: don't want to hash on the file if it is zero size
358 if (defined($filename) && -e $filename && !-z $filename) {
359 $OID = $self->_calc_OID ($filename);
360 } else {
361 $hash_on_file = 0;
362 }
363 }
364 if (!$hash_on_file) {
365 my $filename = &util::get_tmp_filename();
366 if (!open (OUTFILE, ">:utf8", $filename)) {
367 print STDERR "doc::set_OID could not write to $filename\n";
368 } else {
369 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
370 print OUTFILE $doc_text;
371 close (OUTFILE);
372 }
373 $OID = $self->_calc_OID ($filename);
374 &util::rm ($filename);
375 }
376 }
377 }
378 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
379}
380
381# this uses hashdoc (embedded c thingy) which is faster but still
382# needs a little work to be suffiently stable
383sub ___set_OID {
384 my $self = shift (@_);
385 my ($OID) = @_;
386
387 # if an OID wasn't provided then calculate hash value based on document
388 if (!defined $OID)
389 {
390 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
391 my $hash_len = length($hash_text);
392
393 $OID = &hashdoc::buffer($hash_text,$hash_len);
394 }
395
396 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
397}
398
399# returns the OID for this document
400sub get_OID {
401 my $self = shift (@_);
402 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
403 return $OID if (defined $OID);
404 return "NULL";
405}
406
407sub delete_OID {
408 my $self = shift (@_);
409
410 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
411}
412
413
414# methods for manipulating section names
415
416# returns the name of the top-most section (the top
417# level of the document
418sub get_top_section {
419 my $self = shift (@_);
420
421 return "";
422}
423
424# returns a section
425sub get_parent_section {
426 my $self = shift (@_);
427 my ($section) = @_;
428
429 $section =~ s/(^|\.)\d+$//;
430
431 return $section;
432}
433
434# returns the first child section (or the end child
435# if there isn't any)
436sub get_begin_child {
437 my $self = shift (@_);
438 my ($section) = @_;
439
440 my $section_ptr = $self->_lookup_section($section);
441 return "" unless defined $section_ptr;
442
443 if (defined $section_ptr->{'subsection_order'}->[0]) {
444 return "$section.$section_ptr->{'subsection_order'}->[0]";
445 }
446
447 return $self->get_end_child ($section);
448}
449
450# returns the next child of a parent section
451sub get_next_child {
452 my $self = shift (@_);
453 my ($section) = @_;
454
455 my $parent_section = $self->get_parent_section($section);
456 my $parent_section_ptr = $self->_lookup_section($parent_section);
457 return undef unless defined $parent_section_ptr;
458
459 my ($section_num) = $section =~ /(\d+)$/;
460 return undef unless defined $section_num;
461
462 my $i = 0;
463 my $section_order = $parent_section_ptr->{'subsection_order'};
464 while ($i < scalar(@$section_order)) {
465 last if $section_order->[$i] eq $section_num;
466 $i++;
467 }
468
469 $i++; # the next child
470 if ($i < scalar(@$section_order)) {
471 return $section_order->[$i] if $parent_section eq "";
472 return "$parent_section.$section_order->[$i]";
473 }
474
475 # no more sections in this level
476 return undef;
477}
478
479# returns a reference to a list of children
480sub get_children {
481 my $self = shift (@_);
482 my ($section) = @_;
483
484 my $section_ptr = $self->_lookup_section($section);
485 return [] unless defined $section_ptr;
486
487 my @children = @{$section_ptr->{'subsection_order'}};
488
489 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
490 return \@children;
491}
492
493# returns the child section one past the last one (which
494# is coded as "0")
495sub get_end_child {
496 my $self = shift (@_);
497 my ($section) = @_;
498
499 return $section . ".0" unless $section eq "";
500 return "0";
501}
502
503# returns the next section in book order
504sub get_next_section {
505 my $self = shift (@_);
506 my ($section) = @_;
507
508 return undef unless defined $section;
509
510 my $section_ptr = $self->_lookup_section($section);
511 return undef unless defined $section_ptr;
512
513 # first try to find first child
514 if (defined $section_ptr->{'subsection_order'}->[0]) {
515 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
516 return "$section.$section_ptr->{'subsection_order'}->[0]";
517 }
518
519 do {
520 # try to find sibling
521 my $next_child = $self->get_next_child ($section);
522 return $next_child if (defined $next_child);
523
524 # move up one level
525 $section = $self->get_parent_section ($section);
526 } while $section =~ /\d/;
527
528 return undef;
529}
530
531sub is_leaf_section {
532 my $self = shift (@_);
533 my ($section) = @_;
534
535 my $section_ptr = $self->_lookup_section($section);
536 return 1 unless defined $section_ptr;
537
538 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
539}
540
541# methods for dealing with sections
542
543# returns the name of the inserted section
544sub insert_section {
545 my $self = shift (@_);
546 my ($before_section) = @_;
547
548 # get the child to insert before and its parent section
549 my $parent_section = "";
550 my $before_child = "0";
551 my @before_section = split (/\./, $before_section);
552 if (scalar(@before_section) > 0) {
553 $before_child = pop (@before_section);
554 $parent_section = join (".", @before_section);
555 }
556
557 my $parent_section_ptr = $self->_lookup_section($parent_section);
558 if (!defined $parent_section_ptr) {
559 print STDERR "doc::insert_section couldn't find parent section " .
560 "$parent_section\n";
561 return;
562 }
563
564 # get the next section number
565 my $section_num = $parent_section_ptr->{'next_subsection'}++;
566
567 my $i = 0;
568 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
569 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
570 $i++;
571 }
572
573 # insert the section number into the order list
574 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
575
576 # add this section to the parent section
577 my $section_ptr = {'subsection_order'=>[],
578 'next_subsection'=>1,
579 'subsections'=>{},
580 'metadata'=>[],
581 'text'=>""};
582 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
583
584 # work out the full section number
585 my $section = $parent_section;
586 $section .= "." unless $section eq "";
587 $section .= $section_num;
588
589 return $section;
590}
591
592# creates a pre-named section
593sub create_named_section {
594 my $self = shift (@_);
595 my ($mastersection) = @_;
596
597 my ($num);
598 my $section = $mastersection;
599 my $sectionref = $self;
600
601 while ($section ne "") {
602 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
603 $num =~ s/^0+(\d)/$1/; # remove leading 0s
604 $section = "" unless defined $section;
605
606 if (defined $num) {
607 if (!defined $sectionref->{'subsections'}->{$num}) {
608 push (@{$sectionref->{'subsection_order'}}, $num);
609 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
610 'next_subsection'=>1,
611 'subsections'=>{},
612 'metadata'=>[],
613 'text'=>""};
614 if ($num >= $sectionref->{'next_subsection'}) {
615 $sectionref->{'next_subsection'} = $num + 1;
616 }
617 }
618 $sectionref = $sectionref->{'subsections'}->{$num};
619
620 } else {
621 print STDERR "doc::create_named_section couldn't create section ";
622 print STDERR "$mastersection\n";
623 last;
624 }
625 }
626}
627
628# returns a reference to a list of subsections
629sub list_subsections {
630 my $self = shift (@_);
631 my ($section) = @_;
632
633 my $section_ptr = $self->_lookup_section ($section);
634 if (!defined $section_ptr) {
635 print STDERR "doc::list_subsections couldn't find section $section\n";
636 return [];
637 }
638
639 return [@{$section_ptr->{'subsection_order'}}];
640}
641
642sub delete_section {
643 my $self = shift (@_);
644 my ($section) = @_;
645
646# my $section_ptr = {'subsection_order'=>[],
647# 'next_subsection'=>1,
648# 'subsections'=>{},
649# 'metadata'=>[],
650# 'text'=>""};
651
652 # if this is the top section reset everything
653 if ($section eq "") {
654 $self->{'subsection_order'} = [];
655 $self->{'subsections'} = {};
656 $self->{'metadata'} = [];
657 $self->{'text'} = "";
658 return;
659 }
660
661 # find the parent of the section to delete
662 my $parent_section = "";
663 my $child = "0";
664 my @section = split (/\./, $section);
665 if (scalar(@section) > 0) {
666 $child = pop (@section);
667 $parent_section = join (".", @section);
668 }
669
670 my $parent_section_ptr = $self->_lookup_section($parent_section);
671 if (!defined $parent_section_ptr) {
672 print STDERR "doc::delete_section couldn't find parent section " .
673 "$parent_section\n";
674 return;
675 }
676
677 # remove this section from the subsection_order list
678 my $i = 0;
679 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
680 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
681 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
682 last;
683 }
684 $i++;
685 }
686
687 # remove this section from the subsection hash
688 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
689 undef $parent_section_ptr->{'subsections'}->{$child};
690 }
691}
692
693#--
694# methods for dealing with metadata
695
696# set_metadata_element and get_metadata_element are for metadata
697# which should only have one value. add_meta_data and get_metadata
698# are for metadata which can have more than one value.
699
700# returns the first metadata value which matches field
701
702# This version of get metadata element works much like the one above,
703# except it allows for the namespace portion of a metadata element to
704# be ignored, thus if you are searching for dc.Title, the first piece
705# of matching metadata ending with the name Title (once any namespace
706# is removed) would be returned.
707# 28-11-2003 John Thompson
708sub get_metadata_element {
709 my $self = shift (@_);
710 my ($section, $field, $ignore_namespace) = @_;
711 my ($data);
712
713 $ignore_namespace = 0 unless defined $ignore_namespace;
714
715 my $section_ptr = $self->_lookup_section($section);
716 if (!defined $section_ptr) {
717 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
718 return;
719 }
720
721 # Remove the namespace if we are being told to ignore them
722 if($ignore_namespace) {
723 $field =~ s/^\w*\.//;
724 }
725
726 foreach $data (@{$section_ptr->{'metadata'}}) {
727
728 my $data_name = $data->[0];
729
730 # Remove the any namespace if we are being told to ignore them
731 if($ignore_namespace) {
732 $data_name =~ s/^\w*\.//;
733 }
734 $data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
735 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
736 }
737
738 return undef; # was not found
739}
740
741# returns a list of the form [value1, value2, ...]
742sub get_metadata {
743 my $self = shift (@_);
744 my ($section, $field, $ignore_namespace) = @_;
745 my ($data);
746
747 $ignore_namespace = 0 unless defined $ignore_namespace;
748
749 my $section_ptr = $self->_lookup_section($section);
750 if (!defined $section_ptr) {
751 print STDERR "doc::get_metadata couldn't find section ",
752 $section, "\n";
753 return;
754 }
755
756 # Remove the any namespace if we are being told to ignore them
757 if($ignore_namespace) {
758 $field =~ s/^\w*\.//;
759 }
760
761 my @metadata = ();
762 foreach $data (@{$section_ptr->{'metadata'}}) {
763
764 my $data_name = $data->[0];
765 # Remove the any namespace if we are being told to ignore them
766 if($ignore_namespace) {
767 $data_name =~ s/^\w*\.//;
768 }
769 $data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
770 push (@metadata, $data->[1]) if ($data_name eq $field);
771 }
772
773 return \@metadata;
774}
775
776# returns a list of the form [[field,value],[field,value],...]
777sub get_all_metadata {
778 my $self = shift (@_);
779 my ($section) = @_;
780
781 my $section_ptr = $self->_lookup_section($section);
782 if (!defined $section_ptr) {
783 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
784 return;
785 }
786
787 return $section_ptr->{'metadata'};
788}
789
790# $value is optional
791sub delete_metadata {
792 my $self = shift (@_);
793 my ($section, $field, $value) = @_;
794
795 my $section_ptr = $self->_lookup_section($section);
796 if (!defined $section_ptr) {
797 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
798 return;
799 }
800
801 my $i = 0;
802 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
803 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
804 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
805 splice (@{$section_ptr->{'metadata'}}, $i, 1);
806 } else {
807 $i++;
808 }
809 }
810}
811
812sub delete_all_metadata {
813 my $self = shift (@_);
814 my ($section) = @_;
815
816 my $section_ptr = $self->_lookup_section($section);
817 if (!defined $section_ptr) {
818 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
819 return;
820 }
821
822 $section_ptr->{'metadata'} = [];
823}
824
825sub set_metadata_element {
826 my $self = shift (@_);
827 my ($section, $field, $value) = @_;
828
829 $self->set_utf8_metadata_element ($section, $field,
830 &unicode::ascii2utf8(\$value));
831}
832
833# set_utf8_metadata_element assumes the text has already been
834# converted to the UTF-8 encoding.
835sub set_utf8_metadata_element {
836 my $self = shift (@_);
837 my ($section, $field, $value) = @_;
838
839 $self->delete_metadata ($section, $field);
840 $self->add_utf8_metadata ($section, $field, $value);
841}
842
843
844# add_metadata assumes the text is in (extended) ascii form. For
845# text which has already been converted to the UTF-8 format use
846# add_utf8_metadata.
847sub add_metadata {
848 my $self = shift (@_);
849 my ($section, $field, $value) = @_;
850
851 $self->add_utf8_metadata ($section, $field,
852 &unicode::ascii2utf8(\$value));
853}
854
855sub add_utf8_metadata {
856 my $self = shift (@_);
857 my ($section, $field, $value) = @_;
858
859# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
860# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
861# print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
862
863 my $section_ptr = $self->_lookup_section($section);
864 if (!defined $section_ptr) {
865 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
866 return;
867 }
868 if (!defined $value) {
869 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
870 return;
871 }
872 if (!defined $field) {
873 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
874 return;
875 }
876
877 #print STDERR "###$field=$value\n";
878
879 # For now, supress this check. Given that text data read in is now
880 # Unicode aware, then the following block of code can (ironically enough)
881 # cause our unicode compliant string to be re-encoded (leading to
882 # a double-encoded UTF-8 string, which we definitely don't want!).
883
884
885 # double check that the value is utf-8
886# if (!&unicode::check_is_utf8($value)) {
887# print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
888# &unicode::ensure_utf8(\$value);
889# print STDERR " Tried converting to utf8: $value\n";
890# }
891
892 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
893}
894
895
896# methods for dealing with text
897
898# returns the text for a section
899sub get_text {
900 my $self = shift (@_);
901 my ($section) = @_;
902
903 my $section_ptr = $self->_lookup_section($section);
904 if (!defined $section_ptr) {
905 print STDERR "doc::get_text couldn't find section " .
906 "$section\n";
907 return "";
908 }
909
910 return $section_ptr->{'text'};
911}
912
913# returns the (utf-8 encoded) length of the text for a section
914sub get_text_length {
915 my $self = shift (@_);
916 my ($section) = @_;
917
918 my $section_ptr = $self->_lookup_section($section);
919 if (!defined $section_ptr) {
920 print STDERR "doc::get_text_length couldn't find section " .
921 "$section\n";
922 return 0;
923 }
924
925 return length ($section_ptr->{'text'});
926}
927
928sub delete_text {
929 my $self = shift (@_);
930 my ($section) = @_;
931
932 my $section_ptr = $self->_lookup_section($section);
933 if (!defined $section_ptr) {
934 print STDERR "doc::delete_text couldn't find section " .
935 "$section\n";
936 return;
937 }
938
939 $section_ptr->{'text'} = "";
940}
941
942# add_text assumes the text is in (extended) ascii form. For
943# text which has been already converted to the UTF-8 format
944# use add_utf8_text.
945sub add_text {
946 my $self = shift (@_);
947 my ($section, $text) = @_;
948
949 # convert the text to UTF-8 encoded unicode characters
950 # and add the text
951 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
952}
953
954
955# add_utf8_text assumes the text to be added has already
956# been converted to the UTF-8 encoding. For ascii text use
957# add_text
958sub add_utf8_text {
959 my $self = shift (@_);
960 my ($section, $text) = @_;
961
962 my $section_ptr = $self->_lookup_section($section);
963 if (!defined $section_ptr) {
964 print STDERR "doc::add_utf8_text couldn't find section " .
965 "$section\n";
966 return;
967 }
968
969 $section_ptr->{'text'} .= $text;
970}
971
972# returns the Source meta, which is the utf8 filename generated.
973# Added a separate method here for convenience
974sub get_source {
975 my $self = shift (@_);
976 return $self->get_metadata_element ($self->get_top_section(), "Source");
977}
978
979# returns the SourceFile meta, which is the url reference to the URL-encoded
980# version of Source (the utf8 filename). Added a separate method here for convenience
981sub get_sourcefile {
982 my $self = shift (@_);
983 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
984}
985
986# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
987# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
988sub get_assocfile_from_sourcefile {
989 my $self = shift (@_);
990
991 # get the SourceFile meta, which is a *URL* to a file on the filesystem
992 my $top_section = $self->get_top_section();
993 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
994
995 # get the actual filename as it exists on the filesystem which this url refers to
996 $source_file = &unicode::url_to_filename($source_file);
997 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
998 return $assocfilename;
999}
1000
1001# methods for dealing with associated files
1002
1003# a file is associated with a document, NOT a section.
1004# if section is defined it is noted in the data structure
1005# only so that files associated from a particular section
1006# may be removed later (using delete_section_assoc_files)
1007sub associate_file {
1008 my $self = shift (@_);
1009 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1010 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1011
1012 # remove all associated files with the same name
1013 $self->delete_assoc_file ($assoc_filename);
1014
1015 push (@{$self->{'associated_files'}},
1016 [$real_filename, $assoc_filename, $mime_type, $section]);
1017}
1018
1019# returns a list of associated files in the form
1020# [[real_filename, assoc_filename, mimetype], ...]
1021sub get_assoc_files {
1022 my $self = shift (@_);
1023
1024 return $self->{'associated_files'};
1025}
1026
1027# the following two methods used to keep track of original associated files
1028# for incremental building. eg a txt file used by an item file does not end
1029# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1030# file for incremental build
1031sub associate_source_file {
1032 my $self = shift (@_);
1033 my ($full_filename) = @_;
1034
1035 push (@{$self->{'source_assoc_files'}}, $full_filename);
1036
1037}
1038
1039sub get_source_assoc_files {
1040 my $self = shift (@_);
1041
1042 return $self->{'source_assoc_files'};
1043
1044
1045}
1046sub metadata_file {
1047 my $self = shift (@_);
1048 my ($real_filename, $filename) = @_;
1049
1050 push (@{$self->{'metadata_files'}},
1051 [$real_filename, $filename]);
1052}
1053
1054# used for writing out the archiveinf-doc info database, to list all the metadata files
1055sub get_meta_files {
1056 my $self = shift (@_);
1057
1058 return $self->{'metadata_files'};
1059}
1060
1061sub delete_section_assoc_files {
1062 my $self = shift (@_);
1063 my ($section) = @_;
1064
1065 my $i=0;
1066 while ($i < scalar (@{$self->{'associated_files'}})) {
1067 if (defined $self->{'associated_files'}->[$i]->[3] &&
1068 $self->{'associated_files'}->[$i]->[3] eq $section) {
1069 splice (@{$self->{'associated_files'}}, $i, 1);
1070 } else {
1071 $i++;
1072 }
1073 }
1074}
1075
1076sub delete_assoc_file {
1077 my $self = shift (@_);
1078 my ($assoc_filename) = @_;
1079
1080 my $i=0;
1081 while ($i < scalar (@{$self->{'associated_files'}})) {
1082 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1083 splice (@{$self->{'associated_files'}}, $i, 1);
1084 } else {
1085 $i++;
1086 }
1087 }
1088}
1089
1090sub reset_nextsection_ptr {
1091 my $self = shift (@_);
1092 my ($section) = @_;
1093
1094 my $section_ptr = $self->_lookup_section($section);
1095 $section_ptr->{'next_subsection'} = 1;
1096}
1097
10981;
Note: See TracBrowser for help on using the repository browser.