source: main/trunk/greenstone2/perllib/doc.pm@ 21207

Last change on this file since 21207 was 20775, checked in by kjdon, 15 years ago

we need to store a list of original extra associated files, for incremental building. this is different from the resulting associated files for doc.xml. It will be the list of all files in import that comprised the document. added assocaite_source_file and get_source_assoc_files methods to set and get these files

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 29.0 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47our $OIDcount = 0;
48
49# rename_method can be 'url', 'none', 'base64'
50sub new {
51 my $class = shift (@_);
52 my ($source_filename, $doc_type, $rename_method) = @_;
53
54
55 my $self = bless {'associated_files'=>[],
56 'subsection_order'=>[],
57 'next_subsection'=>1,
58 'subsections'=>{},
59 'metadata'=>[],
60 'text'=>"",
61 'OIDtype'=>"hash"}, $class;
62
63 # used to set lastmodified here, but this can screw up the HASH ids, so
64 # the docsave processor now calls set_lastmodified
65
66 $self->{'source_path'} = $source_filename;
67
68 if (defined $source_filename) {
69 $source_filename = &util::filename_within_collection($source_filename);
70 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
71 $self->set_source_filename ($source_filename, $rename_method);
72 }
73
74 $self->set_doc_type ($doc_type) if defined $doc_type;
75
76 return $self;
77}
78# set lastmodified for OAI purposes, added by GRB, moved by kjdon
79sub set_lastmodified {
80 my $self = shift (@_);
81
82 my $source_path = $self->{'source_path'};
83
84 if (defined $source_path && (-e $source_path)) {
85
86 my $file_stat = stat($source_path);
87 my $mtime = $file_stat->mtime;
88 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
89 }
90}
91
92# clone the $self object
93sub duplicate {
94 my $self = shift (@_);
95
96 my $newobj = {};
97
98 foreach my $k (keys %$self) {
99 $newobj->{$k} = &clone ($self->{$k});
100 }
101
102 bless $newobj, ref($self);
103 return $newobj;
104}
105
106sub clone {
107 my ($from) = @_;
108 my $type = ref ($from);
109
110 if ($type eq "HASH") {
111 my $to = {};
112 foreach my $key (keys %$from) {
113 $to->{$key} = &clone ($from->{$key});
114 }
115 return $to;
116 } elsif ($type eq "ARRAY") {
117 my $to = [];
118 foreach my $v (@$from) {
119 push (@$to, &clone ($v));
120 }
121 return $to;
122 } else {
123 return $from;
124 }
125}
126
127sub set_OIDtype {
128 my $self = shift (@_);
129 my ($type, $metadata) = @_;
130
131 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|dirname|assigned)$/) {
132 $self->{'OIDtype'} = $type;
133 } else {
134 $self->{'OIDtype'} = "hash";
135 }
136
137 if ($type =~ /^assigned$/) {
138 if (defined $metadata) {
139 $self->{'OIDmetadata'} = $metadata;
140 } else {
141 $self->{'OIDmetadata'} = "dc.Identifier";
142 }
143 }
144}
145
146# rename_method can be 'url', 'none', 'base64'
147sub set_source_filename {
148 my $self = shift (@_);
149 my ($source_filename, $rename_method) = @_;
150
151 # Since the gsdlsourcefilename element goes into the doc.xml it has
152 # to be utf8. However, it should also *represent* the source filename
153 # (in the import directory) which may not be utf8 at all.
154 # For instance, if this meta element (gsdlsourcefilename) will be used
155 # by other applications that parse doc.xml in order to locate
156 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
157 # encode the real filename as this is a binary-to-text encoding meaning
158 # that the resulting string is ASCII (utf8). Decoding will give the original.
159
160# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
161
162 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
163 my ($srcfilename,$dirname,$suffix)
164 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
165# print STDERR "-> $srcfilename -> ";
166 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
167 $source_filename = &util::filename_cat($dirname, $srcfilename);
168# print STDERR "$source_filename\n";
169
170 $self->set_utf8_metadata_element ($self->get_top_section(),
171 "gsdlsourcefilename",
172 $source_filename);
173}
174
175sub set_converted_filename {
176 my $self = shift (@_);
177 my ($converted_filename) = @_;
178
179 # we know the converted filename is utf8
180 $self->set_utf8_metadata_element ($self->get_top_section(),
181 "gsdlconvertedfilename",
182 $converted_filename);
183}
184
185# returns the source_filename as it was provided
186sub get_unmodified_source_filename {
187 my $self = shift (@_);
188
189 return $self->{'source_path'};
190}
191
192# returns the source_filename with whatever rename_method was given
193sub get_source_filename {
194 my $self = shift (@_);
195
196 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
197}
198
199
200
201# returns converted filename if available else returns source filename
202sub get_filename_for_hashing {
203 my $self = shift (@_);
204
205 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
206
207 if (!defined $filename) {
208 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
209 # if NULPlug processed file, then don't give a filename
210 if (defined $plugin_name && $plugin_name eq "NULPlug") {
211 $filename = undef;
212 } else { # returns the URL encoded source filename!
213 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
214 }
215 }
216 return $filename;
217}
218
219sub set_doc_type {
220 my $self = shift (@_);
221 my ($doc_type) = @_;
222
223 $self->set_metadata_element ($self->get_top_section(),
224 "gsdldoctype",
225 $doc_type);
226}
227
228# returns the gsdldoctype as it was provided
229# the default of "indexed_doc" is used if no document
230# type was provided
231sub get_doc_type {
232 my $self = shift (@_);
233
234 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
235 return $doc_type if (defined $doc_type);
236 return "indexed_doc";
237}
238
239
240# look up the reference to the a particular section
241sub _lookup_section {
242 my $self = shift (@_);
243 my ($section) = @_;
244
245 my ($num);
246 my $sectionref = $self;
247
248 while (defined $section && $section ne "") {
249
250 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
251
252 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
253
254 $section = "" unless defined $section;
255
256
257 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
258 $sectionref = $sectionref->{'subsections'}->{$num};
259 } else {
260 return undef;
261 }
262 }
263
264 return $sectionref;
265}
266
267# calculate OID by hashing the contents of the document
268sub _calc_OID {
269 my $self = shift (@_);
270 my ($filename) = @_;
271
272 my $osexe = &util::get_os_exe();
273
274 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
275 $ENV{'GSDLOS'},"hashfile$osexe");
276
277 my $result = "NULL";
278
279 if (-e "$hashfile_exe") {
280# $result = `\"$hashfile_exe\" \"$filename\"`;
281 $result = `hashfile$osexe \"$filename\"`;
282 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
283 } else {
284 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
285 }
286 return "HASH$result";
287}
288
289# methods dealing with OID, not groups of them.
290
291# if $OID is not provided one is calculated
292sub set_OID {
293 my $self = shift (@_);
294 my ($OID) = @_;
295
296 my $use_hash_oid = 0;
297 # if an OID wasn't provided calculate one
298 if (!defined $OID) {
299 $OID = "NULL";
300 if ($self->{'OIDtype'} =~ /^hash/) {
301 $use_hash_oid = 1;
302 } elsif ($self->{'OIDtype'} eq "incremental") {
303 $OID = "D" . $OIDcount;
304 $OIDcount ++;
305
306 } elsif ($self->{'OIDtype'} eq "dirname") {
307 $OID = 'J';
308 my $filename = $self->get_source_filename();
309 if (defined($filename)) { # && -e $filename) {
310 $OID = &File::Basename::dirname($filename);
311 if (defined $OID) {
312 $OID = 'J'.&File::Basename::basename($OID);
313 $OID = &util::tidy_up_oid($OID);
314 } else {
315 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
316 $use_hash_oid = 1;
317 }
318 } else {
319 print STDERR "Failed to find filename, generating hash id\n";
320 $use_hash_oid = 1;
321 }
322
323 } elsif ($self->{'OIDtype'} eq "assigned") {
324 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
325 if (defined $identifier && $identifier ne "") {
326 $OID = $identifier;
327 $OID = &util::tidy_up_oid($OID);
328 } else {
329 # need a hash id
330 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
331 $use_hash_oid = 1;
332 }
333
334 } else {
335 $use_hash_oid = 1;
336 }
337
338 if ($use_hash_oid) {
339 my $hash_on_file = 1;
340 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
341 $hash_on_file = 0;
342 }
343 if ($hash_on_file) {
344 # "hash" OID - feed file to hashfile.exe
345 my $filename = $self->get_filename_for_hashing();
346 # -z: don't want to hash on the file if it is zero size
347 if (defined($filename) && -e $filename && !-z $filename) {
348 $OID = $self->_calc_OID ($filename);
349 } else {
350 $hash_on_file = 0;
351 }
352 }
353 if (!$hash_on_file) {
354 my $filename = &util::get_tmp_filename();
355 if (!open (OUTFILE, ">$filename")) {
356 print STDERR "doc::set_OID could not write to $filename\n";
357 } else {
358 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
359 print OUTFILE $doc_text;
360 close (OUTFILE);
361 }
362 $OID = $self->_calc_OID ($filename);
363 &util::rm ($filename);
364 }
365 }
366 }
367 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
368}
369
370# this uses hashdoc (embedded c thingy) which is faster but still
371# needs a little work to be suffiently stable
372sub ___set_OID {
373 my $self = shift (@_);
374 my ($OID) = @_;
375
376 # if an OID wasn't provided then calculate hash value based on document
377 if (!defined $OID)
378 {
379 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
380 my $hash_len = length($hash_text);
381
382 $OID = &hashdoc::buffer($hash_text,$hash_len);
383 }
384
385 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
386}
387
388# returns the OID for this document
389sub get_OID {
390 my $self = shift (@_);
391 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
392 return $OID if (defined $OID);
393 return "NULL";
394}
395
396sub delete_OID {
397 my $self = shift (@_);
398
399 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
400}
401
402
403# methods for manipulating section names
404
405# returns the name of the top-most section (the top
406# level of the document
407sub get_top_section {
408 my $self = shift (@_);
409
410 return "";
411}
412
413# returns a section
414sub get_parent_section {
415 my $self = shift (@_);
416 my ($section) = @_;
417
418 $section =~ s/(^|\.)\d+$//;
419
420 return $section;
421}
422
423# returns the first child section (or the end child
424# if there isn't any)
425sub get_begin_child {
426 my $self = shift (@_);
427 my ($section) = @_;
428
429 my $section_ptr = $self->_lookup_section($section);
430 return "" unless defined $section_ptr;
431
432 if (defined $section_ptr->{'subsection_order'}->[0]) {
433 return "$section.$section_ptr->{'subsection_order'}->[0]";
434 }
435
436 return $self->get_end_child ($section);
437}
438
439# returns the next child of a parent section
440sub get_next_child {
441 my $self = shift (@_);
442 my ($section) = @_;
443
444 my $parent_section = $self->get_parent_section($section);
445 my $parent_section_ptr = $self->_lookup_section($parent_section);
446 return undef unless defined $parent_section_ptr;
447
448 my ($section_num) = $section =~ /(\d+)$/;
449 return undef unless defined $section_num;
450
451 my $i = 0;
452 my $section_order = $parent_section_ptr->{'subsection_order'};
453 while ($i < scalar(@$section_order)) {
454 last if $section_order->[$i] eq $section_num;
455 $i++;
456 }
457
458 $i++; # the next child
459 if ($i < scalar(@$section_order)) {
460 return $section_order->[$i] if $parent_section eq "";
461 return "$parent_section.$section_order->[$i]";
462 }
463
464 # no more sections in this level
465 return undef;
466}
467
468# returns a reference to a list of children
469sub get_children {
470 my $self = shift (@_);
471 my ($section) = @_;
472
473 my $section_ptr = $self->_lookup_section($section);
474 return [] unless defined $section_ptr;
475
476 my @children = @{$section_ptr->{'subsection_order'}};
477
478 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
479 return \@children;
480}
481
482# returns the child section one past the last one (which
483# is coded as "0")
484sub get_end_child {
485 my $self = shift (@_);
486 my ($section) = @_;
487
488 return $section . ".0" unless $section eq "";
489 return "0";
490}
491
492# returns the next section in book order
493sub get_next_section {
494 my $self = shift (@_);
495 my ($section) = @_;
496
497 return undef unless defined $section;
498
499 my $section_ptr = $self->_lookup_section($section);
500 return undef unless defined $section_ptr;
501
502 # first try to find first child
503 if (defined $section_ptr->{'subsection_order'}->[0]) {
504 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
505 return "$section.$section_ptr->{'subsection_order'}->[0]";
506 }
507
508 do {
509 # try to find sibling
510 my $next_child = $self->get_next_child ($section);
511 return $next_child if (defined $next_child);
512
513 # move up one level
514 $section = $self->get_parent_section ($section);
515 } while $section =~ /\d/;
516
517 return undef;
518}
519
520sub is_leaf_section {
521 my $self = shift (@_);
522 my ($section) = @_;
523
524 my $section_ptr = $self->_lookup_section($section);
525 return 1 unless defined $section_ptr;
526
527 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
528}
529
530# methods for dealing with sections
531
532# returns the name of the inserted section
533sub insert_section {
534 my $self = shift (@_);
535 my ($before_section) = @_;
536
537 # get the child to insert before and its parent section
538 my $parent_section = "";
539 my $before_child = "0";
540 my @before_section = split (/\./, $before_section);
541 if (scalar(@before_section) > 0) {
542 $before_child = pop (@before_section);
543 $parent_section = join (".", @before_section);
544 }
545
546 my $parent_section_ptr = $self->_lookup_section($parent_section);
547 if (!defined $parent_section_ptr) {
548 print STDERR "doc::insert_section couldn't find parent section " .
549 "$parent_section\n";
550 return;
551 }
552
553 # get the next section number
554 my $section_num = $parent_section_ptr->{'next_subsection'}++;
555
556 my $i = 0;
557 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
558 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
559 $i++;
560 }
561
562 # insert the section number into the order list
563 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
564
565 # add this section to the parent section
566 my $section_ptr = {'subsection_order'=>[],
567 'next_subsection'=>1,
568 'subsections'=>{},
569 'metadata'=>[],
570 'text'=>""};
571 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
572
573 # work out the full section number
574 my $section = $parent_section;
575 $section .= "." unless $section eq "";
576 $section .= $section_num;
577
578 return $section;
579}
580
581# creates a pre-named section
582sub create_named_section {
583 my $self = shift (@_);
584 my ($mastersection) = @_;
585
586 my ($num);
587 my $section = $mastersection;
588 my $sectionref = $self;
589
590 while ($section ne "") {
591 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
592 $num =~ s/^0+(\d)/$1/; # remove leading 0s
593 $section = "" unless defined $section;
594
595 if (defined $num) {
596 if (!defined $sectionref->{'subsections'}->{$num}) {
597 push (@{$sectionref->{'subsection_order'}}, $num);
598 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
599 'next_subsection'=>1,
600 'subsections'=>{},
601 'metadata'=>[],
602 'text'=>""};
603 if ($num >= $sectionref->{'next_subsection'}) {
604 $sectionref->{'next_subsection'} = $num + 1;
605 }
606 }
607 $sectionref = $sectionref->{'subsections'}->{$num};
608
609 } else {
610 print STDERR "doc::create_named_section couldn't create section ";
611 print STDERR "$mastersection\n";
612 last;
613 }
614 }
615}
616
617# returns a reference to a list of subsections
618sub list_subsections {
619 my $self = shift (@_);
620 my ($section) = @_;
621
622 my $section_ptr = $self->_lookup_section ($section);
623 if (!defined $section_ptr) {
624 print STDERR "doc::list_subsections couldn't find section $section\n";
625 return [];
626 }
627
628 return [@{$section_ptr->{'subsection_order'}}];
629}
630
631sub delete_section {
632 my $self = shift (@_);
633 my ($section) = @_;
634
635# my $section_ptr = {'subsection_order'=>[],
636# 'next_subsection'=>1,
637# 'subsections'=>{},
638# 'metadata'=>[],
639# 'text'=>""};
640
641 # if this is the top section reset everything
642 if ($section eq "") {
643 $self->{'subsection_order'} = [];
644 $self->{'subsections'} = {};
645 $self->{'metadata'} = [];
646 $self->{'text'} = "";
647 return;
648 }
649
650 # find the parent of the section to delete
651 my $parent_section = "";
652 my $child = "0";
653 my @section = split (/\./, $section);
654 if (scalar(@section) > 0) {
655 $child = pop (@section);
656 $parent_section = join (".", @section);
657 }
658
659 my $parent_section_ptr = $self->_lookup_section($parent_section);
660 if (!defined $parent_section_ptr) {
661 print STDERR "doc::delete_section couldn't find parent section " .
662 "$parent_section\n";
663 return;
664 }
665
666 # remove this section from the subsection_order list
667 my $i = 0;
668 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
669 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
670 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
671 last;
672 }
673 $i++;
674 }
675
676 # remove this section from the subsection hash
677 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
678 undef $parent_section_ptr->{'subsections'}->{$child};
679 }
680}
681
682#--
683# methods for dealing with metadata
684
685# set_metadata_element and get_metadata_element are for metadata
686# which should only have one value. add_meta_data and get_metadata
687# are for metadata which can have more than one value.
688
689# returns the first metadata value which matches field
690
691# This version of get metadata element works much like the one above,
692# except it allows for the namespace portion of a metadata element to
693# be ignored, thus if you are searching for dc.Title, the first piece
694# of matching metadata ending with the name Title (once any namespace
695# is removed) would be returned.
696# 28-11-2003 John Thompson
697sub get_metadata_element {
698 my $self = shift (@_);
699 my ($section, $field, $ignore_namespace) = @_;
700 my ($data);
701
702 $ignore_namespace = 0 unless defined $ignore_namespace;
703
704 my $section_ptr = $self->_lookup_section($section);
705 if (!defined $section_ptr) {
706 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
707 return;
708 }
709
710 # Remove the namespace if we are being told to ignore them
711 if($ignore_namespace) {
712 $field =~ s/^\w*\.//;
713 }
714
715 foreach $data (@{$section_ptr->{'metadata'}}) {
716
717 my $data_name = $data->[0];
718
719 # Remove the any namespace if we are being told to ignore them
720 if($ignore_namespace) {
721 $data_name =~ s/^\w*\.//;
722 }
723
724 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
725 }
726
727 return undef; # was not found
728}
729
730# returns a list of the form [value1, value2, ...]
731sub get_metadata {
732 my $self = shift (@_);
733 my ($section, $field, $ignore_namespace) = @_;
734 my ($data);
735
736 $ignore_namespace = 0 unless defined $ignore_namespace;
737
738 my $section_ptr = $self->_lookup_section($section);
739 if (!defined $section_ptr) {
740 print STDERR "doc::get_metadata couldn't find section ",
741 $section, "\n";
742 return;
743 }
744
745 # Remove the any namespace if we are being told to ignore them
746 if($ignore_namespace) {
747 $field =~ s/^\w*\.//;
748 }
749
750 my @metadata = ();
751 foreach $data (@{$section_ptr->{'metadata'}}) {
752
753 my $data_name = $data->[0];
754 # Remove the any namespace if we are being told to ignore them
755 if($ignore_namespace) {
756 $data_name =~ s/^\w*\.//;
757 }
758
759 push (@metadata, $data->[1]) if ($data_name eq $field);
760 }
761
762 return \@metadata;
763}
764
765# returns a list of the form [[field,value],[field,value],...]
766sub get_all_metadata {
767 my $self = shift (@_);
768 my ($section) = @_;
769
770 my $section_ptr = $self->_lookup_section($section);
771 if (!defined $section_ptr) {
772 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
773 return;
774 }
775
776 return $section_ptr->{'metadata'};
777}
778
779# $value is optional
780sub delete_metadata {
781 my $self = shift (@_);
782 my ($section, $field, $value) = @_;
783
784 my $section_ptr = $self->_lookup_section($section);
785 if (!defined $section_ptr) {
786 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
787 return;
788 }
789
790 my $i = 0;
791 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
792 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
793 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
794 splice (@{$section_ptr->{'metadata'}}, $i, 1);
795 } else {
796 $i++;
797 }
798 }
799}
800
801sub delete_all_metadata {
802 my $self = shift (@_);
803 my ($section) = @_;
804
805 my $section_ptr = $self->_lookup_section($section);
806 if (!defined $section_ptr) {
807 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
808 return;
809 }
810
811 $section_ptr->{'metadata'} = [];
812}
813
814sub set_metadata_element {
815 my $self = shift (@_);
816 my ($section, $field, $value) = @_;
817
818 $self->set_utf8_metadata_element ($section, $field,
819 &unicode::ascii2utf8(\$value));
820}
821
822# set_utf8_metadata_element assumes the text has already been
823# converted to the UTF-8 encoding.
824sub set_utf8_metadata_element {
825 my $self = shift (@_);
826 my ($section, $field, $value) = @_;
827
828 $self->delete_metadata ($section, $field);
829 $self->add_utf8_metadata ($section, $field, $value);
830}
831
832
833# add_metadata assumes the text is in (extended) ascii form. For
834# text which has already been converted to the UTF-8 format use
835# add_utf8_metadata.
836sub add_metadata {
837 my $self = shift (@_);
838 my ($section, $field, $value) = @_;
839
840 $self->add_utf8_metadata ($section, $field,
841 &unicode::ascii2utf8(\$value));
842}
843
844sub add_utf8_metadata {
845 my $self = shift (@_);
846 my ($section, $field, $value) = @_;
847
848 my $section_ptr = $self->_lookup_section($section);
849 if (!defined $section_ptr) {
850 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
851 return;
852 }
853 if (!defined $value) {
854 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
855 return;
856 }
857 if (!defined $field) {
858 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
859 return;
860 }
861
862 #print STDERR "###$field=$value\n";
863 # double check that the value is utf-8
864 if (!&unicode::check_is_utf8($value)) {
865 print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
866 &unicode::ensure_utf8(\$value);
867 print STDERR " Tried converting to utf8: $value\n";
868 }
869
870 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
871}
872
873
874# methods for dealing with text
875
876# returns the text for a section
877sub get_text {
878 my $self = shift (@_);
879 my ($section) = @_;
880
881 my $section_ptr = $self->_lookup_section($section);
882 if (!defined $section_ptr) {
883 print STDERR "doc::get_text couldn't find section " .
884 "$section\n";
885 return "";
886 }
887
888 return $section_ptr->{'text'};
889}
890
891# returns the (utf-8 encoded) length of the text for a section
892sub get_text_length {
893 my $self = shift (@_);
894 my ($section) = @_;
895
896 my $section_ptr = $self->_lookup_section($section);
897 if (!defined $section_ptr) {
898 print STDERR "doc::get_text_length couldn't find section " .
899 "$section\n";
900 return 0;
901 }
902
903 return length ($section_ptr->{'text'});
904}
905
906sub delete_text {
907 my $self = shift (@_);
908 my ($section) = @_;
909
910 my $section_ptr = $self->_lookup_section($section);
911 if (!defined $section_ptr) {
912 print STDERR "doc::delete_text couldn't find section " .
913 "$section\n";
914 return;
915 }
916
917 $section_ptr->{'text'} = "";
918}
919
920# add_text assumes the text is in (extended) ascii form. For
921# text which has been already converted to the UTF-8 format
922# use add_utf8_text.
923sub add_text {
924 my $self = shift (@_);
925 my ($section, $text) = @_;
926
927 # convert the text to UTF-8 encoded unicode characters
928 # and add the text
929 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
930}
931
932
933# add_utf8_text assumes the text to be added has already
934# been converted to the UTF-8 encoding. For ascii text use
935# add_text
936sub add_utf8_text {
937 my $self = shift (@_);
938 my ($section, $text) = @_;
939
940 my $section_ptr = $self->_lookup_section($section);
941 if (!defined $section_ptr) {
942 print STDERR "doc::add_utf8_text couldn't find section " .
943 "$section\n";
944 return;
945 }
946
947 $section_ptr->{'text'} .= $text;
948}
949
950# returns the Source meta, which is the utf8 filename generated.
951# Added a separate method here for convenience
952sub get_source {
953 my $self = shift (@_);
954 return $self->get_metadata_element ($self->get_top_section(), "Source");
955}
956
957# returns the SourceFile meta, which is the url reference to the URL-encoded
958# version of Source (the utf8 filename). Added a separate method here for convenience
959sub get_sourcefile {
960 my $self = shift (@_);
961 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
962}
963
964# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
965# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
966sub get_assocfile_from_sourcefile {
967 my $self = shift (@_);
968
969 # get the SourceFile meta, which is a *URL* to a file on the filesystem
970 my $top_section = $self->get_top_section();
971 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
972
973 # get the actual filename as it exists on the filesystem which this url refers to
974 $source_file = &unicode::url_to_filename($source_file);
975 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
976 return $assocfilename;
977}
978
979# methods for dealing with associated files
980
981# a file is associated with a document, NOT a section.
982# if section is defined it is noted in the data structure
983# only so that files associated from a particular section
984# may be removed later (using delete_section_assoc_files)
985sub associate_file {
986 my $self = shift (@_);
987 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
988 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
989
990 # remove all associated files with the same name
991 $self->delete_assoc_file ($assoc_filename);
992
993 push (@{$self->{'associated_files'}},
994 [$real_filename, $assoc_filename, $mime_type, $section]);
995}
996
997# returns a list of associated files in the form
998# [[real_filename, assoc_filename, mimetype], ...]
999sub get_assoc_files {
1000 my $self = shift (@_);
1001
1002 return $self->{'associated_files'};
1003}
1004
1005# the following two methods used to keep track of original associated files
1006# for incremental building. eg a txt file used by an item file does not end
1007# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1008# file for incremental build
1009sub associate_source_file {
1010 my $self = shift (@_);
1011 my ($full_filename) = @_;
1012
1013 push (@{$self->{'source_assoc_files'}}, $full_filename);
1014
1015}
1016
1017sub get_source_assoc_files {
1018 my $self = shift (@_);
1019
1020 return $self->{'source_assoc_files'};
1021
1022
1023}
1024sub metadata_file {
1025 my $self = shift (@_);
1026 my ($real_filename, $filename) = @_;
1027
1028 push (@{$self->{'metadata_files'}},
1029 [$real_filename, $filename]);
1030}
1031
1032# used for writing out archiveinf-doc.gdb, to list all the metadata files
1033sub get_meta_files {
1034 my $self = shift (@_);
1035
1036 return $self->{'metadata_files'};
1037}
1038
1039sub delete_section_assoc_files {
1040 my $self = shift (@_);
1041 my ($section) = @_;
1042
1043 my $i=0;
1044 while ($i < scalar (@{$self->{'associated_files'}})) {
1045 if (defined $self->{'associated_files'}->[$i]->[3] &&
1046 $self->{'associated_files'}->[$i]->[3] eq $section) {
1047 splice (@{$self->{'associated_files'}}, $i, 1);
1048 } else {
1049 $i++;
1050 }
1051 }
1052}
1053
1054sub delete_assoc_file {
1055 my $self = shift (@_);
1056 my ($assoc_filename) = @_;
1057
1058 my $i=0;
1059 while ($i < scalar (@{$self->{'associated_files'}})) {
1060 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1061 splice (@{$self->{'associated_files'}}, $i, 1);
1062 } else {
1063 $i++;
1064 }
1065 }
1066}
1067
1068sub reset_nextsection_ptr {
1069 my $self = shift (@_);
1070 my ($section) = @_;
1071
1072 my $section_ptr = $self->_lookup_section($section);
1073 $section_ptr->{'next_subsection'} = 1;
1074}
1075
10761;
Note: See TracBrowser for help on using the repository browser.