source: gsdl/trunk/perllib/doc.pm@ 15874

Last change on this file since 15874 was 15874, checked in by kjdon, 16 years ago

replaced some code with equivalent method from util (filename_within_collection)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 25.3 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use unicode;
37use util;
38use ghtml;
39use File::stat;
40##use hashdoc;
41use docprint;
42
43# the document type may be indexed_doc, nonindexed_doc, or
44# classification
45
46my $OIDcount = 0;
47
48sub new {
49 my $class = shift (@_);
50 my ($source_filename, $doc_type) = @_;
51
52
53 my $self = bless {'associated_files'=>[],
54 'subsection_order'=>[],
55 'next_subsection'=>1,
56 'subsections'=>{},
57 'metadata'=>[],
58 'text'=>"",
59 'OIDtype'=>"hash"}, $class;
60
61 # used to set lastmodified here, but this can screw up the HASH ids, so
62 # the docsave processor now calls set_lastmodified
63
64 $self->{'source_path'} = $source_filename;
65
66 if (defined $source_filename) {
67 $source_filename = &util::filename_within_collection($source_filename);
68 $self->set_source_filename ($source_filename);
69 }
70
71 $self->set_doc_type ($doc_type) if defined $doc_type;
72
73 return $self;
74}
75# set lastmodified for OAI purposes, added by GRB, moved by kjdon
76sub set_lastmodified {
77 my $self = shift (@_);
78
79 my $source_path = $self->{'source_path'};
80
81 if (defined $source_path && (-e $source_path)) {
82
83 my $file_stat = stat($source_path);
84 my $mtime = $file_stat->mtime;
85 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
86 }
87}
88
89# clone the $self object
90sub duplicate {
91 my $self = shift (@_);
92
93 my $newobj = {};
94
95 foreach my $k (keys %$self) {
96 $newobj->{$k} = &clone ($self->{$k});
97 }
98
99 bless $newobj, ref($self);
100 return $newobj;
101}
102
103sub clone {
104 my ($from) = @_;
105 my $type = ref ($from);
106
107 if ($type eq "HASH") {
108 my $to = {};
109 foreach my $key (keys %$from) {
110 $to->{$key} = &clone ($from->{$key});
111 }
112 return $to;
113 } elsif ($type eq "ARRAY") {
114 my $to = [];
115 foreach my $v (@$from) {
116 push (@$to, &clone ($v));
117 }
118 return $to;
119 } else {
120 return $from;
121 }
122}
123
124sub set_OIDtype {
125 my $self = shift (@_);
126 my ($type, $metadata) = @_;
127
128 if (defined $type && $type =~ /^(hash|incremental|dirname|assigned)$/) {
129 $self->{'OIDtype'} = $type;
130 } else {
131 $self->{'OIDtype'} = "hash";
132 }
133 if ($type =~ /^assigned$/) {
134 if (defined $metadata) {
135 $self->{'OIDmetadata'} = $metadata;
136 } else {
137 $self->{'OIDmetadata'} = "dc.Identifier";
138 }
139 }
140}
141
142sub set_source_filename {
143 my $self = shift (@_);
144 my ($source_filename) = @_;
145
146 $self->set_metadata_element ($self->get_top_section(),
147 "gsdlsourcefilename",
148 $source_filename);
149}
150
151sub set_converted_filename {
152 my $self = shift (@_);
153 my ($converted_filename) = @_;
154
155 $self->set_metadata_element ($self->get_top_section(),
156 "gsdlconvertedfilename",
157 $converted_filename);
158}
159
160
161# returns the source_filename as it was provided
162sub get_source_filename {
163 my $self = shift (@_);
164
165 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
166}
167
168# returns converted filename if available else returns source filename
169sub get_filename_for_hashing {
170 my $self = shift (@_);
171
172 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
173
174 if (!defined $filename) {
175 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
176 # if NULPlug processed file, then don't give a filename
177 if (defined $plugin_name && $plugin_name eq "NULPlug") {
178 $filename = undef;
179 } else {
180 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
181 }
182 }
183 return $filename;
184}
185
186sub set_doc_type {
187 my $self = shift (@_);
188 my ($doc_type) = @_;
189
190 $self->set_metadata_element ($self->get_top_section(),
191 "gsdldoctype",
192 $doc_type);
193}
194
195# returns the gsdldoctype as it was provided
196# the default of "indexed_doc" is used if no document
197# type was provided
198sub get_doc_type {
199 my $self = shift (@_);
200
201 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
202 return $doc_type if (defined $doc_type);
203 return "indexed_doc";
204}
205
206
207# look up the reference to the a particular section
208sub _lookup_section {
209 my $self = shift (@_);
210 my ($section) = @_;
211
212 my ($num);
213 my $sectionref = $self;
214
215 while (defined $section && $section ne "") {
216
217 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
218
219 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
220
221 $section = "" unless defined $section;
222
223
224 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
225 $sectionref = $sectionref->{'subsections'}->{$num};
226 } else {
227 return undef;
228 }
229 }
230
231 return $sectionref;
232}
233
234# calculate OID by hashing the contents of the document
235sub _calc_OID {
236 my $self = shift (@_);
237 my ($filename) = @_;
238
239 my $osexe = &util::get_os_exe();
240
241 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
242 $ENV{'GSDLOS'},"hashfile$osexe");
243
244 my $result = "NULL";
245
246 if (-e "$hashfile_exe") {
247# $result = `\"$hashfile_exe\" \"$filename\"`;
248 $result = `hashfile$osexe \"$filename\"`;
249 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
250
251 } else {
252 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
253 }
254 return "HASH$result";
255}
256
257# methods dealing with OID, not groups of them.
258
259# if $OID is not provided one is calculated
260sub set_OID {
261 my $self = shift (@_);
262 my ($OID) = @_;
263
264 my $use_hash_oid = 0;
265 # if an OID wasn't provided claculate one
266 if (!defined $OID) {
267 $OID = "NULL";
268 if ($self->{'OIDtype'} eq "hash") {
269 $use_hash_oid = 1;
270 } elsif ($self->{'OIDtype'} eq "incremental") {
271 $OID = "D" . $OIDcount;
272 $OIDcount ++;
273
274 } elsif ($self->{'OIDtype'} eq "dirname") {
275 $OID = 'J';
276 my $filename = $self->get_source_filename();
277 if (defined($filename)) { # && -e $filename) {
278 $OID = &File::Basename::dirname($filename);
279 if (defined $OID) {
280 $OID = 'J'.&File::Basename::basename($OID);
281 $OID =~ s/\.//; #remove any periods
282 } else {
283 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
284 $use_hash_oid = 1;
285 }
286 } else {
287 print STDERR "Failed to find filename, generating hash id\n";
288 $use_hash_oid = 1;
289 }
290
291 } elsif ($self->{'OIDtype'} eq "assigned") {
292 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
293 if (defined $identifier && $identifier ne "") {
294 $OID = "D" . $identifier;
295 $OID =~ s/\.//; #remove any periods
296 } else {
297 # need a hash id
298 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
299 $use_hash_oid = 1;
300 }
301
302 } else {
303 $use_hash_oid = 1;
304 }
305
306 if ($use_hash_oid) {
307
308 # "hash" OID - feed file to hashfile.exe
309 #my $filename = $self->get_source_filename();
310 # we want to use the converted file for hashing if available
311 # cos its quicker
312 my $filename = $self->get_filename_for_hashing();
313 # -z: don't want to hash on the file if it is zero size
314 if (defined($filename) && -e $filename && !-z $filename) {
315 $OID = $self->_calc_OID ($filename);
316 } else {
317 $filename = &util::get_tmp_filename();
318 if (!open (OUTFILE, ">$filename")) {
319 print STDERR "doc::set_OID could not write to $filename\n";
320 } else {
321 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
322 print OUTFILE $doc_text;
323 close (OUTFILE);
324 }
325 $OID = $self->_calc_OID ($filename);
326 &util::rm ($filename);
327 }
328 }
329 }
330 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
331}
332
333# this uses hashdoc (embedded c thingy) which is faster but still
334# needs a little work to be suffiently stable
335sub ___set_OID {
336 my $self = shift (@_);
337 my ($OID) = @_;
338
339 # if an OID wasn't provided then calculate hash value based on document
340 if (!defined $OID)
341 {
342 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
343 my $hash_len = length($hash_text);
344
345 $OID = &hashdoc::buffer($hash_text,$hash_len);
346 }
347
348 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
349}
350
351# returns the OID for this document
352sub get_OID {
353 my $self = shift (@_);
354 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
355 return $OID if (defined $OID);
356 return "NULL";
357}
358
359sub delete_OID {
360 my $self = shift (@_);
361
362 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
363}
364
365
366# methods for manipulating section names
367
368# returns the name of the top-most section (the top
369# level of the document
370sub get_top_section {
371 my $self = shift (@_);
372
373 return "";
374}
375
376# returns a section
377sub get_parent_section {
378 my $self = shift (@_);
379 my ($section) = @_;
380
381 $section =~ s/(^|\.)\d+$//;
382
383 return $section;
384}
385
386# returns the first child section (or the end child
387# if there isn't any)
388sub get_begin_child {
389 my $self = shift (@_);
390 my ($section) = @_;
391
392 my $section_ptr = $self->_lookup_section($section);
393 return "" unless defined $section_ptr;
394
395 if (defined $section_ptr->{'subsection_order'}->[0]) {
396 return "$section.$section_ptr->{'subsection_order'}->[0]";
397 }
398
399 return $self->get_end_child ($section);
400}
401
402# returns the next child of a parent section
403sub get_next_child {
404 my $self = shift (@_);
405 my ($section) = @_;
406
407 my $parent_section = $self->get_parent_section($section);
408 my $parent_section_ptr = $self->_lookup_section($parent_section);
409 return undef unless defined $parent_section_ptr;
410
411 my ($section_num) = $section =~ /(\d+)$/;
412 return undef unless defined $section_num;
413
414 my $i = 0;
415 my $section_order = $parent_section_ptr->{'subsection_order'};
416 while ($i < scalar(@$section_order)) {
417 last if $section_order->[$i] eq $section_num;
418 $i++;
419 }
420
421 $i++; # the next child
422 if ($i < scalar(@$section_order)) {
423 return $section_order->[$i] if $parent_section eq "";
424 return "$parent_section.$section_order->[$i]";
425 }
426
427 # no more sections in this level
428 return undef;
429}
430
431# returns a reference to a list of children
432sub get_children {
433 my $self = shift (@_);
434 my ($section) = @_;
435
436 my $section_ptr = $self->_lookup_section($section);
437 return [] unless defined $section_ptr;
438
439 my @children = @{$section_ptr->{'subsection_order'}};
440
441 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
442 return \@children;
443}
444
445# returns the child section one past the last one (which
446# is coded as "0")
447sub get_end_child {
448 my $self = shift (@_);
449 my ($section) = @_;
450
451 return $section . ".0" unless $section eq "";
452 return "0";
453}
454
455# returns the next section in book order
456sub get_next_section {
457 my $self = shift (@_);
458 my ($section) = @_;
459
460 return undef unless defined $section;
461
462 my $section_ptr = $self->_lookup_section($section);
463 return undef unless defined $section_ptr;
464
465 # first try to find first child
466 if (defined $section_ptr->{'subsection_order'}->[0]) {
467 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
468 return "$section.$section_ptr->{'subsection_order'}->[0]";
469 }
470
471 do {
472 # try to find sibling
473 my $next_child = $self->get_next_child ($section);
474 return $next_child if (defined $next_child);
475
476 # move up one level
477 $section = $self->get_parent_section ($section);
478 } while $section =~ /\d/;
479
480 return undef;
481}
482
483sub is_leaf_section {
484 my $self = shift (@_);
485 my ($section) = @_;
486
487 my $section_ptr = $self->_lookup_section($section);
488 return 1 unless defined $section_ptr;
489
490 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
491}
492
493# methods for dealing with sections
494
495# returns the name of the inserted section
496sub insert_section {
497 my $self = shift (@_);
498 my ($before_section) = @_;
499
500 # get the child to insert before and its parent section
501 my $parent_section = "";
502 my $before_child = "0";
503 my @before_section = split (/\./, $before_section);
504 if (scalar(@before_section) > 0) {
505 $before_child = pop (@before_section);
506 $parent_section = join (".", @before_section);
507 }
508
509 my $parent_section_ptr = $self->_lookup_section($parent_section);
510 if (!defined $parent_section_ptr) {
511 print STDERR "doc::insert_section couldn't find parent section " .
512 "$parent_section\n";
513 return;
514 }
515
516 # get the next section number
517 my $section_num = $parent_section_ptr->{'next_subsection'}++;
518
519 my $i = 0;
520 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
521 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
522 $i++;
523 }
524
525 # insert the section number into the order list
526 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
527
528 # add this section to the parent section
529 my $section_ptr = {'subsection_order'=>[],
530 'next_subsection'=>1,
531 'subsections'=>{},
532 'metadata'=>[],
533 'text'=>""};
534 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
535
536 # work out the full section number
537 my $section = $parent_section;
538 $section .= "." unless $section eq "";
539 $section .= $section_num;
540
541 return $section;
542}
543
544# creates a pre-named section
545sub create_named_section {
546 my $self = shift (@_);
547 my ($mastersection) = @_;
548
549 my ($num);
550 my $section = $mastersection;
551 my $sectionref = $self;
552
553 while ($section ne "") {
554 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
555 $num =~ s/^0+(\d)/$1/; # remove leading 0s
556 $section = "" unless defined $section;
557
558 if (defined $num) {
559 if (!defined $sectionref->{'subsections'}->{$num}) {
560 push (@{$sectionref->{'subsection_order'}}, $num);
561 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
562 'next_subsection'=>1,
563 'subsections'=>{},
564 'metadata'=>[],
565 'text'=>""};
566 if ($num >= $sectionref->{'next_subsection'}) {
567 $sectionref->{'next_subsection'} = $num + 1;
568 }
569 }
570 $sectionref = $sectionref->{'subsections'}->{$num};
571
572 } else {
573 print STDERR "doc::create_named_section couldn't create section ";
574 print STDERR "$mastersection\n";
575 last;
576 }
577 }
578}
579
580# returns a reference to a list of subsections
581sub list_subsections {
582 my $self = shift (@_);
583 my ($section) = @_;
584
585 my $section_ptr = $self->_lookup_section ($section);
586 if (!defined $section_ptr) {
587 print STDERR "doc::list_subsections couldn't find section $section\n";
588 return [];
589 }
590
591 return [@{$section_ptr->{'subsection_order'}}];
592}
593
594sub delete_section {
595 my $self = shift (@_);
596 my ($section) = @_;
597
598# my $section_ptr = {'subsection_order'=>[],
599# 'next_subsection'=>1,
600# 'subsections'=>{},
601# 'metadata'=>[],
602# 'text'=>""};
603
604 # if this is the top section reset everything
605 if ($section eq "") {
606 $self->{'subsection_order'} = [];
607 $self->{'subsections'} = {};
608 $self->{'metadata'} = [];
609 $self->{'text'} = "";
610 return;
611 }
612
613 # find the parent of the section to delete
614 my $parent_section = "";
615 my $child = "0";
616 my @section = split (/\./, $section);
617 if (scalar(@section) > 0) {
618 $child = pop (@section);
619 $parent_section = join (".", @section);
620 }
621
622 my $parent_section_ptr = $self->_lookup_section($parent_section);
623 if (!defined $parent_section_ptr) {
624 print STDERR "doc::delete_section couldn't find parent section " .
625 "$parent_section\n";
626 return;
627 }
628
629 # remove this section from the subsection_order list
630 my $i = 0;
631 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
632 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
633 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
634 last;
635 }
636 $i++;
637 }
638
639 # remove this section from the subsection hash
640 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
641 undef $parent_section_ptr->{'subsections'}->{$child};
642 }
643}
644
645#--
646# methods for dealing with metadata
647
648# set_metadata_element and get_metadata_element are for metadata
649# which should only have one value. add_meta_data and get_metadata
650# are for metadata which can have more than one value.
651
652# returns the first metadata value which matches field
653
654# This version of get metadata element works much like the one above,
655# except it allows for the namespace portion of a metadata element to
656# be ignored, thus if you are searching for dc.Title, the first piece
657# of matching metadata ending with the name Title (once any namespace
658# is removed) would be returned.
659# 28-11-2003 John Thompson
660sub get_metadata_element {
661 my $self = shift (@_);
662 my ($section, $field, $ignore_namespace) = @_;
663 my ($data);
664
665 $ignore_namespace = 0 unless defined $ignore_namespace;
666
667 my $section_ptr = $self->_lookup_section($section);
668 if (!defined $section_ptr) {
669 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
670 return;
671 }
672
673 # Remove the any namespace if we are being told to ignore them
674 if($ignore_namespace) {
675 $field =~ s/^\w*\.//;
676 }
677
678 foreach $data (@{$section_ptr->{'metadata'}}) {
679
680 my $data_name = $data->[0];
681
682 # Remove the any namespace if we are being told to ignore them
683 if($ignore_namespace) {
684 $data_name =~ s/^\w*\.//;
685 }
686
687 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
688 }
689
690 return undef; # was not found
691}
692
693# returns a list of the form [value1, value2, ...]
694sub get_metadata {
695 my $self = shift (@_);
696 my ($section, $field, $ignore_namespace) = @_;
697 my ($data);
698
699 $ignore_namespace = 0 unless defined $ignore_namespace;
700
701 my $section_ptr = $self->_lookup_section($section);
702 if (!defined $section_ptr) {
703 print STDERR "doc::get_metadata couldn't find section ",
704 $section, "\n";
705 return;
706 }
707
708 # Remove the any namespace if we are being told to ignore them
709 if($ignore_namespace) {
710 $field =~ s/^\w*\.//;
711 }
712
713 my @metadata = ();
714 foreach $data (@{$section_ptr->{'metadata'}}) {
715
716 my $data_name = $data->[0];
717 # Remove the any namespace if we are being told to ignore them
718 if($ignore_namespace) {
719 $data_name =~ s/^\w*\.//;
720 }
721
722 push (@metadata, $data->[1]) if ($data_name eq $field);
723 }
724
725 return \@metadata;
726}
727
728# returns a list of the form [[field,value],[field,value],...]
729sub get_all_metadata {
730 my $self = shift (@_);
731 my ($section) = @_;
732
733 my $section_ptr = $self->_lookup_section($section);
734 if (!defined $section_ptr) {
735 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
736 return;
737 }
738
739 return $section_ptr->{'metadata'};
740}
741
742# $value is optional
743sub delete_metadata {
744 my $self = shift (@_);
745 my ($section, $field, $value) = @_;
746
747 my $section_ptr = $self->_lookup_section($section);
748 if (!defined $section_ptr) {
749 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
750 return;
751 }
752
753 my $i = 0;
754 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
755 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
756 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
757 splice (@{$section_ptr->{'metadata'}}, $i, 1);
758 } else {
759 $i++;
760 }
761 }
762}
763
764sub delete_all_metadata {
765 my $self = shift (@_);
766 my ($section) = @_;
767
768 my $section_ptr = $self->_lookup_section($section);
769 if (!defined $section_ptr) {
770 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
771 return;
772 }
773
774 $section_ptr->{'metadata'} = [];
775}
776
777sub set_metadata_element {
778 my $self = shift (@_);
779 my ($section, $field, $value) = @_;
780
781 $self->set_utf8_metadata_element ($section, $field,
782 &unicode::ascii2utf8(\$value));
783}
784
785# set_utf8_metadata_element assumes the text has already been
786# converted to the UTF-8 encoding.
787sub set_utf8_metadata_element {
788 my $self = shift (@_);
789 my ($section, $field, $value) = @_;
790
791 $self->delete_metadata ($section, $field);
792 $self->add_utf8_metadata ($section, $field, $value);
793}
794
795
796# add_metadata assumes the text is in (extended) ascii form. For
797# text which has already been converted to the UTF-8 format use
798# add_utf8_metadata.
799sub add_metadata {
800 my $self = shift (@_);
801 my ($section, $field, $value) = @_;
802
803 $self->add_utf8_metadata ($section, $field,
804 &unicode::ascii2utf8(\$value));
805}
806
807sub add_utf8_metadata {
808 my $self = shift (@_);
809 my ($section, $field, $value) = @_;
810
811 my $section_ptr = $self->_lookup_section($section);
812 if (!defined $section_ptr) {
813 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
814 return;
815 }
816 if (!defined $value) {
817 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
818 return;
819 }
820 if (!defined $field) {
821 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
822 return;
823 }
824
825 #print STDERR "###$field=$value\n";
826 # double check that the value is utf-8
827 if (unicode::ensure_utf8(\$value)) {
828 print STDERR "doc::add_utf8_metadata: warning: '$field' wasn't utf8\n";
829 }
830
831 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
832}
833
834
835# methods for dealing with text
836
837# returns the text for a section
838sub get_text {
839 my $self = shift (@_);
840 my ($section) = @_;
841
842 my $section_ptr = $self->_lookup_section($section);
843 if (!defined $section_ptr) {
844 print STDERR "doc::get_text couldn't find section " .
845 "$section\n";
846 return "";
847 }
848
849 return $section_ptr->{'text'};
850}
851
852# returns the (utf-8 encoded) length of the text for a section
853sub get_text_length {
854 my $self = shift (@_);
855 my ($section) = @_;
856
857 my $section_ptr = $self->_lookup_section($section);
858 if (!defined $section_ptr) {
859 print STDERR "doc::get_text_length couldn't find section " .
860 "$section\n";
861 return 0;
862 }
863
864 return length ($section_ptr->{'text'});
865}
866
867sub delete_text {
868 my $self = shift (@_);
869 my ($section) = @_;
870
871 my $section_ptr = $self->_lookup_section($section);
872 if (!defined $section_ptr) {
873 print STDERR "doc::delete_text couldn't find section " .
874 "$section\n";
875 return;
876 }
877
878 $section_ptr->{'text'} = "";
879}
880
881# add_text assumes the text is in (extended) ascii form. For
882# text which has been already converted to the UTF-8 format
883# use add_utf8_text.
884sub add_text {
885 my $self = shift (@_);
886 my ($section, $text) = @_;
887
888 # convert the text to UTF-8 encoded unicode characters
889 # and add the text
890 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
891}
892
893
894# add_utf8_text assumes the text to be added has already
895# been converted to the UTF-8 encoding. For ascii text use
896# add_text
897sub add_utf8_text {
898 my $self = shift (@_);
899 my ($section, $text) = @_;
900
901 my $section_ptr = $self->_lookup_section($section);
902 if (!defined $section_ptr) {
903 print STDERR "doc::add_utf8_text couldn't find section " .
904 "$section\n";
905 return;
906 }
907
908 $section_ptr->{'text'} .= $text;
909}
910
911
912# methods for dealing with associated files
913
914# a file is associated with a document, NOT a section.
915# if section is defined it is noted in the data structure
916# only so that files associated from a particular section
917# may be removed later (using delete_section_assoc_files)
918sub associate_file {
919 my $self = shift (@_);
920 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
921 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
922
923 # remove all associated files with the same name
924 $self->delete_assoc_file ($assoc_filename);
925
926 push (@{$self->{'associated_files'}},
927 [$real_filename, $assoc_filename, $mime_type, $section]);
928}
929
930# returns a list of associated files in the form
931# [[real_filename, assoc_filename, mimetype], ...]
932sub get_assoc_files {
933 my $self = shift (@_);
934
935 return $self->{'associated_files'};
936}
937
938sub delete_section_assoc_files {
939 my $self = shift (@_);
940 my ($section) = @_;
941
942 my $i=0;
943 while ($i < scalar (@{$self->{'associated_files'}})) {
944 if (defined $self->{'associated_files'}->[$i]->[3] &&
945 $self->{'associated_files'}->[$i]->[3] eq $section) {
946 splice (@{$self->{'associated_files'}}, $i, 1);
947 } else {
948 $i++;
949 }
950 }
951}
952
953sub delete_assoc_file {
954 my $self = shift (@_);
955 my ($assoc_filename) = @_;
956
957 my $i=0;
958 while ($i < scalar (@{$self->{'associated_files'}})) {
959 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
960 splice (@{$self->{'associated_files'}}, $i, 1);
961 } else {
962 $i++;
963 }
964 }
965}
966
967sub reset_nextsection_ptr {
968 my $self = shift (@_);
969 my ($section) = @_;
970
971 my $section_ptr = $self->_lookup_section($section);
972 $section_ptr->{'next_subsection'} = 1;
973}
974
9751;
Note: See TracBrowser for help on using the repository browser.