source: gsdl/trunk/perllib/doc.pm@ 16670

Last change on this file since 16670 was 16670, checked in by ak19, 16 years ago

Instead of base64 encoding the gsdl_source_filename, it now URL encodes it. GLI's DocXMLFile.java urlDecodes it now (instead of applying base64 decoding).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 26.5 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47my $OIDcount = 0;
48
49sub new {
50 my $class = shift (@_);
51 my ($source_filename, $doc_type) = @_;
52
53
54 my $self = bless {'associated_files'=>[],
55 'subsection_order'=>[],
56 'next_subsection'=>1,
57 'subsections'=>{},
58 'metadata'=>[],
59 'text'=>"",
60 'OIDtype'=>"hash"}, $class;
61
62 # used to set lastmodified here, but this can screw up the HASH ids, so
63 # the docsave processor now calls set_lastmodified
64
65 $self->{'source_path'} = $source_filename;
66
67 if (defined $source_filename) {
68 $source_filename = &util::filename_within_collection($source_filename);
69 $self->set_source_filename ($source_filename);
70 }
71
72 $self->set_doc_type ($doc_type) if defined $doc_type;
73
74 return $self;
75}
76# set lastmodified for OAI purposes, added by GRB, moved by kjdon
77sub set_lastmodified {
78 my $self = shift (@_);
79
80 my $source_path = $self->{'source_path'};
81
82 if (defined $source_path && (-e $source_path)) {
83
84 my $file_stat = stat($source_path);
85 my $mtime = $file_stat->mtime;
86 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
87 }
88}
89
90# clone the $self object
91sub duplicate {
92 my $self = shift (@_);
93
94 my $newobj = {};
95
96 foreach my $k (keys %$self) {
97 $newobj->{$k} = &clone ($self->{$k});
98 }
99
100 bless $newobj, ref($self);
101 return $newobj;
102}
103
104sub clone {
105 my ($from) = @_;
106 my $type = ref ($from);
107
108 if ($type eq "HASH") {
109 my $to = {};
110 foreach my $key (keys %$from) {
111 $to->{$key} = &clone ($from->{$key});
112 }
113 return $to;
114 } elsif ($type eq "ARRAY") {
115 my $to = [];
116 foreach my $v (@$from) {
117 push (@$to, &clone ($v));
118 }
119 return $to;
120 } else {
121 return $from;
122 }
123}
124
125sub set_OIDtype {
126 my $self = shift (@_);
127 my ($type, $metadata) = @_;
128
129 if (defined $type && $type =~ /^(hash|incremental|dirname|assigned)$/) {
130 $self->{'OIDtype'} = $type;
131 } else {
132 $self->{'OIDtype'} = "hash";
133 }
134 if ($type =~ /^assigned$/) {
135 if (defined $metadata) {
136 $self->{'OIDmetadata'} = $metadata;
137 } else {
138 $self->{'OIDmetadata'} = "dc.Identifier";
139 }
140 }
141}
142
143sub set_source_filename {
144 my $self = shift (@_);
145 my ($source_filename) = @_;
146
147 # Since the gsdlsourcefilename element goes into the doc.xml it has
148 # to be utf8. However, it should also *represent* the source filename
149 # (in the import directory) which may not be utf8 at all.
150 # For instance, if this meta element (gsdlsourcefilename) will be
151 # used by other applications that parse doc.xml in order to locate
152 # gsdlsourcefilename. Therefore, the solution is to URLencode the real
153 # filename as this is a binary-to-text encoding meaning that the
154 # resulting string is ASCII (utf8).
155
156 print STDERR "******URLencoding the gsdl_source_filename $source_filename ";
157
158 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
159 my ($srcfilename,$dirname,$suffix)
160 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
161 print STDERR "-> $srcfilename -> ";
162 $srcfilename = &unicode::url_encode($srcfilename.$suffix);
163 $source_filename = &util::filename_cat($dirname, $srcfilename);
164 print STDERR "$source_filename\n";
165
166 $self->set_utf8_metadata_element ($self->get_top_section(),
167 "gsdlsourcefilename",
168 $source_filename);
169}
170
171sub set_converted_filename {
172 my $self = shift (@_);
173 my ($converted_filename) = @_;
174
175 # we know the converted filename is utf8
176 $self->set_utf8_metadata_element ($self->get_top_section(),
177 "gsdlconvertedfilename",
178 $converted_filename);
179}
180
181
182# returns the source_filename as it was provided
183sub get_source_filename {
184 my $self = shift (@_);
185
186 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
187}
188
189# returns converted filename if available else returns source filename
190sub get_filename_for_hashing {
191 my $self = shift (@_);
192
193 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
194
195 if (!defined $filename) {
196 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
197 # if NULPlug processed file, then don't give a filename
198 if (defined $plugin_name && $plugin_name eq "NULPlug") {
199 $filename = undef;
200 } else { # returns the URL encoded source filename!
201 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
202 }
203 }
204 return $filename;
205}
206
207sub set_doc_type {
208 my $self = shift (@_);
209 my ($doc_type) = @_;
210
211 $self->set_metadata_element ($self->get_top_section(),
212 "gsdldoctype",
213 $doc_type);
214}
215
216# returns the gsdldoctype as it was provided
217# the default of "indexed_doc" is used if no document
218# type was provided
219sub get_doc_type {
220 my $self = shift (@_);
221
222 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
223 return $doc_type if (defined $doc_type);
224 return "indexed_doc";
225}
226
227
228# look up the reference to the a particular section
229sub _lookup_section {
230 my $self = shift (@_);
231 my ($section) = @_;
232
233 my ($num);
234 my $sectionref = $self;
235
236 while (defined $section && $section ne "") {
237
238 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
239
240 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
241
242 $section = "" unless defined $section;
243
244
245 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
246 $sectionref = $sectionref->{'subsections'}->{$num};
247 } else {
248 return undef;
249 }
250 }
251
252 return $sectionref;
253}
254
255# calculate OID by hashing the contents of the document
256sub _calc_OID {
257 my $self = shift (@_);
258 my ($filename) = @_;
259
260 my $osexe = &util::get_os_exe();
261
262 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
263 $ENV{'GSDLOS'},"hashfile$osexe");
264
265 my $result = "NULL";
266
267 if (-e "$hashfile_exe") {
268# $result = `\"$hashfile_exe\" \"$filename\"`;
269 $result = `hashfile$osexe \"$filename\"`;
270 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
271 } else {
272 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
273 }
274 return "HASH$result";
275}
276
277# methods dealing with OID, not groups of them.
278
279# if $OID is not provided one is calculated
280sub set_OID {
281 my $self = shift (@_);
282 my ($OID) = @_;
283
284 my $use_hash_oid = 0;
285 # if an OID wasn't provided claculate one
286 if (!defined $OID) {
287 $OID = "NULL";
288 if ($self->{'OIDtype'} eq "hash") {
289 $use_hash_oid = 1;
290 } elsif ($self->{'OIDtype'} eq "incremental") {
291 $OID = "D" . $OIDcount;
292 $OIDcount ++;
293
294 } elsif ($self->{'OIDtype'} eq "dirname") {
295 $OID = 'J';
296 my $filename = $self->get_source_filename();
297 if (defined($filename)) { # && -e $filename) {
298 $OID = &File::Basename::dirname($filename);
299 if (defined $OID) {
300 $OID = 'J'.&File::Basename::basename($OID);
301 $OID =~ s/\.//; #remove any periods
302 } else {
303 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
304 $use_hash_oid = 1;
305 }
306 } else {
307 print STDERR "Failed to find filename, generating hash id\n";
308 $use_hash_oid = 1;
309 }
310
311 } elsif ($self->{'OIDtype'} eq "assigned") {
312 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
313 if (defined $identifier && $identifier ne "") {
314 $OID = "D" . $identifier;
315 $OID =~ s/\.//; #remove any periods
316 } else {
317 # need a hash id
318 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
319 $use_hash_oid = 1;
320 }
321
322 } else {
323 $use_hash_oid = 1;
324 }
325
326 if ($use_hash_oid) {
327
328 # "hash" OID - feed file to hashfile.exe
329 #my $filename = $self->get_source_filename();
330 # we want to use the converted file for hashing if available
331 # cos its quicker
332 my $filename = $self->get_filename_for_hashing();
333 # -z: don't want to hash on the file if it is zero size
334 if (defined($filename) && -e $filename && !-z $filename) {
335 $OID = $self->_calc_OID ($filename);
336 } else {
337 $filename = &util::get_tmp_filename();
338 if (!open (OUTFILE, ">$filename")) {
339 print STDERR "doc::set_OID could not write to $filename\n";
340 } else {
341 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
342 print OUTFILE $doc_text;
343 close (OUTFILE);
344 }
345 $OID = $self->_calc_OID ($filename);
346 &util::rm ($filename);
347 }
348 }
349 }
350 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
351}
352
353# this uses hashdoc (embedded c thingy) which is faster but still
354# needs a little work to be suffiently stable
355sub ___set_OID {
356 my $self = shift (@_);
357 my ($OID) = @_;
358
359 # if an OID wasn't provided then calculate hash value based on document
360 if (!defined $OID)
361 {
362 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
363 my $hash_len = length($hash_text);
364
365 $OID = &hashdoc::buffer($hash_text,$hash_len);
366 }
367
368 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
369}
370
371# returns the OID for this document
372sub get_OID {
373 my $self = shift (@_);
374 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
375 return $OID if (defined $OID);
376 return "NULL";
377}
378
379sub delete_OID {
380 my $self = shift (@_);
381
382 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
383}
384
385
386# methods for manipulating section names
387
388# returns the name of the top-most section (the top
389# level of the document
390sub get_top_section {
391 my $self = shift (@_);
392
393 return "";
394}
395
396# returns a section
397sub get_parent_section {
398 my $self = shift (@_);
399 my ($section) = @_;
400
401 $section =~ s/(^|\.)\d+$//;
402
403 return $section;
404}
405
406# returns the first child section (or the end child
407# if there isn't any)
408sub get_begin_child {
409 my $self = shift (@_);
410 my ($section) = @_;
411
412 my $section_ptr = $self->_lookup_section($section);
413 return "" unless defined $section_ptr;
414
415 if (defined $section_ptr->{'subsection_order'}->[0]) {
416 return "$section.$section_ptr->{'subsection_order'}->[0]";
417 }
418
419 return $self->get_end_child ($section);
420}
421
422# returns the next child of a parent section
423sub get_next_child {
424 my $self = shift (@_);
425 my ($section) = @_;
426
427 my $parent_section = $self->get_parent_section($section);
428 my $parent_section_ptr = $self->_lookup_section($parent_section);
429 return undef unless defined $parent_section_ptr;
430
431 my ($section_num) = $section =~ /(\d+)$/;
432 return undef unless defined $section_num;
433
434 my $i = 0;
435 my $section_order = $parent_section_ptr->{'subsection_order'};
436 while ($i < scalar(@$section_order)) {
437 last if $section_order->[$i] eq $section_num;
438 $i++;
439 }
440
441 $i++; # the next child
442 if ($i < scalar(@$section_order)) {
443 return $section_order->[$i] if $parent_section eq "";
444 return "$parent_section.$section_order->[$i]";
445 }
446
447 # no more sections in this level
448 return undef;
449}
450
451# returns a reference to a list of children
452sub get_children {
453 my $self = shift (@_);
454 my ($section) = @_;
455
456 my $section_ptr = $self->_lookup_section($section);
457 return [] unless defined $section_ptr;
458
459 my @children = @{$section_ptr->{'subsection_order'}};
460
461 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
462 return \@children;
463}
464
465# returns the child section one past the last one (which
466# is coded as "0")
467sub get_end_child {
468 my $self = shift (@_);
469 my ($section) = @_;
470
471 return $section . ".0" unless $section eq "";
472 return "0";
473}
474
475# returns the next section in book order
476sub get_next_section {
477 my $self = shift (@_);
478 my ($section) = @_;
479
480 return undef unless defined $section;
481
482 my $section_ptr = $self->_lookup_section($section);
483 return undef unless defined $section_ptr;
484
485 # first try to find first child
486 if (defined $section_ptr->{'subsection_order'}->[0]) {
487 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
488 return "$section.$section_ptr->{'subsection_order'}->[0]";
489 }
490
491 do {
492 # try to find sibling
493 my $next_child = $self->get_next_child ($section);
494 return $next_child if (defined $next_child);
495
496 # move up one level
497 $section = $self->get_parent_section ($section);
498 } while $section =~ /\d/;
499
500 return undef;
501}
502
503sub is_leaf_section {
504 my $self = shift (@_);
505 my ($section) = @_;
506
507 my $section_ptr = $self->_lookup_section($section);
508 return 1 unless defined $section_ptr;
509
510 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
511}
512
513# methods for dealing with sections
514
515# returns the name of the inserted section
516sub insert_section {
517 my $self = shift (@_);
518 my ($before_section) = @_;
519
520 # get the child to insert before and its parent section
521 my $parent_section = "";
522 my $before_child = "0";
523 my @before_section = split (/\./, $before_section);
524 if (scalar(@before_section) > 0) {
525 $before_child = pop (@before_section);
526 $parent_section = join (".", @before_section);
527 }
528
529 my $parent_section_ptr = $self->_lookup_section($parent_section);
530 if (!defined $parent_section_ptr) {
531 print STDERR "doc::insert_section couldn't find parent section " .
532 "$parent_section\n";
533 return;
534 }
535
536 # get the next section number
537 my $section_num = $parent_section_ptr->{'next_subsection'}++;
538
539 my $i = 0;
540 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
541 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
542 $i++;
543 }
544
545 # insert the section number into the order list
546 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
547
548 # add this section to the parent section
549 my $section_ptr = {'subsection_order'=>[],
550 'next_subsection'=>1,
551 'subsections'=>{},
552 'metadata'=>[],
553 'text'=>""};
554 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
555
556 # work out the full section number
557 my $section = $parent_section;
558 $section .= "." unless $section eq "";
559 $section .= $section_num;
560
561 return $section;
562}
563
564# creates a pre-named section
565sub create_named_section {
566 my $self = shift (@_);
567 my ($mastersection) = @_;
568
569 my ($num);
570 my $section = $mastersection;
571 my $sectionref = $self;
572
573 while ($section ne "") {
574 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
575 $num =~ s/^0+(\d)/$1/; # remove leading 0s
576 $section = "" unless defined $section;
577
578 if (defined $num) {
579 if (!defined $sectionref->{'subsections'}->{$num}) {
580 push (@{$sectionref->{'subsection_order'}}, $num);
581 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
582 'next_subsection'=>1,
583 'subsections'=>{},
584 'metadata'=>[],
585 'text'=>""};
586 if ($num >= $sectionref->{'next_subsection'}) {
587 $sectionref->{'next_subsection'} = $num + 1;
588 }
589 }
590 $sectionref = $sectionref->{'subsections'}->{$num};
591
592 } else {
593 print STDERR "doc::create_named_section couldn't create section ";
594 print STDERR "$mastersection\n";
595 last;
596 }
597 }
598}
599
600# returns a reference to a list of subsections
601sub list_subsections {
602 my $self = shift (@_);
603 my ($section) = @_;
604
605 my $section_ptr = $self->_lookup_section ($section);
606 if (!defined $section_ptr) {
607 print STDERR "doc::list_subsections couldn't find section $section\n";
608 return [];
609 }
610
611 return [@{$section_ptr->{'subsection_order'}}];
612}
613
614sub delete_section {
615 my $self = shift (@_);
616 my ($section) = @_;
617
618# my $section_ptr = {'subsection_order'=>[],
619# 'next_subsection'=>1,
620# 'subsections'=>{},
621# 'metadata'=>[],
622# 'text'=>""};
623
624 # if this is the top section reset everything
625 if ($section eq "") {
626 $self->{'subsection_order'} = [];
627 $self->{'subsections'} = {};
628 $self->{'metadata'} = [];
629 $self->{'text'} = "";
630 return;
631 }
632
633 # find the parent of the section to delete
634 my $parent_section = "";
635 my $child = "0";
636 my @section = split (/\./, $section);
637 if (scalar(@section) > 0) {
638 $child = pop (@section);
639 $parent_section = join (".", @section);
640 }
641
642 my $parent_section_ptr = $self->_lookup_section($parent_section);
643 if (!defined $parent_section_ptr) {
644 print STDERR "doc::delete_section couldn't find parent section " .
645 "$parent_section\n";
646 return;
647 }
648
649 # remove this section from the subsection_order list
650 my $i = 0;
651 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
652 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
653 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
654 last;
655 }
656 $i++;
657 }
658
659 # remove this section from the subsection hash
660 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
661 undef $parent_section_ptr->{'subsections'}->{$child};
662 }
663}
664
665#--
666# methods for dealing with metadata
667
668# set_metadata_element and get_metadata_element are for metadata
669# which should only have one value. add_meta_data and get_metadata
670# are for metadata which can have more than one value.
671
672# returns the first metadata value which matches field
673
674# This version of get metadata element works much like the one above,
675# except it allows for the namespace portion of a metadata element to
676# be ignored, thus if you are searching for dc.Title, the first piece
677# of matching metadata ending with the name Title (once any namespace
678# is removed) would be returned.
679# 28-11-2003 John Thompson
680sub get_metadata_element {
681 my $self = shift (@_);
682 my ($section, $field, $ignore_namespace) = @_;
683 my ($data);
684
685 $ignore_namespace = 0 unless defined $ignore_namespace;
686
687 my $section_ptr = $self->_lookup_section($section);
688 if (!defined $section_ptr) {
689 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
690 return;
691 }
692
693 # Remove the any namespace if we are being told to ignore them
694 if($ignore_namespace) {
695 $field =~ s/^\w*\.//;
696 }
697
698 foreach $data (@{$section_ptr->{'metadata'}}) {
699
700 my $data_name = $data->[0];
701
702 # Remove the any namespace if we are being told to ignore them
703 if($ignore_namespace) {
704 $data_name =~ s/^\w*\.//;
705 }
706
707 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
708 }
709
710 return undef; # was not found
711}
712
713# returns a list of the form [value1, value2, ...]
714sub get_metadata {
715 my $self = shift (@_);
716 my ($section, $field, $ignore_namespace) = @_;
717 my ($data);
718
719 $ignore_namespace = 0 unless defined $ignore_namespace;
720
721 my $section_ptr = $self->_lookup_section($section);
722 if (!defined $section_ptr) {
723 print STDERR "doc::get_metadata couldn't find section ",
724 $section, "\n";
725 return;
726 }
727
728 # Remove the any namespace if we are being told to ignore them
729 if($ignore_namespace) {
730 $field =~ s/^\w*\.//;
731 }
732
733 my @metadata = ();
734 foreach $data (@{$section_ptr->{'metadata'}}) {
735
736 my $data_name = $data->[0];
737 # Remove the any namespace if we are being told to ignore them
738 if($ignore_namespace) {
739 $data_name =~ s/^\w*\.//;
740 }
741
742 push (@metadata, $data->[1]) if ($data_name eq $field);
743 }
744
745 return \@metadata;
746}
747
748# returns a list of the form [[field,value],[field,value],...]
749sub get_all_metadata {
750 my $self = shift (@_);
751 my ($section) = @_;
752
753 my $section_ptr = $self->_lookup_section($section);
754 if (!defined $section_ptr) {
755 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
756 return;
757 }
758
759 return $section_ptr->{'metadata'};
760}
761
762# $value is optional
763sub delete_metadata {
764 my $self = shift (@_);
765 my ($section, $field, $value) = @_;
766
767 my $section_ptr = $self->_lookup_section($section);
768 if (!defined $section_ptr) {
769 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
770 return;
771 }
772
773 my $i = 0;
774 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
775 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
776 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
777 splice (@{$section_ptr->{'metadata'}}, $i, 1);
778 } else {
779 $i++;
780 }
781 }
782}
783
784sub delete_all_metadata {
785 my $self = shift (@_);
786 my ($section) = @_;
787
788 my $section_ptr = $self->_lookup_section($section);
789 if (!defined $section_ptr) {
790 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
791 return;
792 }
793
794 $section_ptr->{'metadata'} = [];
795}
796
797sub set_metadata_element {
798 my $self = shift (@_);
799 my ($section, $field, $value) = @_;
800
801 $self->set_utf8_metadata_element ($section, $field,
802 &unicode::ascii2utf8(\$value));
803}
804
805# set_utf8_metadata_element assumes the text has already been
806# converted to the UTF-8 encoding.
807sub set_utf8_metadata_element {
808 my $self = shift (@_);
809 my ($section, $field, $value) = @_;
810
811 $self->delete_metadata ($section, $field);
812 $self->add_utf8_metadata ($section, $field, $value);
813}
814
815
816# add_metadata assumes the text is in (extended) ascii form. For
817# text which has already been converted to the UTF-8 format use
818# add_utf8_metadata.
819sub add_metadata {
820 my $self = shift (@_);
821 my ($section, $field, $value) = @_;
822
823 $self->add_utf8_metadata ($section, $field,
824 &unicode::ascii2utf8(\$value));
825}
826
827sub add_utf8_metadata {
828 my $self = shift (@_);
829 my ($section, $field, $value) = @_;
830
831 my $section_ptr = $self->_lookup_section($section);
832 if (!defined $section_ptr) {
833 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
834 return;
835 }
836 if (!defined $value) {
837 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
838 return;
839 }
840 if (!defined $field) {
841 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
842 return;
843 }
844
845 #print STDERR "###$field=$value\n";
846 # double check that the value is utf-8
847 if (!&unicode::check_is_utf8($value)) {
848 print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
849 &unicode::ensure_utf8(\$value);
850 print STDERR " Tried converting to utf8: $value\n";
851 }
852
853 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
854}
855
856
857# methods for dealing with text
858
859# returns the text for a section
860sub get_text {
861 my $self = shift (@_);
862 my ($section) = @_;
863
864 my $section_ptr = $self->_lookup_section($section);
865 if (!defined $section_ptr) {
866 print STDERR "doc::get_text couldn't find section " .
867 "$section\n";
868 return "";
869 }
870
871 return $section_ptr->{'text'};
872}
873
874# returns the (utf-8 encoded) length of the text for a section
875sub get_text_length {
876 my $self = shift (@_);
877 my ($section) = @_;
878
879 my $section_ptr = $self->_lookup_section($section);
880 if (!defined $section_ptr) {
881 print STDERR "doc::get_text_length couldn't find section " .
882 "$section\n";
883 return 0;
884 }
885
886 return length ($section_ptr->{'text'});
887}
888
889sub delete_text {
890 my $self = shift (@_);
891 my ($section) = @_;
892
893 my $section_ptr = $self->_lookup_section($section);
894 if (!defined $section_ptr) {
895 print STDERR "doc::delete_text couldn't find section " .
896 "$section\n";
897 return;
898 }
899
900 $section_ptr->{'text'} = "";
901}
902
903# add_text assumes the text is in (extended) ascii form. For
904# text which has been already converted to the UTF-8 format
905# use add_utf8_text.
906sub add_text {
907 my $self = shift (@_);
908 my ($section, $text) = @_;
909
910 # convert the text to UTF-8 encoded unicode characters
911 # and add the text
912 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
913}
914
915
916# add_utf8_text assumes the text to be added has already
917# been converted to the UTF-8 encoding. For ascii text use
918# add_text
919sub add_utf8_text {
920 my $self = shift (@_);
921 my ($section, $text) = @_;
922
923 my $section_ptr = $self->_lookup_section($section);
924 if (!defined $section_ptr) {
925 print STDERR "doc::add_utf8_text couldn't find section " .
926 "$section\n";
927 return;
928 }
929
930 $section_ptr->{'text'} .= $text;
931}
932
933
934# methods for dealing with associated files
935
936# a file is associated with a document, NOT a section.
937# if section is defined it is noted in the data structure
938# only so that files associated from a particular section
939# may be removed later (using delete_section_assoc_files)
940sub associate_file {
941 my $self = shift (@_);
942 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
943 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
944
945 # remove all associated files with the same name
946 $self->delete_assoc_file ($assoc_filename);
947
948 push (@{$self->{'associated_files'}},
949 [$real_filename, $assoc_filename, $mime_type, $section]);
950}
951
952# returns a list of associated files in the form
953# [[real_filename, assoc_filename, mimetype], ...]
954sub get_assoc_files {
955 my $self = shift (@_);
956
957 return $self->{'associated_files'};
958}
959
960sub delete_section_assoc_files {
961 my $self = shift (@_);
962 my ($section) = @_;
963
964 my $i=0;
965 while ($i < scalar (@{$self->{'associated_files'}})) {
966 if (defined $self->{'associated_files'}->[$i]->[3] &&
967 $self->{'associated_files'}->[$i]->[3] eq $section) {
968 splice (@{$self->{'associated_files'}}, $i, 1);
969 } else {
970 $i++;
971 }
972 }
973}
974
975sub delete_assoc_file {
976 my $self = shift (@_);
977 my ($assoc_filename) = @_;
978
979 my $i=0;
980 while ($i < scalar (@{$self->{'associated_files'}})) {
981 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
982 splice (@{$self->{'associated_files'}}, $i, 1);
983 } else {
984 $i++;
985 }
986 }
987}
988
989sub reset_nextsection_ptr {
990 my $self = shift (@_);
991 my ($section) = @_;
992
993 my $section_ptr = $self->_lookup_section($section);
994 $section_ptr->{'next_subsection'} = 1;
995}
996
9971;
Note: See TracBrowser for help on using the repository browser.