source: gsdl/trunk/perllib/doc.pm@ 14966

Last change on this file since 14966 was 14966, checked in by davidb, 15 years ago

Code split up a bit more with whitespace

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 26.1 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use unicode;
37use util;
38use ghtml;
39use File::stat;
40##use hashdoc;
41use docprint;
42
43# the document type may be indexed_doc, nonindexed_doc, or
44# classification
45
46my $OIDcount = 0;
47
48sub new {
49 my $class = shift (@_);
50 my ($source_filename, $doc_type) = @_;
51
52
53 my $self = bless {'associated_files'=>[],
54 'subsection_order'=>[],
55 'next_subsection'=>1,
56 'subsections'=>{},
57 'metadata'=>[],
58 'text'=>"",
59 'OIDtype'=>"hash"}, $class;
60
61 # used to set lastmodified here, but this can screw up the HASH ids, so
62 # the docsave processor now calls set_lastmodified
63
64 $self->{'source_path'} = $source_filename;
65
66 if (defined $source_filename) {
67 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
68
69 if (defined $collect_dir) {
70 my $dirsep = &util::get_dirsep();
71 if ($collect_dir !~ m/$dirsep$/) {
72 $collect_dir .= $dirsep;
73 }
74
75 $collect_dir =~ s/\\/\\\\/g; # escape DOS style file separator
76
77 # if from within GSDLCOLLECTDIR, then remove directory prefix
78 # so source_filename is realative to it. This is done to aid
79 # portability, i.e. the collection can be moved to somewhere
80 # else on the file system and the archives directory will still
81 # work. This is needed, for example in the applet version of
82 # GLI where GSDLHOME/collect on the server will be different to
83 # the collect directory of the remove user. Of course,
84 # GSDLCOLLECTDIR subsequently needs to be put back on to turn
85 # it back into a full pathname.
86
87 if ($source_filename =~ /^$collect_dir(.*)$/) {
88 $source_filename = $1;
89 }
90 }
91
92 $self->set_source_filename ($source_filename);
93 }
94
95 $self->set_doc_type ($doc_type) if defined $doc_type;
96
97 return $self;
98}
99# set lastmodified for OAI purposes, added by GRB, moved by kjdon
100sub set_lastmodified {
101 my $self = shift (@_);
102
103 my $source_path = $self->{'source_path'};
104
105 if (defined $source_path && (-e $source_path)) {
106
107 my $file_stat = stat($source_path);
108 my $mtime = $file_stat->mtime;
109 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
110 }
111}
112
113# clone the $self object
114sub duplicate {
115 my $self = shift (@_);
116
117 my $newobj = {};
118
119 foreach my $k (keys %$self) {
120 $newobj->{$k} = &clone ($self->{$k});
121 }
122
123 bless $newobj, ref($self);
124 return $newobj;
125}
126
127sub clone {
128 my ($from) = @_;
129 my $type = ref ($from);
130
131 if ($type eq "HASH") {
132 my $to = {};
133 foreach my $key (keys %$from) {
134 $to->{$key} = &clone ($from->{$key});
135 }
136 return $to;
137 } elsif ($type eq "ARRAY") {
138 my $to = [];
139 foreach my $v (@$from) {
140 push (@$to, &clone ($v));
141 }
142 return $to;
143 } else {
144 return $from;
145 }
146}
147
148sub set_OIDtype {
149 my $self = shift (@_);
150 my ($type, $metadata) = @_;
151
152 if (defined $type && $type =~ /^(hash|incremental|dirname|assigned)$/) {
153 $self->{'OIDtype'} = $type;
154 } else {
155 $self->{'OIDtype'} = "hash";
156 }
157 if ($type =~ /^assigned$/) {
158 if (defined $metadata) {
159 $self->{'OIDmetadata'} = $metadata;
160 } else {
161 $self->{'OIDmetadata'} = "dc.Identifier";
162 }
163 }
164}
165
166sub set_source_filename {
167 my $self = shift (@_);
168 my ($source_filename) = @_;
169
170 $self->set_metadata_element ($self->get_top_section(),
171 "gsdlsourcefilename",
172 $source_filename);
173}
174
175sub set_converted_filename {
176 my $self = shift (@_);
177 my ($converted_filename) = @_;
178
179 $self->set_metadata_element ($self->get_top_section(),
180 "gsdlconvertedfilename",
181 $converted_filename);
182}
183
184
185# returns the source_filename as it was provided
186sub get_source_filename {
187 my $self = shift (@_);
188
189 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
190}
191
192# returns converted filename if available else returns source filename
193sub get_filename_for_hashing {
194 my $self = shift (@_);
195
196 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
197
198 if (!defined $filename) {
199 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
200 # if NULPlug processed file, then don't give a filename
201 if (defined $plugin_name && $plugin_name eq "NULPlug") {
202 $filename = undef;
203 } else {
204 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
205 }
206 }
207 return $filename;
208}
209
210sub set_doc_type {
211 my $self = shift (@_);
212 my ($doc_type) = @_;
213
214 $self->set_metadata_element ($self->get_top_section(),
215 "gsdldoctype",
216 $doc_type);
217}
218
219# returns the gsdldoctype as it was provided
220# the default of "indexed_doc" is used if no document
221# type was provided
222sub get_doc_type {
223 my $self = shift (@_);
224
225 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
226 return $doc_type if (defined $doc_type);
227 return "indexed_doc";
228}
229
230
231# look up the reference to the a particular section
232sub _lookup_section {
233 my $self = shift (@_);
234 my ($section) = @_;
235
236 my ($num);
237 my $sectionref = $self;
238
239 while (defined $section && $section ne "") {
240
241 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
242
243 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
244
245 $section = "" unless defined $section;
246
247
248 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
249 $sectionref = $sectionref->{'subsections'}->{$num};
250 } else {
251 return undef;
252 }
253 }
254
255 return $sectionref;
256}
257
258# calculate OID by hashing the contents of the document
259sub _calc_OID {
260 my $self = shift (@_);
261 my ($filename) = @_;
262
263 my $osexe = &util::get_os_exe();
264
265 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
266 $ENV{'GSDLOS'},"hashfile$osexe");
267
268 my $result = "NULL";
269
270 if (-e "$hashfile_exe") {
271# $result = `\"$hashfile_exe\" \"$filename\"`;
272 $result = `hashfile$osexe \"$filename\"`;
273 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
274
275 } else {
276 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
277 }
278 return "HASH$result";
279}
280
281# methods dealing with OID, not groups of them.
282
283# if $OID is not provided one is calculated
284sub set_OID {
285 my $self = shift (@_);
286 my ($OID) = @_;
287
288 my $use_hash_oid = 0;
289 # if an OID wasn't provided claculate one
290 if (!defined $OID) {
291 $OID = "NULL";
292 if ($self->{'OIDtype'} eq "hash") {
293 $use_hash_oid = 1;
294 } elsif ($self->{'OIDtype'} eq "incremental") {
295 $OID = "D" . $OIDcount;
296 $OIDcount ++;
297
298 } elsif ($self->{'OIDtype'} eq "dirname") {
299 $OID = 'J';
300 my $filename = $self->get_source_filename();
301 if (defined($filename)) { # && -e $filename) {
302 $OID = &File::Basename::dirname($filename);
303 if (defined $OID) {
304 $OID = 'J'.&File::Basename::basename($OID);
305 $OID =~ s/\.//; #remove any periods
306 } else {
307 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
308 $use_hash_oid = 1;
309 }
310 } else {
311 print STDERR "Failed to find filename, generating hash id\n";
312 $use_hash_oid = 1;
313 }
314
315 } elsif ($self->{'OIDtype'} eq "assigned") {
316 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
317 if (defined $identifier && $identifier ne "") {
318 $OID = "D" . $identifier;
319 $OID =~ s/\.//; #remove any periods
320 } else {
321 # need a hash id
322 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
323 $use_hash_oid = 1;
324 }
325
326 } else {
327 $use_hash_oid = 1;
328 }
329
330 if ($use_hash_oid) {
331
332 # "hash" OID - feed file to hashfile.exe
333 #my $filename = $self->get_source_filename();
334 # we want to use the converted file for hashing if available
335 # cos its quicker
336 my $filename = $self->get_filename_for_hashing();
337 # -z: don't want to hash on the file if it is zero size
338 if (defined($filename) && -e $filename && !-z $filename) {
339 $OID = $self->_calc_OID ($filename);
340 } else {
341 $filename = &util::get_tmp_filename();
342 if (!open (OUTFILE, ">$filename")) {
343 print STDERR "doc::set_OID could not write to $filename\n";
344 } else {
345 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
346 print OUTFILE $doc_text;
347 close (OUTFILE);
348 }
349 $OID = $self->_calc_OID ($filename);
350 &util::rm ($filename);
351 }
352 }
353 }
354 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
355}
356
357# this uses hashdoc (embedded c thingy) which is faster but still
358# needs a little work to be suffiently stable
359sub ___set_OID {
360 my $self = shift (@_);
361 my ($OID) = @_;
362
363 # if an OID wasn't provided then calculate hash value based on document
364 if (!defined $OID)
365 {
366 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
367 my $hash_len = length($hash_text);
368
369 $OID = &hashdoc::buffer($hash_text,$hash_len);
370 }
371
372 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
373}
374
375# returns the OID for this document
376sub get_OID {
377 my $self = shift (@_);
378 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
379 return $OID if (defined $OID);
380 return "NULL";
381}
382
383sub delete_OID {
384 my $self = shift (@_);
385
386 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
387}
388
389
390# methods for manipulating section names
391
392# returns the name of the top-most section (the top
393# level of the document
394sub get_top_section {
395 my $self = shift (@_);
396
397 return "";
398}
399
400# returns a section
401sub get_parent_section {
402 my $self = shift (@_);
403 my ($section) = @_;
404
405 $section =~ s/(^|\.)\d+$//;
406
407 return $section;
408}
409
410# returns the first child section (or the end child
411# if there isn't any)
412sub get_begin_child {
413 my $self = shift (@_);
414 my ($section) = @_;
415
416 my $section_ptr = $self->_lookup_section($section);
417 return "" unless defined $section_ptr;
418
419 if (defined $section_ptr->{'subsection_order'}->[0]) {
420 return "$section.$section_ptr->{'subsection_order'}->[0]";
421 }
422
423 return $self->get_end_child ($section);
424}
425
426# returns the next child of a parent section
427sub get_next_child {
428 my $self = shift (@_);
429 my ($section) = @_;
430
431 my $parent_section = $self->get_parent_section($section);
432 my $parent_section_ptr = $self->_lookup_section($parent_section);
433 return undef unless defined $parent_section_ptr;
434
435 my ($section_num) = $section =~ /(\d+)$/;
436 return undef unless defined $section_num;
437
438 my $i = 0;
439 my $section_order = $parent_section_ptr->{'subsection_order'};
440 while ($i < scalar(@$section_order)) {
441 last if $section_order->[$i] eq $section_num;
442 $i++;
443 }
444
445 $i++; # the next child
446 if ($i < scalar(@$section_order)) {
447 return $section_order->[$i] if $parent_section eq "";
448 return "$parent_section.$section_order->[$i]";
449 }
450
451 # no more sections in this level
452 return undef;
453}
454
455# returns a reference to a list of children
456sub get_children {
457 my $self = shift (@_);
458 my ($section) = @_;
459
460 my $section_ptr = $self->_lookup_section($section);
461 return [] unless defined $section_ptr;
462
463 my @children = @{$section_ptr->{'subsection_order'}};
464
465 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
466 return \@children;
467}
468
469# returns the child section one past the last one (which
470# is coded as "0")
471sub get_end_child {
472 my $self = shift (@_);
473 my ($section) = @_;
474
475 return $section . ".0" unless $section eq "";
476 return "0";
477}
478
479# returns the next section in book order
480sub get_next_section {
481 my $self = shift (@_);
482 my ($section) = @_;
483
484 return undef unless defined $section;
485
486 my $section_ptr = $self->_lookup_section($section);
487 return undef unless defined $section_ptr;
488
489 # first try to find first child
490 if (defined $section_ptr->{'subsection_order'}->[0]) {
491 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
492 return "$section.$section_ptr->{'subsection_order'}->[0]";
493 }
494
495 do {
496 # try to find sibling
497 my $next_child = $self->get_next_child ($section);
498 return $next_child if (defined $next_child);
499
500 # move up one level
501 $section = $self->get_parent_section ($section);
502 } while $section =~ /\d/;
503
504 return undef;
505}
506
507sub is_leaf_section {
508 my $self = shift (@_);
509 my ($section) = @_;
510
511 my $section_ptr = $self->_lookup_section($section);
512 return 1 unless defined $section_ptr;
513
514 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
515}
516
517# methods for dealing with sections
518
519# returns the name of the inserted section
520sub insert_section {
521 my $self = shift (@_);
522 my ($before_section) = @_;
523
524 # get the child to insert before and its parent section
525 my $parent_section = "";
526 my $before_child = "0";
527 my @before_section = split (/\./, $before_section);
528 if (scalar(@before_section) > 0) {
529 $before_child = pop (@before_section);
530 $parent_section = join (".", @before_section);
531 }
532
533 my $parent_section_ptr = $self->_lookup_section($parent_section);
534 if (!defined $parent_section_ptr) {
535 print STDERR "doc::insert_section couldn't find parent section " .
536 "$parent_section\n";
537 return;
538 }
539
540 # get the next section number
541 my $section_num = $parent_section_ptr->{'next_subsection'}++;
542
543 my $i = 0;
544 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
545 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
546 $i++;
547 }
548
549 # insert the section number into the order list
550 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
551
552 # add this section to the parent section
553 my $section_ptr = {'subsection_order'=>[],
554 'next_subsection'=>1,
555 'subsections'=>{},
556 'metadata'=>[],
557 'text'=>""};
558 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
559
560 # work out the full section number
561 my $section = $parent_section;
562 $section .= "." unless $section eq "";
563 $section .= $section_num;
564
565 return $section;
566}
567
568# creates a pre-named section
569sub create_named_section {
570 my $self = shift (@_);
571 my ($mastersection) = @_;
572
573 my ($num);
574 my $section = $mastersection;
575 my $sectionref = $self;
576
577 while ($section ne "") {
578 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
579 $num =~ s/^0+(\d)/$1/; # remove leading 0s
580 $section = "" unless defined $section;
581
582 if (defined $num) {
583 if (!defined $sectionref->{'subsections'}->{$num}) {
584 push (@{$sectionref->{'subsection_order'}}, $num);
585 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
586 'next_subsection'=>1,
587 'subsections'=>{},
588 'metadata'=>[],
589 'text'=>""};
590 if ($num >= $sectionref->{'next_subsection'}) {
591 $sectionref->{'next_subsection'} = $num + 1;
592 }
593 }
594 $sectionref = $sectionref->{'subsections'}->{$num};
595
596 } else {
597 print STDERR "doc::create_named_section couldn't create section ";
598 print STDERR "$mastersection\n";
599 last;
600 }
601 }
602}
603
604# returns a reference to a list of subsections
605sub list_subsections {
606 my $self = shift (@_);
607 my ($section) = @_;
608
609 my $section_ptr = $self->_lookup_section ($section);
610 if (!defined $section_ptr) {
611 print STDERR "doc::list_subsections couldn't find section $section\n";
612 return [];
613 }
614
615 return [@{$section_ptr->{'subsection_order'}}];
616}
617
618sub delete_section {
619 my $self = shift (@_);
620 my ($section) = @_;
621
622# my $section_ptr = {'subsection_order'=>[],
623# 'next_subsection'=>1,
624# 'subsections'=>{},
625# 'metadata'=>[],
626# 'text'=>""};
627
628 # if this is the top section reset everything
629 if ($section eq "") {
630 $self->{'subsection_order'} = [];
631 $self->{'subsections'} = {};
632 $self->{'metadata'} = [];
633 $self->{'text'} = "";
634 return;
635 }
636
637 # find the parent of the section to delete
638 my $parent_section = "";
639 my $child = "0";
640 my @section = split (/\./, $section);
641 if (scalar(@section) > 0) {
642 $child = pop (@section);
643 $parent_section = join (".", @section);
644 }
645
646 my $parent_section_ptr = $self->_lookup_section($parent_section);
647 if (!defined $parent_section_ptr) {
648 print STDERR "doc::delete_section couldn't find parent section " .
649 "$parent_section\n";
650 return;
651 }
652
653 # remove this section from the subsection_order list
654 my $i = 0;
655 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
656 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
657 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
658 last;
659 }
660 $i++;
661 }
662
663 # remove this section from the subsection hash
664 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
665 undef $parent_section_ptr->{'subsections'}->{$child};
666 }
667}
668
669#--
670# methods for dealing with metadata
671
672# set_metadata_element and get_metadata_element are for metadata
673# which should only have one value. add_meta_data and get_metadata
674# are for metadata which can have more than one value.
675
676# returns the first metadata value which matches field
677
678# This version of get metadata element works much like the one above,
679# except it allows for the namespace portion of a metadata element to
680# be ignored, thus if you are searching for dc.Title, the first piece
681# of matching metadata ending with the name Title (once any namespace
682# is removed) would be returned.
683# 28-11-2003 John Thompson
684sub get_metadata_element {
685 my $self = shift (@_);
686 my ($section, $field, $ignore_namespace) = @_;
687 my ($data);
688
689 $ignore_namespace = 0 unless defined $ignore_namespace;
690
691 my $section_ptr = $self->_lookup_section($section);
692 if (!defined $section_ptr) {
693 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
694 return;
695 }
696
697 # Remove the any namespace if we are being told to ignore them
698 if($ignore_namespace) {
699 $field =~ s/^\w*\.//;
700 }
701
702 foreach $data (@{$section_ptr->{'metadata'}}) {
703
704 my $data_name = $data->[0];
705
706 # Remove the any namespace if we are being told to ignore them
707 if($ignore_namespace) {
708 $data_name =~ s/^\w*\.//;
709 }
710
711 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
712 }
713
714 return undef; # was not found
715}
716
717# returns a list of the form [value1, value2, ...]
718sub get_metadata {
719 my $self = shift (@_);
720 my ($section, $field, $ignore_namespace) = @_;
721 my ($data);
722
723 $ignore_namespace = 0 unless defined $ignore_namespace;
724
725 my $section_ptr = $self->_lookup_section($section);
726 if (!defined $section_ptr) {
727 print STDERR "doc::get_metadata couldn't find section ",
728 $section, "\n";
729 return;
730 }
731
732 # Remove the any namespace if we are being told to ignore them
733 if($ignore_namespace) {
734 $field =~ s/^\w*\.//;
735 }
736
737 my @metadata = ();
738 foreach $data (@{$section_ptr->{'metadata'}}) {
739
740 my $data_name = $data->[0];
741 # Remove the any namespace if we are being told to ignore them
742 if($ignore_namespace) {
743 $data_name =~ s/^\w*\.//;
744 }
745
746 push (@metadata, $data->[1]) if ($data_name eq $field);
747 }
748
749 return \@metadata;
750}
751
752# returns a list of the form [[field,value],[field,value],...]
753sub get_all_metadata {
754 my $self = shift (@_);
755 my ($section) = @_;
756
757 my $section_ptr = $self->_lookup_section($section);
758 if (!defined $section_ptr) {
759 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
760 return;
761 }
762
763 return $section_ptr->{'metadata'};
764}
765
766# $value is optional
767sub delete_metadata {
768 my $self = shift (@_);
769 my ($section, $field, $value) = @_;
770
771 my $section_ptr = $self->_lookup_section($section);
772 if (!defined $section_ptr) {
773 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
774 return;
775 }
776
777 my $i = 0;
778 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
779 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
780 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
781 splice (@{$section_ptr->{'metadata'}}, $i, 1);
782 } else {
783 $i++;
784 }
785 }
786}
787
788sub delete_all_metadata {
789 my $self = shift (@_);
790 my ($section) = @_;
791
792 my $section_ptr = $self->_lookup_section($section);
793 if (!defined $section_ptr) {
794 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
795 return;
796 }
797
798 $section_ptr->{'metadata'} = [];
799}
800
801sub set_metadata_element {
802 my $self = shift (@_);
803 my ($section, $field, $value) = @_;
804
805 $self->set_utf8_metadata_element ($section, $field,
806 &unicode::ascii2utf8(\$value));
807}
808
809# set_utf8_metadata_element assumes the text has already been
810# converted to the UTF-8 encoding.
811sub set_utf8_metadata_element {
812 my $self = shift (@_);
813 my ($section, $field, $value) = @_;
814
815 $self->delete_metadata ($section, $field);
816 $self->add_utf8_metadata ($section, $field, $value);
817}
818
819
820# add_metadata assumes the text is in (extended) ascii form. For
821# text which has already been converted to the UTF-8 format use
822# add_utf8_metadata.
823sub add_metadata {
824 my $self = shift (@_);
825 my ($section, $field, $value) = @_;
826
827 $self->add_utf8_metadata ($section, $field,
828 &unicode::ascii2utf8(\$value));
829}
830
831sub add_utf8_metadata {
832 my $self = shift (@_);
833 my ($section, $field, $value) = @_;
834
835 my $section_ptr = $self->_lookup_section($section);
836 if (!defined $section_ptr) {
837 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
838 return;
839 }
840 if (!defined $value) {
841 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
842 return;
843 }
844 if (!defined $field) {
845 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
846 return;
847 }
848
849 #print STDERR "###$field=$value\n";
850 # double check that the value is utf-8
851 if (unicode::ensure_utf8(\$value)) {
852 print STDERR "doc::add_utf8_metadata: warning: '$field' wasn't utf8\n";
853 }
854
855 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
856}
857
858
859# methods for dealing with text
860
861# returns the text for a section
862sub get_text {
863 my $self = shift (@_);
864 my ($section) = @_;
865
866 my $section_ptr = $self->_lookup_section($section);
867 if (!defined $section_ptr) {
868 print STDERR "doc::get_text couldn't find section " .
869 "$section\n";
870 return "";
871 }
872
873 return $section_ptr->{'text'};
874}
875
876# returns the (utf-8 encoded) length of the text for a section
877sub get_text_length {
878 my $self = shift (@_);
879 my ($section) = @_;
880
881 my $section_ptr = $self->_lookup_section($section);
882 if (!defined $section_ptr) {
883 print STDERR "doc::get_text_length couldn't find section " .
884 "$section\n";
885 return 0;
886 }
887
888 return length ($section_ptr->{'text'});
889}
890
891sub delete_text {
892 my $self = shift (@_);
893 my ($section) = @_;
894
895 my $section_ptr = $self->_lookup_section($section);
896 if (!defined $section_ptr) {
897 print STDERR "doc::delete_text couldn't find section " .
898 "$section\n";
899 return;
900 }
901
902 $section_ptr->{'text'} = "";
903}
904
905# add_text assumes the text is in (extended) ascii form. For
906# text which has been already converted to the UTF-8 format
907# use add_utf8_text.
908sub add_text {
909 my $self = shift (@_);
910 my ($section, $text) = @_;
911
912 # convert the text to UTF-8 encoded unicode characters
913 # and add the text
914 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
915}
916
917
918# add_utf8_text assumes the text to be added has already
919# been converted to the UTF-8 encoding. For ascii text use
920# add_text
921sub add_utf8_text {
922 my $self = shift (@_);
923 my ($section, $text) = @_;
924
925 my $section_ptr = $self->_lookup_section($section);
926 if (!defined $section_ptr) {
927 print STDERR "doc::add_utf8_text couldn't find section " .
928 "$section\n";
929 return;
930 }
931
932 $section_ptr->{'text'} .= $text;
933}
934
935
936# methods for dealing with associated files
937
938# a file is associated with a document, NOT a section.
939# if section is defined it is noted in the data structure
940# only so that files associated from a particular section
941# may be removed later (using delete_section_assoc_files)
942sub associate_file {
943 my $self = shift (@_);
944 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
945 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
946
947 # remove all associated files with the same name
948 $self->delete_assoc_file ($assoc_filename);
949
950 push (@{$self->{'associated_files'}},
951 [$real_filename, $assoc_filename, $mime_type, $section]);
952}
953
954# returns a list of associated files in the form
955# [[real_filename, assoc_filename, mimetype], ...]
956sub get_assoc_files {
957 my $self = shift (@_);
958
959 return $self->{'associated_files'};
960}
961
962sub delete_section_assoc_files {
963 my $self = shift (@_);
964 my ($section) = @_;
965
966 my $i=0;
967 while ($i < scalar (@{$self->{'associated_files'}})) {
968 if (defined $self->{'associated_files'}->[$i]->[3] &&
969 $self->{'associated_files'}->[$i]->[3] eq $section) {
970 splice (@{$self->{'associated_files'}}, $i, 1);
971 } else {
972 $i++;
973 }
974 }
975}
976
977sub delete_assoc_file {
978 my $self = shift (@_);
979 my ($assoc_filename) = @_;
980
981 my $i=0;
982 while ($i < scalar (@{$self->{'associated_files'}})) {
983 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
984 splice (@{$self->{'associated_files'}}, $i, 1);
985 } else {
986 $i++;
987 }
988 }
989}
990
991sub reset_nextsection_ptr {
992 my $self = shift (@_);
993 my ($section) = @_;
994
995 my $section_ptr = $self->_lookup_section($section);
996 $section_ptr->{'next_subsection'} = 1;
997}
998
9991;
Note: See TracBrowser for help on using the repository browser.