source: gsdl/trunk/perllib/doc.pm@ 16934

Last change on this file since 16934 was 16934, checked in by ak19, 16 years ago

Removed commented out debug statements

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 27.2 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47my $OIDcount = 0;
48
49sub new {
50 my $class = shift (@_);
51 my ($source_filename, $doc_type) = @_;
52
53
54 my $self = bless {'associated_files'=>[],
55 'subsection_order'=>[],
56 'next_subsection'=>1,
57 'subsections'=>{},
58 'metadata'=>[],
59 'text'=>"",
60 'OIDtype'=>"hash"}, $class;
61
62 # used to set lastmodified here, but this can screw up the HASH ids, so
63 # the docsave processor now calls set_lastmodified
64
65 $self->{'source_path'} = $source_filename;
66
67 if (defined $source_filename) {
68 $source_filename = &util::filename_within_collection($source_filename);
69 $self->set_source_filename ($source_filename);
70 }
71
72 $self->set_doc_type ($doc_type) if defined $doc_type;
73
74 return $self;
75}
76# set lastmodified for OAI purposes, added by GRB, moved by kjdon
77sub set_lastmodified {
78 my $self = shift (@_);
79
80 my $source_path = $self->{'source_path'};
81
82 if (defined $source_path && (-e $source_path)) {
83
84 my $file_stat = stat($source_path);
85 my $mtime = $file_stat->mtime;
86 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
87 }
88}
89
90# clone the $self object
91sub duplicate {
92 my $self = shift (@_);
93
94 my $newobj = {};
95
96 foreach my $k (keys %$self) {
97 $newobj->{$k} = &clone ($self->{$k});
98 }
99
100 bless $newobj, ref($self);
101 return $newobj;
102}
103
104sub clone {
105 my ($from) = @_;
106 my $type = ref ($from);
107
108 if ($type eq "HASH") {
109 my $to = {};
110 foreach my $key (keys %$from) {
111 $to->{$key} = &clone ($from->{$key});
112 }
113 return $to;
114 } elsif ($type eq "ARRAY") {
115 my $to = [];
116 foreach my $v (@$from) {
117 push (@$to, &clone ($v));
118 }
119 return $to;
120 } else {
121 return $from;
122 }
123}
124
125sub set_OIDtype {
126 my $self = shift (@_);
127 my ($type, $metadata) = @_;
128
129 if (defined $type && $type =~ /^(hash|incremental|dirname|assigned)$/) {
130 $self->{'OIDtype'} = $type;
131 } else {
132 $self->{'OIDtype'} = "hash";
133 }
134
135 if ($type =~ /^assigned$/) {
136 if (defined $metadata) {
137 $self->{'OIDmetadata'} = $metadata;
138 } else {
139 $self->{'OIDmetadata'} = "dc.Identifier";
140 }
141 }
142}
143
144sub set_source_filename {
145 my $self = shift (@_);
146 my ($source_filename) = @_;
147
148 # Since the gsdlsourcefilename element goes into the doc.xml it has
149 # to be utf8. However, it should also *represent* the source filename
150 # (in the import directory) which may not be utf8 at all.
151 # For instance, if this meta element (gsdlsourcefilename) will be
152 # used by other applications that parse doc.xml in order to locate
153 # gsdlsourcefilename. Therefore, the solution is to URLencode the real
154 # filename as this is a binary-to-text encoding meaning that the
155 # resulting string is ASCII (utf8).
156
157# print STDERR "******URLencoding the gsdl_source_filename $source_filename ";
158
159 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
160 my ($srcfilename,$dirname,$suffix)
161 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
162# print STDERR "-> $srcfilename -> ";
163 $srcfilename = &unicode::url_encode($srcfilename.$suffix);
164 $source_filename = &util::filename_cat($dirname, $srcfilename);
165# print STDERR "$source_filename\n";
166
167 $self->set_utf8_metadata_element ($self->get_top_section(),
168 "gsdlsourcefilename",
169 $source_filename);
170}
171
172sub set_converted_filename {
173 my $self = shift (@_);
174 my ($converted_filename) = @_;
175
176 # we know the converted filename is utf8
177 $self->set_utf8_metadata_element ($self->get_top_section(),
178 "gsdlconvertedfilename",
179 $converted_filename);
180}
181
182
183# returns the source_filename as it was provided
184sub get_source_filename {
185 my $self = shift (@_);
186
187 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
188}
189
190# returns converted filename if available else returns source filename
191sub get_filename_for_hashing {
192 my $self = shift (@_);
193
194 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
195
196 if (!defined $filename) {
197 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
198 # if NULPlug processed file, then don't give a filename
199 if (defined $plugin_name && $plugin_name eq "NULPlug") {
200 $filename = undef;
201 } else { # returns the URL encoded source filename!
202 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
203 }
204 }
205 return $filename;
206}
207
208sub set_doc_type {
209 my $self = shift (@_);
210 my ($doc_type) = @_;
211
212 $self->set_metadata_element ($self->get_top_section(),
213 "gsdldoctype",
214 $doc_type);
215}
216
217# returns the gsdldoctype as it was provided
218# the default of "indexed_doc" is used if no document
219# type was provided
220sub get_doc_type {
221 my $self = shift (@_);
222
223 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
224 return $doc_type if (defined $doc_type);
225 return "indexed_doc";
226}
227
228
229# look up the reference to the a particular section
230sub _lookup_section {
231 my $self = shift (@_);
232 my ($section) = @_;
233
234 my ($num);
235 my $sectionref = $self;
236
237 while (defined $section && $section ne "") {
238
239 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
240
241 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
242
243 $section = "" unless defined $section;
244
245
246 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
247 $sectionref = $sectionref->{'subsections'}->{$num};
248 } else {
249 return undef;
250 }
251 }
252
253 return $sectionref;
254}
255
256# calculate OID by hashing the contents of the document
257sub _calc_OID {
258 my $self = shift (@_);
259 my ($filename) = @_;
260
261 my $osexe = &util::get_os_exe();
262
263 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
264 $ENV{'GSDLOS'},"hashfile$osexe");
265
266 my $result = "NULL";
267
268 if (-e "$hashfile_exe") {
269# $result = `\"$hashfile_exe\" \"$filename\"`;
270 $result = `hashfile$osexe \"$filename\"`;
271 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
272 } else {
273 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
274 }
275 return "HASH$result";
276}
277
278# methods dealing with OID, not groups of them.
279
280# if $OID is not provided one is calculated
281sub set_OID {
282 my $self = shift (@_);
283 my ($OID) = @_;
284
285 my $use_hash_oid = 0;
286 # if an OID wasn't provided claculate one
287 if (!defined $OID) {
288 $OID = "NULL";
289 if ($self->{'OIDtype'} eq "hash") {
290 $use_hash_oid = 1;
291 } elsif ($self->{'OIDtype'} eq "incremental") {
292 $OID = "D" . $OIDcount;
293 $OIDcount ++;
294
295 } elsif ($self->{'OIDtype'} eq "dirname") {
296 $OID = 'J';
297 my $filename = $self->get_source_filename();
298 if (defined($filename)) { # && -e $filename) {
299 $OID = &File::Basename::dirname($filename);
300 if (defined $OID) {
301 $OID = 'J'.&File::Basename::basename($OID);
302 $OID =~ s/\.//; #remove any periods
303 } else {
304 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
305 $use_hash_oid = 1;
306 }
307 } else {
308 print STDERR "Failed to find filename, generating hash id\n";
309 $use_hash_oid = 1;
310 }
311
312 } elsif ($self->{'OIDtype'} eq "assigned") {
313 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
314 if (defined $identifier && $identifier ne "") {
315 $OID = "D" . $identifier;
316 $OID =~ s/\.//; #remove any periods
317 } else {
318 # need a hash id
319 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
320 $use_hash_oid = 1;
321 }
322
323 } else {
324 $use_hash_oid = 1;
325 }
326
327 if ($use_hash_oid) {
328
329 # "hash" OID - feed file to hashfile.exe
330 #my $filename = $self->get_source_filename();
331 # we want to use the converted file for hashing if available
332 # cos its quicker
333 my $filename = $self->get_filename_for_hashing();
334 # -z: don't want to hash on the file if it is zero size
335 if (defined($filename) && -e $filename && !-z $filename) {
336 $OID = $self->_calc_OID ($filename);
337 } else {
338 $filename = &util::get_tmp_filename();
339 if (!open (OUTFILE, ">$filename")) {
340 print STDERR "doc::set_OID could not write to $filename\n";
341 } else {
342 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
343 print OUTFILE $doc_text;
344 close (OUTFILE);
345 }
346 $OID = $self->_calc_OID ($filename);
347 &util::rm ($filename);
348 }
349 }
350 }
351 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
352}
353
354# this uses hashdoc (embedded c thingy) which is faster but still
355# needs a little work to be suffiently stable
356sub ___set_OID {
357 my $self = shift (@_);
358 my ($OID) = @_;
359
360 # if an OID wasn't provided then calculate hash value based on document
361 if (!defined $OID)
362 {
363 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
364 my $hash_len = length($hash_text);
365
366 $OID = &hashdoc::buffer($hash_text,$hash_len);
367 }
368
369 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
370}
371
372# returns the OID for this document
373sub get_OID {
374 my $self = shift (@_);
375 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
376 return $OID if (defined $OID);
377 return "NULL";
378}
379
380sub delete_OID {
381 my $self = shift (@_);
382
383 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
384}
385
386
387# methods for manipulating section names
388
389# returns the name of the top-most section (the top
390# level of the document
391sub get_top_section {
392 my $self = shift (@_);
393
394 return "";
395}
396
397# returns a section
398sub get_parent_section {
399 my $self = shift (@_);
400 my ($section) = @_;
401
402 $section =~ s/(^|\.)\d+$//;
403
404 return $section;
405}
406
407# returns the first child section (or the end child
408# if there isn't any)
409sub get_begin_child {
410 my $self = shift (@_);
411 my ($section) = @_;
412
413 my $section_ptr = $self->_lookup_section($section);
414 return "" unless defined $section_ptr;
415
416 if (defined $section_ptr->{'subsection_order'}->[0]) {
417 return "$section.$section_ptr->{'subsection_order'}->[0]";
418 }
419
420 return $self->get_end_child ($section);
421}
422
423# returns the next child of a parent section
424sub get_next_child {
425 my $self = shift (@_);
426 my ($section) = @_;
427
428 my $parent_section = $self->get_parent_section($section);
429 my $parent_section_ptr = $self->_lookup_section($parent_section);
430 return undef unless defined $parent_section_ptr;
431
432 my ($section_num) = $section =~ /(\d+)$/;
433 return undef unless defined $section_num;
434
435 my $i = 0;
436 my $section_order = $parent_section_ptr->{'subsection_order'};
437 while ($i < scalar(@$section_order)) {
438 last if $section_order->[$i] eq $section_num;
439 $i++;
440 }
441
442 $i++; # the next child
443 if ($i < scalar(@$section_order)) {
444 return $section_order->[$i] if $parent_section eq "";
445 return "$parent_section.$section_order->[$i]";
446 }
447
448 # no more sections in this level
449 return undef;
450}
451
452# returns a reference to a list of children
453sub get_children {
454 my $self = shift (@_);
455 my ($section) = @_;
456
457 my $section_ptr = $self->_lookup_section($section);
458 return [] unless defined $section_ptr;
459
460 my @children = @{$section_ptr->{'subsection_order'}};
461
462 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
463 return \@children;
464}
465
466# returns the child section one past the last one (which
467# is coded as "0")
468sub get_end_child {
469 my $self = shift (@_);
470 my ($section) = @_;
471
472 return $section . ".0" unless $section eq "";
473 return "0";
474}
475
476# returns the next section in book order
477sub get_next_section {
478 my $self = shift (@_);
479 my ($section) = @_;
480
481 return undef unless defined $section;
482
483 my $section_ptr = $self->_lookup_section($section);
484 return undef unless defined $section_ptr;
485
486 # first try to find first child
487 if (defined $section_ptr->{'subsection_order'}->[0]) {
488 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
489 return "$section.$section_ptr->{'subsection_order'}->[0]";
490 }
491
492 do {
493 # try to find sibling
494 my $next_child = $self->get_next_child ($section);
495 return $next_child if (defined $next_child);
496
497 # move up one level
498 $section = $self->get_parent_section ($section);
499 } while $section =~ /\d/;
500
501 return undef;
502}
503
504sub is_leaf_section {
505 my $self = shift (@_);
506 my ($section) = @_;
507
508 my $section_ptr = $self->_lookup_section($section);
509 return 1 unless defined $section_ptr;
510
511 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
512}
513
514# methods for dealing with sections
515
516# returns the name of the inserted section
517sub insert_section {
518 my $self = shift (@_);
519 my ($before_section) = @_;
520
521 # get the child to insert before and its parent section
522 my $parent_section = "";
523 my $before_child = "0";
524 my @before_section = split (/\./, $before_section);
525 if (scalar(@before_section) > 0) {
526 $before_child = pop (@before_section);
527 $parent_section = join (".", @before_section);
528 }
529
530 my $parent_section_ptr = $self->_lookup_section($parent_section);
531 if (!defined $parent_section_ptr) {
532 print STDERR "doc::insert_section couldn't find parent section " .
533 "$parent_section\n";
534 return;
535 }
536
537 # get the next section number
538 my $section_num = $parent_section_ptr->{'next_subsection'}++;
539
540 my $i = 0;
541 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
542 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
543 $i++;
544 }
545
546 # insert the section number into the order list
547 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
548
549 # add this section to the parent section
550 my $section_ptr = {'subsection_order'=>[],
551 'next_subsection'=>1,
552 'subsections'=>{},
553 'metadata'=>[],
554 'text'=>""};
555 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
556
557 # work out the full section number
558 my $section = $parent_section;
559 $section .= "." unless $section eq "";
560 $section .= $section_num;
561
562 return $section;
563}
564
565# creates a pre-named section
566sub create_named_section {
567 my $self = shift (@_);
568 my ($mastersection) = @_;
569
570 my ($num);
571 my $section = $mastersection;
572 my $sectionref = $self;
573
574 while ($section ne "") {
575 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
576 $num =~ s/^0+(\d)/$1/; # remove leading 0s
577 $section = "" unless defined $section;
578
579 if (defined $num) {
580 if (!defined $sectionref->{'subsections'}->{$num}) {
581 push (@{$sectionref->{'subsection_order'}}, $num);
582 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
583 'next_subsection'=>1,
584 'subsections'=>{},
585 'metadata'=>[],
586 'text'=>""};
587 if ($num >= $sectionref->{'next_subsection'}) {
588 $sectionref->{'next_subsection'} = $num + 1;
589 }
590 }
591 $sectionref = $sectionref->{'subsections'}->{$num};
592
593 } else {
594 print STDERR "doc::create_named_section couldn't create section ";
595 print STDERR "$mastersection\n";
596 last;
597 }
598 }
599}
600
601# returns a reference to a list of subsections
602sub list_subsections {
603 my $self = shift (@_);
604 my ($section) = @_;
605
606 my $section_ptr = $self->_lookup_section ($section);
607 if (!defined $section_ptr) {
608 print STDERR "doc::list_subsections couldn't find section $section\n";
609 return [];
610 }
611
612 return [@{$section_ptr->{'subsection_order'}}];
613}
614
615sub delete_section {
616 my $self = shift (@_);
617 my ($section) = @_;
618
619# my $section_ptr = {'subsection_order'=>[],
620# 'next_subsection'=>1,
621# 'subsections'=>{},
622# 'metadata'=>[],
623# 'text'=>""};
624
625 # if this is the top section reset everything
626 if ($section eq "") {
627 $self->{'subsection_order'} = [];
628 $self->{'subsections'} = {};
629 $self->{'metadata'} = [];
630 $self->{'text'} = "";
631 return;
632 }
633
634 # find the parent of the section to delete
635 my $parent_section = "";
636 my $child = "0";
637 my @section = split (/\./, $section);
638 if (scalar(@section) > 0) {
639 $child = pop (@section);
640 $parent_section = join (".", @section);
641 }
642
643 my $parent_section_ptr = $self->_lookup_section($parent_section);
644 if (!defined $parent_section_ptr) {
645 print STDERR "doc::delete_section couldn't find parent section " .
646 "$parent_section\n";
647 return;
648 }
649
650 # remove this section from the subsection_order list
651 my $i = 0;
652 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
653 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
654 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
655 last;
656 }
657 $i++;
658 }
659
660 # remove this section from the subsection hash
661 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
662 undef $parent_section_ptr->{'subsections'}->{$child};
663 }
664}
665
666#--
667# methods for dealing with metadata
668
669# set_metadata_element and get_metadata_element are for metadata
670# which should only have one value. add_meta_data and get_metadata
671# are for metadata which can have more than one value.
672
673# returns the first metadata value which matches field
674
675# This version of get metadata element works much like the one above,
676# except it allows for the namespace portion of a metadata element to
677# be ignored, thus if you are searching for dc.Title, the first piece
678# of matching metadata ending with the name Title (once any namespace
679# is removed) would be returned.
680# 28-11-2003 John Thompson
681sub get_metadata_element {
682 my $self = shift (@_);
683 my ($section, $field, $ignore_namespace) = @_;
684 my ($data);
685
686 $ignore_namespace = 0 unless defined $ignore_namespace;
687
688 my $section_ptr = $self->_lookup_section($section);
689 if (!defined $section_ptr) {
690 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
691 return;
692 }
693
694 # Remove the any namespace if we are being told to ignore them
695 if($ignore_namespace) {
696 $field =~ s/^\w*\.//;
697 }
698
699 foreach $data (@{$section_ptr->{'metadata'}}) {
700
701 my $data_name = $data->[0];
702
703 # Remove the any namespace if we are being told to ignore them
704 if($ignore_namespace) {
705 $data_name =~ s/^\w*\.//;
706 }
707
708 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
709 }
710
711 return undef; # was not found
712}
713
714# returns a list of the form [value1, value2, ...]
715sub get_metadata {
716 my $self = shift (@_);
717 my ($section, $field, $ignore_namespace) = @_;
718 my ($data);
719
720 $ignore_namespace = 0 unless defined $ignore_namespace;
721
722 my $section_ptr = $self->_lookup_section($section);
723 if (!defined $section_ptr) {
724 print STDERR "doc::get_metadata couldn't find section ",
725 $section, "\n";
726 return;
727 }
728
729 # Remove the any namespace if we are being told to ignore them
730 if($ignore_namespace) {
731 $field =~ s/^\w*\.//;
732 }
733
734 my @metadata = ();
735 foreach $data (@{$section_ptr->{'metadata'}}) {
736
737 my $data_name = $data->[0];
738 # Remove the any namespace if we are being told to ignore them
739 if($ignore_namespace) {
740 $data_name =~ s/^\w*\.//;
741 }
742
743 push (@metadata, $data->[1]) if ($data_name eq $field);
744 }
745
746 return \@metadata;
747}
748
749# returns a list of the form [[field,value],[field,value],...]
750sub get_all_metadata {
751 my $self = shift (@_);
752 my ($section) = @_;
753
754 my $section_ptr = $self->_lookup_section($section);
755 if (!defined $section_ptr) {
756 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
757 return;
758 }
759
760 return $section_ptr->{'metadata'};
761}
762
763# $value is optional
764sub delete_metadata {
765 my $self = shift (@_);
766 my ($section, $field, $value) = @_;
767
768 my $section_ptr = $self->_lookup_section($section);
769 if (!defined $section_ptr) {
770 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
771 return;
772 }
773
774 my $i = 0;
775 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
776 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
777 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
778 splice (@{$section_ptr->{'metadata'}}, $i, 1);
779 } else {
780 $i++;
781 }
782 }
783}
784
785sub delete_all_metadata {
786 my $self = shift (@_);
787 my ($section) = @_;
788
789 my $section_ptr = $self->_lookup_section($section);
790 if (!defined $section_ptr) {
791 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
792 return;
793 }
794
795 $section_ptr->{'metadata'} = [];
796}
797
798sub set_metadata_element {
799 my $self = shift (@_);
800 my ($section, $field, $value) = @_;
801
802 $self->set_utf8_metadata_element ($section, $field,
803 &unicode::ascii2utf8(\$value));
804}
805
806# set_utf8_metadata_element assumes the text has already been
807# converted to the UTF-8 encoding.
808sub set_utf8_metadata_element {
809 my $self = shift (@_);
810 my ($section, $field, $value) = @_;
811
812 $self->delete_metadata ($section, $field);
813 $self->add_utf8_metadata ($section, $field, $value);
814}
815
816
817# add_metadata assumes the text is in (extended) ascii form. For
818# text which has already been converted to the UTF-8 format use
819# add_utf8_metadata.
820sub add_metadata {
821 my $self = shift (@_);
822 my ($section, $field, $value) = @_;
823
824 $self->add_utf8_metadata ($section, $field,
825 &unicode::ascii2utf8(\$value));
826}
827
828sub add_utf8_metadata {
829 my $self = shift (@_);
830 my ($section, $field, $value) = @_;
831
832 my $section_ptr = $self->_lookup_section($section);
833 if (!defined $section_ptr) {
834 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
835 return;
836 }
837 if (!defined $value) {
838 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
839 return;
840 }
841 if (!defined $field) {
842 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
843 return;
844 }
845
846 #print STDERR "###$field=$value\n";
847 # double check that the value is utf-8
848 if (!&unicode::check_is_utf8($value)) {
849 print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
850 &unicode::ensure_utf8(\$value);
851 print STDERR " Tried converting to utf8: $value\n";
852 }
853
854 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
855}
856
857
858# methods for dealing with text
859
860# returns the text for a section
861sub get_text {
862 my $self = shift (@_);
863 my ($section) = @_;
864
865 my $section_ptr = $self->_lookup_section($section);
866 if (!defined $section_ptr) {
867 print STDERR "doc::get_text couldn't find section " .
868 "$section\n";
869 return "";
870 }
871
872 return $section_ptr->{'text'};
873}
874
875# returns the (utf-8 encoded) length of the text for a section
876sub get_text_length {
877 my $self = shift (@_);
878 my ($section) = @_;
879
880 my $section_ptr = $self->_lookup_section($section);
881 if (!defined $section_ptr) {
882 print STDERR "doc::get_text_length couldn't find section " .
883 "$section\n";
884 return 0;
885 }
886
887 return length ($section_ptr->{'text'});
888}
889
890sub delete_text {
891 my $self = shift (@_);
892 my ($section) = @_;
893
894 my $section_ptr = $self->_lookup_section($section);
895 if (!defined $section_ptr) {
896 print STDERR "doc::delete_text couldn't find section " .
897 "$section\n";
898 return;
899 }
900
901 $section_ptr->{'text'} = "";
902}
903
904# add_text assumes the text is in (extended) ascii form. For
905# text which has been already converted to the UTF-8 format
906# use add_utf8_text.
907sub add_text {
908 my $self = shift (@_);
909 my ($section, $text) = @_;
910
911 # convert the text to UTF-8 encoded unicode characters
912 # and add the text
913 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
914}
915
916
917# add_utf8_text assumes the text to be added has already
918# been converted to the UTF-8 encoding. For ascii text use
919# add_text
920sub add_utf8_text {
921 my $self = shift (@_);
922 my ($section, $text) = @_;
923
924 my $section_ptr = $self->_lookup_section($section);
925 if (!defined $section_ptr) {
926 print STDERR "doc::add_utf8_text couldn't find section " .
927 "$section\n";
928 return;
929 }
930
931 $section_ptr->{'text'} .= $text;
932}
933
934# get the actual name of the assocfile from SourceFile which is the url referring to it
935sub assocfile_from_sourcefile {
936 my $self = shift (@_);
937
938 # get the SourceFile meta, which is a *URL* to a file on the filesystem
939 my $top_section = $self->get_top_section();
940 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
941
942 # get the actual filename as it exists on the filesystem which this url refers to
943 $source_file = &unicode::url_to_filename($source_file);
944 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
945 return $assocfilename;
946}
947
948
949# methods for dealing with associated files
950
951# a file is associated with a document, NOT a section.
952# if section is defined it is noted in the data structure
953# only so that files associated from a particular section
954# may be removed later (using delete_section_assoc_files)
955sub associate_file {
956 my $self = shift (@_);
957 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
958 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
959
960# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
961# print STDERR "Calling method; $cfilename:$cline $cpackage->$csubr\n";
962
963 # remove all associated files with the same name
964 $self->delete_assoc_file ($assoc_filename);
965
966 push (@{$self->{'associated_files'}},
967 [$real_filename, $assoc_filename, $mime_type, $section]);
968}
969
970# returns a list of associated files in the form
971# [[real_filename, assoc_filename, mimetype], ...]
972sub get_assoc_files {
973 my $self = shift (@_);
974
975 return $self->{'associated_files'};
976}
977
978sub delete_section_assoc_files {
979 my $self = shift (@_);
980 my ($section) = @_;
981
982 my $i=0;
983 while ($i < scalar (@{$self->{'associated_files'}})) {
984 if (defined $self->{'associated_files'}->[$i]->[3] &&
985 $self->{'associated_files'}->[$i]->[3] eq $section) {
986 splice (@{$self->{'associated_files'}}, $i, 1);
987 } else {
988 $i++;
989 }
990 }
991}
992
993sub delete_assoc_file {
994 my $self = shift (@_);
995 my ($assoc_filename) = @_;
996
997 my $i=0;
998 while ($i < scalar (@{$self->{'associated_files'}})) {
999 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1000 splice (@{$self->{'associated_files'}}, $i, 1);
1001 } else {
1002 $i++;
1003 }
1004 }
1005}
1006
1007sub reset_nextsection_ptr {
1008 my $self = shift (@_);
1009 my ($section) = @_;
1010
1011 my $section_ptr = $self->_lookup_section($section);
1012 $section_ptr->{'next_subsection'} = 1;
1013}
1014
10151;
Note: See TracBrowser for help on using the repository browser.