source: gsdl/trunk/perllib/doc.pm@ 17110

Last change on this file since 17110 was 17057, checked in by kjdon, 16 years ago

added some more warnings when tidying up assigned OIDs

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 28.1 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47my $OIDcount = 0;
48
49sub new {
50 my $class = shift (@_);
51 my ($source_filename, $doc_type) = @_;
52
53
54 my $self = bless {'associated_files'=>[],
55 'subsection_order'=>[],
56 'next_subsection'=>1,
57 'subsections'=>{},
58 'metadata'=>[],
59 'text'=>"",
60 'OIDtype'=>"hash"}, $class;
61
62 # used to set lastmodified here, but this can screw up the HASH ids, so
63 # the docsave processor now calls set_lastmodified
64
65 $self->{'source_path'} = $source_filename;
66
67 if (defined $source_filename) {
68 $source_filename = &util::filename_within_collection($source_filename);
69 $self->set_source_filename ($source_filename);
70 }
71
72 $self->set_doc_type ($doc_type) if defined $doc_type;
73
74 return $self;
75}
76# set lastmodified for OAI purposes, added by GRB, moved by kjdon
77sub set_lastmodified {
78 my $self = shift (@_);
79
80 my $source_path = $self->{'source_path'};
81
82 if (defined $source_path && (-e $source_path)) {
83
84 my $file_stat = stat($source_path);
85 my $mtime = $file_stat->mtime;
86 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
87 }
88}
89
90# clone the $self object
91sub duplicate {
92 my $self = shift (@_);
93
94 my $newobj = {};
95
96 foreach my $k (keys %$self) {
97 $newobj->{$k} = &clone ($self->{$k});
98 }
99
100 bless $newobj, ref($self);
101 return $newobj;
102}
103
104sub clone {
105 my ($from) = @_;
106 my $type = ref ($from);
107
108 if ($type eq "HASH") {
109 my $to = {};
110 foreach my $key (keys %$from) {
111 $to->{$key} = &clone ($from->{$key});
112 }
113 return $to;
114 } elsif ($type eq "ARRAY") {
115 my $to = [];
116 foreach my $v (@$from) {
117 push (@$to, &clone ($v));
118 }
119 return $to;
120 } else {
121 return $from;
122 }
123}
124
125sub set_OIDtype {
126 my $self = shift (@_);
127 my ($type, $metadata) = @_;
128
129 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|dirname|assigned)$/) {
130 $self->{'OIDtype'} = $type;
131 } else {
132 $self->{'OIDtype'} = "hash";
133 }
134
135 if ($type =~ /^assigned$/) {
136 if (defined $metadata) {
137 $self->{'OIDmetadata'} = $metadata;
138 } else {
139 $self->{'OIDmetadata'} = "dc.Identifier";
140 }
141 }
142}
143
144sub set_source_filename {
145 my $self = shift (@_);
146 my ($source_filename) = @_;
147
148 # Since the gsdlsourcefilename element goes into the doc.xml it has
149 # to be utf8. However, it should also *represent* the source filename
150 # (in the import directory) which may not be utf8 at all.
151 # For instance, if this meta element (gsdlsourcefilename) will be
152 # used by other applications that parse doc.xml in order to locate
153 # gsdlsourcefilename. Therefore, the solution is to URLencode the real
154 # filename as this is a binary-to-text encoding meaning that the
155 # resulting string is ASCII (utf8).
156
157# print STDERR "******URLencoding the gsdl_source_filename $source_filename ";
158
159 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
160 my ($srcfilename,$dirname,$suffix)
161 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
162# print STDERR "-> $srcfilename -> ";
163 $srcfilename = &unicode::url_encode($srcfilename.$suffix);
164 $source_filename = &util::filename_cat($dirname, $srcfilename);
165# print STDERR "$source_filename\n";
166
167 $self->set_utf8_metadata_element ($self->get_top_section(),
168 "gsdlsourcefilename",
169 $source_filename);
170}
171
172sub set_converted_filename {
173 my $self = shift (@_);
174 my ($converted_filename) = @_;
175
176 # we know the converted filename is utf8
177 $self->set_utf8_metadata_element ($self->get_top_section(),
178 "gsdlconvertedfilename",
179 $converted_filename);
180}
181
182# returns the source_filename as it was provided
183sub get_source_filename {
184 my $self = shift (@_);
185
186 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
187}
188
189# returns converted filename if available else returns source filename
190sub get_filename_for_hashing {
191 my $self = shift (@_);
192
193 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
194
195 if (!defined $filename) {
196 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
197 # if NULPlug processed file, then don't give a filename
198 if (defined $plugin_name && $plugin_name eq "NULPlug") {
199 $filename = undef;
200 } else { # returns the URL encoded source filename!
201 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
202 }
203 }
204 return $filename;
205}
206
207sub set_doc_type {
208 my $self = shift (@_);
209 my ($doc_type) = @_;
210
211 $self->set_metadata_element ($self->get_top_section(),
212 "gsdldoctype",
213 $doc_type);
214}
215
216# returns the gsdldoctype as it was provided
217# the default of "indexed_doc" is used if no document
218# type was provided
219sub get_doc_type {
220 my $self = shift (@_);
221
222 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
223 return $doc_type if (defined $doc_type);
224 return "indexed_doc";
225}
226
227
228# look up the reference to the a particular section
229sub _lookup_section {
230 my $self = shift (@_);
231 my ($section) = @_;
232
233 my ($num);
234 my $sectionref = $self;
235
236 while (defined $section && $section ne "") {
237
238 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
239
240 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
241
242 $section = "" unless defined $section;
243
244
245 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
246 $sectionref = $sectionref->{'subsections'}->{$num};
247 } else {
248 return undef;
249 }
250 }
251
252 return $sectionref;
253}
254
255# calculate OID by hashing the contents of the document
256sub _calc_OID {
257 my $self = shift (@_);
258 my ($filename) = @_;
259
260 my $osexe = &util::get_os_exe();
261
262 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
263 $ENV{'GSDLOS'},"hashfile$osexe");
264
265 my $result = "NULL";
266
267 if (-e "$hashfile_exe") {
268# $result = `\"$hashfile_exe\" \"$filename\"`;
269 $result = `hashfile$osexe \"$filename\"`;
270 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
271 } else {
272 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
273 }
274 return "HASH$result";
275}
276
277# methods dealing with OID, not groups of them.
278
279# if $OID is not provided one is calculated
280sub set_OID {
281 my $self = shift (@_);
282 my ($OID) = @_;
283
284 my $use_hash_oid = 0;
285 # if an OID wasn't provided calculate one
286 if (!defined $OID) {
287 $OID = "NULL";
288 if ($self->{'OIDtype'} =~ /^hash/) {
289 $use_hash_oid = 1;
290 } elsif ($self->{'OIDtype'} eq "incremental") {
291 $OID = "D" . $OIDcount;
292 $OIDcount ++;
293
294 } elsif ($self->{'OIDtype'} eq "dirname") {
295 $OID = 'J';
296 my $filename = $self->get_source_filename();
297 if (defined($filename)) { # && -e $filename) {
298 $OID = &File::Basename::dirname($filename);
299 if (defined $OID) {
300 $OID = 'J'.&File::Basename::basename($OID);
301 $OID =~ s/\.//; #remove any periods
302 } else {
303 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
304 $use_hash_oid = 1;
305 }
306 } else {
307 print STDERR "Failed to find filename, generating hash id\n";
308 $use_hash_oid = 1;
309 }
310
311 } elsif ($self->{'OIDtype'} eq "assigned") {
312 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
313 if (defined $identifier && $identifier ne "") {
314 $OID = $identifier;
315 if ($OID =~ /\./) {
316 print STDERR "Warning, assigned identifier $identifier contains periods (.), removing them\n";
317 $OID =~ s/\.//g; #remove any periods
318 }
319 if ($OID =~ /^\d*$/) {
320 print STDERR "Warning, assigned identifier $identifier contains only digits. Prepending 'D'.\n";
321 $OID = "D" . $OID;
322 }
323 if ($OID =~ /^[\d]*$/) {
324 $OID = "D" . $OID;
325 print STDERR "OID only contains numbers, adding a D\n";
326 }
327
328 } else {
329 # need a hash id
330 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
331 $use_hash_oid = 1;
332 }
333
334 } else {
335 $use_hash_oid = 1;
336 }
337
338 if ($use_hash_oid) {
339 my $hash_on_file = 1;
340 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
341 $hash_on_file = 0;
342 }
343 if ($hash_on_file) {
344 # "hash" OID - feed file to hashfile.exe
345 my $filename = $self->get_filename_for_hashing();
346 # -z: don't want to hash on the file if it is zero size
347 if (defined($filename) && -e $filename && !-z $filename) {
348 $OID = $self->_calc_OID ($filename);
349 } else {
350 $hash_on_file = 0;
351 }
352 }
353 if (!$hash_on_file) {
354 my $filename = &util::get_tmp_filename();
355 if (!open (OUTFILE, ">$filename")) {
356 print STDERR "doc::set_OID could not write to $filename\n";
357 } else {
358 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
359 print OUTFILE $doc_text;
360 close (OUTFILE);
361 }
362 $OID = $self->_calc_OID ($filename);
363 &util::rm ($filename);
364 }
365 }
366 }
367 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
368}
369
370# this uses hashdoc (embedded c thingy) which is faster but still
371# needs a little work to be suffiently stable
372sub ___set_OID {
373 my $self = shift (@_);
374 my ($OID) = @_;
375
376 # if an OID wasn't provided then calculate hash value based on document
377 if (!defined $OID)
378 {
379 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
380 my $hash_len = length($hash_text);
381
382 $OID = &hashdoc::buffer($hash_text,$hash_len);
383 }
384
385 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
386}
387
388# returns the OID for this document
389sub get_OID {
390 my $self = shift (@_);
391 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
392 return $OID if (defined $OID);
393 return "NULL";
394}
395
396sub delete_OID {
397 my $self = shift (@_);
398
399 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
400}
401
402
403# methods for manipulating section names
404
405# returns the name of the top-most section (the top
406# level of the document
407sub get_top_section {
408 my $self = shift (@_);
409
410 return "";
411}
412
413# returns a section
414sub get_parent_section {
415 my $self = shift (@_);
416 my ($section) = @_;
417
418 $section =~ s/(^|\.)\d+$//;
419
420 return $section;
421}
422
423# returns the first child section (or the end child
424# if there isn't any)
425sub get_begin_child {
426 my $self = shift (@_);
427 my ($section) = @_;
428
429 my $section_ptr = $self->_lookup_section($section);
430 return "" unless defined $section_ptr;
431
432 if (defined $section_ptr->{'subsection_order'}->[0]) {
433 return "$section.$section_ptr->{'subsection_order'}->[0]";
434 }
435
436 return $self->get_end_child ($section);
437}
438
439# returns the next child of a parent section
440sub get_next_child {
441 my $self = shift (@_);
442 my ($section) = @_;
443
444 my $parent_section = $self->get_parent_section($section);
445 my $parent_section_ptr = $self->_lookup_section($parent_section);
446 return undef unless defined $parent_section_ptr;
447
448 my ($section_num) = $section =~ /(\d+)$/;
449 return undef unless defined $section_num;
450
451 my $i = 0;
452 my $section_order = $parent_section_ptr->{'subsection_order'};
453 while ($i < scalar(@$section_order)) {
454 last if $section_order->[$i] eq $section_num;
455 $i++;
456 }
457
458 $i++; # the next child
459 if ($i < scalar(@$section_order)) {
460 return $section_order->[$i] if $parent_section eq "";
461 return "$parent_section.$section_order->[$i]";
462 }
463
464 # no more sections in this level
465 return undef;
466}
467
468# returns a reference to a list of children
469sub get_children {
470 my $self = shift (@_);
471 my ($section) = @_;
472
473 my $section_ptr = $self->_lookup_section($section);
474 return [] unless defined $section_ptr;
475
476 my @children = @{$section_ptr->{'subsection_order'}};
477
478 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
479 return \@children;
480}
481
482# returns the child section one past the last one (which
483# is coded as "0")
484sub get_end_child {
485 my $self = shift (@_);
486 my ($section) = @_;
487
488 return $section . ".0" unless $section eq "";
489 return "0";
490}
491
492# returns the next section in book order
493sub get_next_section {
494 my $self = shift (@_);
495 my ($section) = @_;
496
497 return undef unless defined $section;
498
499 my $section_ptr = $self->_lookup_section($section);
500 return undef unless defined $section_ptr;
501
502 # first try to find first child
503 if (defined $section_ptr->{'subsection_order'}->[0]) {
504 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
505 return "$section.$section_ptr->{'subsection_order'}->[0]";
506 }
507
508 do {
509 # try to find sibling
510 my $next_child = $self->get_next_child ($section);
511 return $next_child if (defined $next_child);
512
513 # move up one level
514 $section = $self->get_parent_section ($section);
515 } while $section =~ /\d/;
516
517 return undef;
518}
519
520sub is_leaf_section {
521 my $self = shift (@_);
522 my ($section) = @_;
523
524 my $section_ptr = $self->_lookup_section($section);
525 return 1 unless defined $section_ptr;
526
527 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
528}
529
530# methods for dealing with sections
531
532# returns the name of the inserted section
533sub insert_section {
534 my $self = shift (@_);
535 my ($before_section) = @_;
536
537 # get the child to insert before and its parent section
538 my $parent_section = "";
539 my $before_child = "0";
540 my @before_section = split (/\./, $before_section);
541 if (scalar(@before_section) > 0) {
542 $before_child = pop (@before_section);
543 $parent_section = join (".", @before_section);
544 }
545
546 my $parent_section_ptr = $self->_lookup_section($parent_section);
547 if (!defined $parent_section_ptr) {
548 print STDERR "doc::insert_section couldn't find parent section " .
549 "$parent_section\n";
550 return;
551 }
552
553 # get the next section number
554 my $section_num = $parent_section_ptr->{'next_subsection'}++;
555
556 my $i = 0;
557 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
558 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
559 $i++;
560 }
561
562 # insert the section number into the order list
563 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
564
565 # add this section to the parent section
566 my $section_ptr = {'subsection_order'=>[],
567 'next_subsection'=>1,
568 'subsections'=>{},
569 'metadata'=>[],
570 'text'=>""};
571 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
572
573 # work out the full section number
574 my $section = $parent_section;
575 $section .= "." unless $section eq "";
576 $section .= $section_num;
577
578 return $section;
579}
580
581# creates a pre-named section
582sub create_named_section {
583 my $self = shift (@_);
584 my ($mastersection) = @_;
585
586 my ($num);
587 my $section = $mastersection;
588 my $sectionref = $self;
589
590 while ($section ne "") {
591 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
592 $num =~ s/^0+(\d)/$1/; # remove leading 0s
593 $section = "" unless defined $section;
594
595 if (defined $num) {
596 if (!defined $sectionref->{'subsections'}->{$num}) {
597 push (@{$sectionref->{'subsection_order'}}, $num);
598 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
599 'next_subsection'=>1,
600 'subsections'=>{},
601 'metadata'=>[],
602 'text'=>""};
603 if ($num >= $sectionref->{'next_subsection'}) {
604 $sectionref->{'next_subsection'} = $num + 1;
605 }
606 }
607 $sectionref = $sectionref->{'subsections'}->{$num};
608
609 } else {
610 print STDERR "doc::create_named_section couldn't create section ";
611 print STDERR "$mastersection\n";
612 last;
613 }
614 }
615}
616
617# returns a reference to a list of subsections
618sub list_subsections {
619 my $self = shift (@_);
620 my ($section) = @_;
621
622 my $section_ptr = $self->_lookup_section ($section);
623 if (!defined $section_ptr) {
624 print STDERR "doc::list_subsections couldn't find section $section\n";
625 return [];
626 }
627
628 return [@{$section_ptr->{'subsection_order'}}];
629}
630
631sub delete_section {
632 my $self = shift (@_);
633 my ($section) = @_;
634
635# my $section_ptr = {'subsection_order'=>[],
636# 'next_subsection'=>1,
637# 'subsections'=>{},
638# 'metadata'=>[],
639# 'text'=>""};
640
641 # if this is the top section reset everything
642 if ($section eq "") {
643 $self->{'subsection_order'} = [];
644 $self->{'subsections'} = {};
645 $self->{'metadata'} = [];
646 $self->{'text'} = "";
647 return;
648 }
649
650 # find the parent of the section to delete
651 my $parent_section = "";
652 my $child = "0";
653 my @section = split (/\./, $section);
654 if (scalar(@section) > 0) {
655 $child = pop (@section);
656 $parent_section = join (".", @section);
657 }
658
659 my $parent_section_ptr = $self->_lookup_section($parent_section);
660 if (!defined $parent_section_ptr) {
661 print STDERR "doc::delete_section couldn't find parent section " .
662 "$parent_section\n";
663 return;
664 }
665
666 # remove this section from the subsection_order list
667 my $i = 0;
668 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
669 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
670 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
671 last;
672 }
673 $i++;
674 }
675
676 # remove this section from the subsection hash
677 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
678 undef $parent_section_ptr->{'subsections'}->{$child};
679 }
680}
681
682#--
683# methods for dealing with metadata
684
685# set_metadata_element and get_metadata_element are for metadata
686# which should only have one value. add_meta_data and get_metadata
687# are for metadata which can have more than one value.
688
689# returns the first metadata value which matches field
690
691# This version of get metadata element works much like the one above,
692# except it allows for the namespace portion of a metadata element to
693# be ignored, thus if you are searching for dc.Title, the first piece
694# of matching metadata ending with the name Title (once any namespace
695# is removed) would be returned.
696# 28-11-2003 John Thompson
697sub get_metadata_element {
698 my $self = shift (@_);
699 my ($section, $field, $ignore_namespace) = @_;
700 my ($data);
701
702 $ignore_namespace = 0 unless defined $ignore_namespace;
703
704 my $section_ptr = $self->_lookup_section($section);
705 if (!defined $section_ptr) {
706 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
707 return;
708 }
709
710 # Remove the any namespace if we are being told to ignore them
711 if($ignore_namespace) {
712 $field =~ s/^\w*\.//;
713 }
714
715 foreach $data (@{$section_ptr->{'metadata'}}) {
716
717 my $data_name = $data->[0];
718
719 # Remove the any namespace if we are being told to ignore them
720 if($ignore_namespace) {
721 $data_name =~ s/^\w*\.//;
722 }
723
724 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
725 }
726
727 return undef; # was not found
728}
729
730# returns a list of the form [value1, value2, ...]
731sub get_metadata {
732 my $self = shift (@_);
733 my ($section, $field, $ignore_namespace) = @_;
734 my ($data);
735
736 $ignore_namespace = 0 unless defined $ignore_namespace;
737
738 my $section_ptr = $self->_lookup_section($section);
739 if (!defined $section_ptr) {
740 print STDERR "doc::get_metadata couldn't find section ",
741 $section, "\n";
742 return;
743 }
744
745 # Remove the any namespace if we are being told to ignore them
746 if($ignore_namespace) {
747 $field =~ s/^\w*\.//;
748 }
749
750 my @metadata = ();
751 foreach $data (@{$section_ptr->{'metadata'}}) {
752
753 my $data_name = $data->[0];
754 # Remove the any namespace if we are being told to ignore them
755 if($ignore_namespace) {
756 $data_name =~ s/^\w*\.//;
757 }
758
759 push (@metadata, $data->[1]) if ($data_name eq $field);
760 }
761
762 return \@metadata;
763}
764
765# returns a list of the form [[field,value],[field,value],...]
766sub get_all_metadata {
767 my $self = shift (@_);
768 my ($section) = @_;
769
770 my $section_ptr = $self->_lookup_section($section);
771 if (!defined $section_ptr) {
772 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
773 return;
774 }
775
776 return $section_ptr->{'metadata'};
777}
778
779# $value is optional
780sub delete_metadata {
781 my $self = shift (@_);
782 my ($section, $field, $value) = @_;
783
784 my $section_ptr = $self->_lookup_section($section);
785 if (!defined $section_ptr) {
786 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
787 return;
788 }
789
790 my $i = 0;
791 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
792 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
793 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
794 splice (@{$section_ptr->{'metadata'}}, $i, 1);
795 } else {
796 $i++;
797 }
798 }
799}
800
801sub delete_all_metadata {
802 my $self = shift (@_);
803 my ($section) = @_;
804
805 my $section_ptr = $self->_lookup_section($section);
806 if (!defined $section_ptr) {
807 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
808 return;
809 }
810
811 $section_ptr->{'metadata'} = [];
812}
813
814sub set_metadata_element {
815 my $self = shift (@_);
816 my ($section, $field, $value) = @_;
817
818 $self->set_utf8_metadata_element ($section, $field,
819 &unicode::ascii2utf8(\$value));
820}
821
822# set_utf8_metadata_element assumes the text has already been
823# converted to the UTF-8 encoding.
824sub set_utf8_metadata_element {
825 my $self = shift (@_);
826 my ($section, $field, $value) = @_;
827
828 $self->delete_metadata ($section, $field);
829 $self->add_utf8_metadata ($section, $field, $value);
830}
831
832
833# add_metadata assumes the text is in (extended) ascii form. For
834# text which has already been converted to the UTF-8 format use
835# add_utf8_metadata.
836sub add_metadata {
837 my $self = shift (@_);
838 my ($section, $field, $value) = @_;
839
840 $self->add_utf8_metadata ($section, $field,
841 &unicode::ascii2utf8(\$value));
842}
843
844sub add_utf8_metadata {
845 my $self = shift (@_);
846 my ($section, $field, $value) = @_;
847
848 my $section_ptr = $self->_lookup_section($section);
849 if (!defined $section_ptr) {
850 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
851 return;
852 }
853 if (!defined $value) {
854 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
855 return;
856 }
857 if (!defined $field) {
858 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
859 return;
860 }
861
862 #print STDERR "###$field=$value\n";
863 # double check that the value is utf-8
864 if (!&unicode::check_is_utf8($value)) {
865 print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
866 &unicode::ensure_utf8(\$value);
867 print STDERR " Tried converting to utf8: $value\n";
868 }
869
870 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
871}
872
873
874# methods for dealing with text
875
876# returns the text for a section
877sub get_text {
878 my $self = shift (@_);
879 my ($section) = @_;
880
881 my $section_ptr = $self->_lookup_section($section);
882 if (!defined $section_ptr) {
883 print STDERR "doc::get_text couldn't find section " .
884 "$section\n";
885 return "";
886 }
887
888 return $section_ptr->{'text'};
889}
890
891# returns the (utf-8 encoded) length of the text for a section
892sub get_text_length {
893 my $self = shift (@_);
894 my ($section) = @_;
895
896 my $section_ptr = $self->_lookup_section($section);
897 if (!defined $section_ptr) {
898 print STDERR "doc::get_text_length couldn't find section " .
899 "$section\n";
900 return 0;
901 }
902
903 return length ($section_ptr->{'text'});
904}
905
906sub delete_text {
907 my $self = shift (@_);
908 my ($section) = @_;
909
910 my $section_ptr = $self->_lookup_section($section);
911 if (!defined $section_ptr) {
912 print STDERR "doc::delete_text couldn't find section " .
913 "$section\n";
914 return;
915 }
916
917 $section_ptr->{'text'} = "";
918}
919
920# add_text assumes the text is in (extended) ascii form. For
921# text which has been already converted to the UTF-8 format
922# use add_utf8_text.
923sub add_text {
924 my $self = shift (@_);
925 my ($section, $text) = @_;
926
927 # convert the text to UTF-8 encoded unicode characters
928 # and add the text
929 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
930}
931
932
933# add_utf8_text assumes the text to be added has already
934# been converted to the UTF-8 encoding. For ascii text use
935# add_text
936sub add_utf8_text {
937 my $self = shift (@_);
938 my ($section, $text) = @_;
939
940 my $section_ptr = $self->_lookup_section($section);
941 if (!defined $section_ptr) {
942 print STDERR "doc::add_utf8_text couldn't find section " .
943 "$section\n";
944 return;
945 }
946
947 $section_ptr->{'text'} .= $text;
948}
949
950# returns the Source meta, which is the utf8 filename generated.
951# Added a separate method here for convenience
952sub get_source {
953 my $self = shift (@_);
954 return $self->get_metadata_element ($self->get_top_section(), "Source");
955}
956
957# returns the SourceFile meta, which is the url reference to the URL-encoded
958# version of Source (the utf8 filename). Added a separate method here for convenience
959sub get_sourcefile {
960 my $self = shift (@_);
961 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
962}
963
964# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
965# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
966sub get_assocfile_from_sourcefile {
967 my $self = shift (@_);
968
969 # get the SourceFile meta, which is a *URL* to a file on the filesystem
970 my $top_section = $self->get_top_section();
971 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
972
973 # get the actual filename as it exists on the filesystem which this url refers to
974 $source_file = &unicode::url_to_filename($source_file);
975 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
976 return $assocfilename;
977}
978
979# methods for dealing with associated files
980
981# a file is associated with a document, NOT a section.
982# if section is defined it is noted in the data structure
983# only so that files associated from a particular section
984# may be removed later (using delete_section_assoc_files)
985sub associate_file {
986 my $self = shift (@_);
987 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
988 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
989
990 # remove all associated files with the same name
991 $self->delete_assoc_file ($assoc_filename);
992
993 push (@{$self->{'associated_files'}},
994 [$real_filename, $assoc_filename, $mime_type, $section]);
995}
996
997# returns a list of associated files in the form
998# [[real_filename, assoc_filename, mimetype], ...]
999sub get_assoc_files {
1000 my $self = shift (@_);
1001
1002 return $self->{'associated_files'};
1003}
1004
1005sub delete_section_assoc_files {
1006 my $self = shift (@_);
1007 my ($section) = @_;
1008
1009 my $i=0;
1010 while ($i < scalar (@{$self->{'associated_files'}})) {
1011 if (defined $self->{'associated_files'}->[$i]->[3] &&
1012 $self->{'associated_files'}->[$i]->[3] eq $section) {
1013 splice (@{$self->{'associated_files'}}, $i, 1);
1014 } else {
1015 $i++;
1016 }
1017 }
1018}
1019
1020sub delete_assoc_file {
1021 my $self = shift (@_);
1022 my ($assoc_filename) = @_;
1023
1024 my $i=0;
1025 while ($i < scalar (@{$self->{'associated_files'}})) {
1026 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1027 splice (@{$self->{'associated_files'}}, $i, 1);
1028 } else {
1029 $i++;
1030 }
1031 }
1032}
1033
1034sub reset_nextsection_ptr {
1035 my $self = shift (@_);
1036 my ($section) = @_;
1037
1038 my $section_ptr = $self->_lookup_section($section);
1039 $section_ptr->{'next_subsection'} = 1;
1040}
1041
10421;
Note: See TracBrowser for help on using the repository browser.