source: gsdl/trunk/perllib/doc.pm@ 20008

Last change on this file since 20008 was 19829, checked in by davidb, 15 years ago

doc.pm API extended to include call for finding out the original source filename (rather than the one where the rename_method has been applied). Useful for incremental building, and probably other things too

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 28.3 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47our $OIDcount = 0;
48
49sub new {
50 my $class = shift (@_);
51 my ($source_filename, $doc_type, $rename_method) = @_;
52
53
54 my $self = bless {'associated_files'=>[],
55 'subsection_order'=>[],
56 'next_subsection'=>1,
57 'subsections'=>{},
58 'metadata'=>[],
59 'text'=>"",
60 'OIDtype'=>"hash"}, $class;
61
62 # used to set lastmodified here, but this can screw up the HASH ids, so
63 # the docsave processor now calls set_lastmodified
64
65 $self->{'source_path'} = $source_filename;
66
67 if (defined $source_filename) {
68 $source_filename = &util::filename_within_collection($source_filename);
69 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
70 $self->set_source_filename ($source_filename, $rename_method);
71 }
72
73 $self->set_doc_type ($doc_type) if defined $doc_type;
74
75 return $self;
76}
77# set lastmodified for OAI purposes, added by GRB, moved by kjdon
78sub set_lastmodified {
79 my $self = shift (@_);
80
81 my $source_path = $self->{'source_path'};
82
83 if (defined $source_path && (-e $source_path)) {
84
85 my $file_stat = stat($source_path);
86 my $mtime = $file_stat->mtime;
87 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
88 }
89}
90
91# clone the $self object
92sub duplicate {
93 my $self = shift (@_);
94
95 my $newobj = {};
96
97 foreach my $k (keys %$self) {
98 $newobj->{$k} = &clone ($self->{$k});
99 }
100
101 bless $newobj, ref($self);
102 return $newobj;
103}
104
105sub clone {
106 my ($from) = @_;
107 my $type = ref ($from);
108
109 if ($type eq "HASH") {
110 my $to = {};
111 foreach my $key (keys %$from) {
112 $to->{$key} = &clone ($from->{$key});
113 }
114 return $to;
115 } elsif ($type eq "ARRAY") {
116 my $to = [];
117 foreach my $v (@$from) {
118 push (@$to, &clone ($v));
119 }
120 return $to;
121 } else {
122 return $from;
123 }
124}
125
126sub set_OIDtype {
127 my $self = shift (@_);
128 my ($type, $metadata) = @_;
129
130 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|dirname|assigned)$/) {
131 $self->{'OIDtype'} = $type;
132 } else {
133 $self->{'OIDtype'} = "hash";
134 }
135
136 if ($type =~ /^assigned$/) {
137 if (defined $metadata) {
138 $self->{'OIDmetadata'} = $metadata;
139 } else {
140 $self->{'OIDmetadata'} = "dc.Identifier";
141 }
142 }
143}
144
145sub set_source_filename {
146 my $self = shift (@_);
147 my ($source_filename, $rename_method) = @_;
148
149 # Since the gsdlsourcefilename element goes into the doc.xml it has
150 # to be utf8. However, it should also *represent* the source filename
151 # (in the import directory) which may not be utf8 at all.
152 # For instance, if this meta element (gsdlsourcefilename) will be used
153 # by other applications that parse doc.xml in order to locate
154 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
155 # encode the real filename as this is a binary-to-text encoding meaning
156 # that the resulting string is ASCII (utf8). Decoding will give the original.
157
158# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
159
160 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
161 my ($srcfilename,$dirname,$suffix)
162 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
163# print STDERR "-> $srcfilename -> ";
164 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
165 $source_filename = &util::filename_cat($dirname, $srcfilename);
166# print STDERR "$source_filename\n";
167
168 $self->set_utf8_metadata_element ($self->get_top_section(),
169 "gsdlsourcefilename",
170 $source_filename);
171}
172
173sub set_converted_filename {
174 my $self = shift (@_);
175 my ($converted_filename) = @_;
176
177 # we know the converted filename is utf8
178 $self->set_utf8_metadata_element ($self->get_top_section(),
179 "gsdlconvertedfilename",
180 $converted_filename);
181}
182
183# returns the source_filename as it was provided
184sub get_unmodified_source_filename {
185 my $self = shift (@_);
186
187 return $self->{'source_path'};
188}
189
190# returns the source_filename with whatever rename_method was given
191sub get_source_filename {
192 my $self = shift (@_);
193
194 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
195}
196
197
198
199# returns converted filename if available else returns source filename
200sub get_filename_for_hashing {
201 my $self = shift (@_);
202
203 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
204
205 if (!defined $filename) {
206 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
207 # if NULPlug processed file, then don't give a filename
208 if (defined $plugin_name && $plugin_name eq "NULPlug") {
209 $filename = undef;
210 } else { # returns the URL encoded source filename!
211 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
212 }
213 }
214 return $filename;
215}
216
217sub set_doc_type {
218 my $self = shift (@_);
219 my ($doc_type) = @_;
220
221 $self->set_metadata_element ($self->get_top_section(),
222 "gsdldoctype",
223 $doc_type);
224}
225
226# returns the gsdldoctype as it was provided
227# the default of "indexed_doc" is used if no document
228# type was provided
229sub get_doc_type {
230 my $self = shift (@_);
231
232 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
233 return $doc_type if (defined $doc_type);
234 return "indexed_doc";
235}
236
237
238# look up the reference to the a particular section
239sub _lookup_section {
240 my $self = shift (@_);
241 my ($section) = @_;
242
243 my ($num);
244 my $sectionref = $self;
245
246 while (defined $section && $section ne "") {
247
248 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
249
250 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
251
252 $section = "" unless defined $section;
253
254
255 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
256 $sectionref = $sectionref->{'subsections'}->{$num};
257 } else {
258 return undef;
259 }
260 }
261
262 return $sectionref;
263}
264
265# calculate OID by hashing the contents of the document
266sub _calc_OID {
267 my $self = shift (@_);
268 my ($filename) = @_;
269
270 my $osexe = &util::get_os_exe();
271
272 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
273 $ENV{'GSDLOS'},"hashfile$osexe");
274
275 my $result = "NULL";
276
277 if (-e "$hashfile_exe") {
278# $result = `\"$hashfile_exe\" \"$filename\"`;
279 $result = `hashfile$osexe \"$filename\"`;
280 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
281 } else {
282 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
283 }
284 return "HASH$result";
285}
286
287# methods dealing with OID, not groups of them.
288
289# if $OID is not provided one is calculated
290sub set_OID {
291 my $self = shift (@_);
292 my ($OID) = @_;
293
294 my $use_hash_oid = 0;
295 # if an OID wasn't provided calculate one
296 if (!defined $OID) {
297 $OID = "NULL";
298 if ($self->{'OIDtype'} =~ /^hash/) {
299 $use_hash_oid = 1;
300 } elsif ($self->{'OIDtype'} eq "incremental") {
301 $OID = "D" . $OIDcount;
302 $OIDcount ++;
303
304 } elsif ($self->{'OIDtype'} eq "dirname") {
305 $OID = 'J';
306 my $filename = $self->get_source_filename();
307 if (defined($filename)) { # && -e $filename) {
308 $OID = &File::Basename::dirname($filename);
309 if (defined $OID) {
310 $OID = 'J'.&File::Basename::basename($OID);
311 $OID = &util::tidy_up_oid($OID);
312 } else {
313 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
314 $use_hash_oid = 1;
315 }
316 } else {
317 print STDERR "Failed to find filename, generating hash id\n";
318 $use_hash_oid = 1;
319 }
320
321 } elsif ($self->{'OIDtype'} eq "assigned") {
322 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
323 if (defined $identifier && $identifier ne "") {
324 $OID = $identifier;
325 $OID = &util::tidy_up_oid($OID);
326 } else {
327 # need a hash id
328 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
329 $use_hash_oid = 1;
330 }
331
332 } else {
333 $use_hash_oid = 1;
334 }
335
336 if ($use_hash_oid) {
337 my $hash_on_file = 1;
338 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
339 $hash_on_file = 0;
340 }
341 if ($hash_on_file) {
342 # "hash" OID - feed file to hashfile.exe
343 my $filename = $self->get_filename_for_hashing();
344 # -z: don't want to hash on the file if it is zero size
345 if (defined($filename) && -e $filename && !-z $filename) {
346 $OID = $self->_calc_OID ($filename);
347 } else {
348 $hash_on_file = 0;
349 }
350 }
351 if (!$hash_on_file) {
352 my $filename = &util::get_tmp_filename();
353 if (!open (OUTFILE, ">$filename")) {
354 print STDERR "doc::set_OID could not write to $filename\n";
355 } else {
356 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
357 print OUTFILE $doc_text;
358 close (OUTFILE);
359 }
360 $OID = $self->_calc_OID ($filename);
361 &util::rm ($filename);
362 }
363 }
364 }
365 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
366}
367
368# this uses hashdoc (embedded c thingy) which is faster but still
369# needs a little work to be suffiently stable
370sub ___set_OID {
371 my $self = shift (@_);
372 my ($OID) = @_;
373
374 # if an OID wasn't provided then calculate hash value based on document
375 if (!defined $OID)
376 {
377 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
378 my $hash_len = length($hash_text);
379
380 $OID = &hashdoc::buffer($hash_text,$hash_len);
381 }
382
383 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
384}
385
386# returns the OID for this document
387sub get_OID {
388 my $self = shift (@_);
389 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
390 return $OID if (defined $OID);
391 return "NULL";
392}
393
394sub delete_OID {
395 my $self = shift (@_);
396
397 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
398}
399
400
401# methods for manipulating section names
402
403# returns the name of the top-most section (the top
404# level of the document
405sub get_top_section {
406 my $self = shift (@_);
407
408 return "";
409}
410
411# returns a section
412sub get_parent_section {
413 my $self = shift (@_);
414 my ($section) = @_;
415
416 $section =~ s/(^|\.)\d+$//;
417
418 return $section;
419}
420
421# returns the first child section (or the end child
422# if there isn't any)
423sub get_begin_child {
424 my $self = shift (@_);
425 my ($section) = @_;
426
427 my $section_ptr = $self->_lookup_section($section);
428 return "" unless defined $section_ptr;
429
430 if (defined $section_ptr->{'subsection_order'}->[0]) {
431 return "$section.$section_ptr->{'subsection_order'}->[0]";
432 }
433
434 return $self->get_end_child ($section);
435}
436
437# returns the next child of a parent section
438sub get_next_child {
439 my $self = shift (@_);
440 my ($section) = @_;
441
442 my $parent_section = $self->get_parent_section($section);
443 my $parent_section_ptr = $self->_lookup_section($parent_section);
444 return undef unless defined $parent_section_ptr;
445
446 my ($section_num) = $section =~ /(\d+)$/;
447 return undef unless defined $section_num;
448
449 my $i = 0;
450 my $section_order = $parent_section_ptr->{'subsection_order'};
451 while ($i < scalar(@$section_order)) {
452 last if $section_order->[$i] eq $section_num;
453 $i++;
454 }
455
456 $i++; # the next child
457 if ($i < scalar(@$section_order)) {
458 return $section_order->[$i] if $parent_section eq "";
459 return "$parent_section.$section_order->[$i]";
460 }
461
462 # no more sections in this level
463 return undef;
464}
465
466# returns a reference to a list of children
467sub get_children {
468 my $self = shift (@_);
469 my ($section) = @_;
470
471 my $section_ptr = $self->_lookup_section($section);
472 return [] unless defined $section_ptr;
473
474 my @children = @{$section_ptr->{'subsection_order'}};
475
476 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
477 return \@children;
478}
479
480# returns the child section one past the last one (which
481# is coded as "0")
482sub get_end_child {
483 my $self = shift (@_);
484 my ($section) = @_;
485
486 return $section . ".0" unless $section eq "";
487 return "0";
488}
489
490# returns the next section in book order
491sub get_next_section {
492 my $self = shift (@_);
493 my ($section) = @_;
494
495 return undef unless defined $section;
496
497 my $section_ptr = $self->_lookup_section($section);
498 return undef unless defined $section_ptr;
499
500 # first try to find first child
501 if (defined $section_ptr->{'subsection_order'}->[0]) {
502 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
503 return "$section.$section_ptr->{'subsection_order'}->[0]";
504 }
505
506 do {
507 # try to find sibling
508 my $next_child = $self->get_next_child ($section);
509 return $next_child if (defined $next_child);
510
511 # move up one level
512 $section = $self->get_parent_section ($section);
513 } while $section =~ /\d/;
514
515 return undef;
516}
517
518sub is_leaf_section {
519 my $self = shift (@_);
520 my ($section) = @_;
521
522 my $section_ptr = $self->_lookup_section($section);
523 return 1 unless defined $section_ptr;
524
525 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
526}
527
528# methods for dealing with sections
529
530# returns the name of the inserted section
531sub insert_section {
532 my $self = shift (@_);
533 my ($before_section) = @_;
534
535 # get the child to insert before and its parent section
536 my $parent_section = "";
537 my $before_child = "0";
538 my @before_section = split (/\./, $before_section);
539 if (scalar(@before_section) > 0) {
540 $before_child = pop (@before_section);
541 $parent_section = join (".", @before_section);
542 }
543
544 my $parent_section_ptr = $self->_lookup_section($parent_section);
545 if (!defined $parent_section_ptr) {
546 print STDERR "doc::insert_section couldn't find parent section " .
547 "$parent_section\n";
548 return;
549 }
550
551 # get the next section number
552 my $section_num = $parent_section_ptr->{'next_subsection'}++;
553
554 my $i = 0;
555 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
556 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
557 $i++;
558 }
559
560 # insert the section number into the order list
561 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
562
563 # add this section to the parent section
564 my $section_ptr = {'subsection_order'=>[],
565 'next_subsection'=>1,
566 'subsections'=>{},
567 'metadata'=>[],
568 'text'=>""};
569 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
570
571 # work out the full section number
572 my $section = $parent_section;
573 $section .= "." unless $section eq "";
574 $section .= $section_num;
575
576 return $section;
577}
578
579# creates a pre-named section
580sub create_named_section {
581 my $self = shift (@_);
582 my ($mastersection) = @_;
583
584 my ($num);
585 my $section = $mastersection;
586 my $sectionref = $self;
587
588 while ($section ne "") {
589 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
590 $num =~ s/^0+(\d)/$1/; # remove leading 0s
591 $section = "" unless defined $section;
592
593 if (defined $num) {
594 if (!defined $sectionref->{'subsections'}->{$num}) {
595 push (@{$sectionref->{'subsection_order'}}, $num);
596 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
597 'next_subsection'=>1,
598 'subsections'=>{},
599 'metadata'=>[],
600 'text'=>""};
601 if ($num >= $sectionref->{'next_subsection'}) {
602 $sectionref->{'next_subsection'} = $num + 1;
603 }
604 }
605 $sectionref = $sectionref->{'subsections'}->{$num};
606
607 } else {
608 print STDERR "doc::create_named_section couldn't create section ";
609 print STDERR "$mastersection\n";
610 last;
611 }
612 }
613}
614
615# returns a reference to a list of subsections
616sub list_subsections {
617 my $self = shift (@_);
618 my ($section) = @_;
619
620 my $section_ptr = $self->_lookup_section ($section);
621 if (!defined $section_ptr) {
622 print STDERR "doc::list_subsections couldn't find section $section\n";
623 return [];
624 }
625
626 return [@{$section_ptr->{'subsection_order'}}];
627}
628
629sub delete_section {
630 my $self = shift (@_);
631 my ($section) = @_;
632
633# my $section_ptr = {'subsection_order'=>[],
634# 'next_subsection'=>1,
635# 'subsections'=>{},
636# 'metadata'=>[],
637# 'text'=>""};
638
639 # if this is the top section reset everything
640 if ($section eq "") {
641 $self->{'subsection_order'} = [];
642 $self->{'subsections'} = {};
643 $self->{'metadata'} = [];
644 $self->{'text'} = "";
645 return;
646 }
647
648 # find the parent of the section to delete
649 my $parent_section = "";
650 my $child = "0";
651 my @section = split (/\./, $section);
652 if (scalar(@section) > 0) {
653 $child = pop (@section);
654 $parent_section = join (".", @section);
655 }
656
657 my $parent_section_ptr = $self->_lookup_section($parent_section);
658 if (!defined $parent_section_ptr) {
659 print STDERR "doc::delete_section couldn't find parent section " .
660 "$parent_section\n";
661 return;
662 }
663
664 # remove this section from the subsection_order list
665 my $i = 0;
666 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
667 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
668 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
669 last;
670 }
671 $i++;
672 }
673
674 # remove this section from the subsection hash
675 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
676 undef $parent_section_ptr->{'subsections'}->{$child};
677 }
678}
679
680#--
681# methods for dealing with metadata
682
683# set_metadata_element and get_metadata_element are for metadata
684# which should only have one value. add_meta_data and get_metadata
685# are for metadata which can have more than one value.
686
687# returns the first metadata value which matches field
688
689# This version of get metadata element works much like the one above,
690# except it allows for the namespace portion of a metadata element to
691# be ignored, thus if you are searching for dc.Title, the first piece
692# of matching metadata ending with the name Title (once any namespace
693# is removed) would be returned.
694# 28-11-2003 John Thompson
695sub get_metadata_element {
696 my $self = shift (@_);
697 my ($section, $field, $ignore_namespace) = @_;
698 my ($data);
699
700 $ignore_namespace = 0 unless defined $ignore_namespace;
701
702 my $section_ptr = $self->_lookup_section($section);
703 if (!defined $section_ptr) {
704 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
705 return;
706 }
707
708 # Remove the any namespace if we are being told to ignore them
709 if($ignore_namespace) {
710 $field =~ s/^\w*\.//;
711 }
712
713 foreach $data (@{$section_ptr->{'metadata'}}) {
714
715 my $data_name = $data->[0];
716
717 # Remove the any namespace if we are being told to ignore them
718 if($ignore_namespace) {
719 $data_name =~ s/^\w*\.//;
720 }
721
722 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
723 }
724
725 return undef; # was not found
726}
727
728# returns a list of the form [value1, value2, ...]
729sub get_metadata {
730 my $self = shift (@_);
731 my ($section, $field, $ignore_namespace) = @_;
732 my ($data);
733
734 $ignore_namespace = 0 unless defined $ignore_namespace;
735
736 my $section_ptr = $self->_lookup_section($section);
737 if (!defined $section_ptr) {
738 print STDERR "doc::get_metadata couldn't find section ",
739 $section, "\n";
740 return;
741 }
742
743 # Remove the any namespace if we are being told to ignore them
744 if($ignore_namespace) {
745 $field =~ s/^\w*\.//;
746 }
747
748 my @metadata = ();
749 foreach $data (@{$section_ptr->{'metadata'}}) {
750
751 my $data_name = $data->[0];
752 # Remove the any namespace if we are being told to ignore them
753 if($ignore_namespace) {
754 $data_name =~ s/^\w*\.//;
755 }
756
757 push (@metadata, $data->[1]) if ($data_name eq $field);
758 }
759
760 return \@metadata;
761}
762
763# returns a list of the form [[field,value],[field,value],...]
764sub get_all_metadata {
765 my $self = shift (@_);
766 my ($section) = @_;
767
768 my $section_ptr = $self->_lookup_section($section);
769 if (!defined $section_ptr) {
770 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
771 return;
772 }
773
774 return $section_ptr->{'metadata'};
775}
776
777# $value is optional
778sub delete_metadata {
779 my $self = shift (@_);
780 my ($section, $field, $value) = @_;
781
782 my $section_ptr = $self->_lookup_section($section);
783 if (!defined $section_ptr) {
784 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
785 return;
786 }
787
788 my $i = 0;
789 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
790 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
791 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
792 splice (@{$section_ptr->{'metadata'}}, $i, 1);
793 } else {
794 $i++;
795 }
796 }
797}
798
799sub delete_all_metadata {
800 my $self = shift (@_);
801 my ($section) = @_;
802
803 my $section_ptr = $self->_lookup_section($section);
804 if (!defined $section_ptr) {
805 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
806 return;
807 }
808
809 $section_ptr->{'metadata'} = [];
810}
811
812sub set_metadata_element {
813 my $self = shift (@_);
814 my ($section, $field, $value) = @_;
815
816 $self->set_utf8_metadata_element ($section, $field,
817 &unicode::ascii2utf8(\$value));
818}
819
820# set_utf8_metadata_element assumes the text has already been
821# converted to the UTF-8 encoding.
822sub set_utf8_metadata_element {
823 my $self = shift (@_);
824 my ($section, $field, $value) = @_;
825
826 $self->delete_metadata ($section, $field);
827 $self->add_utf8_metadata ($section, $field, $value);
828}
829
830
831# add_metadata assumes the text is in (extended) ascii form. For
832# text which has already been converted to the UTF-8 format use
833# add_utf8_metadata.
834sub add_metadata {
835 my $self = shift (@_);
836 my ($section, $field, $value) = @_;
837
838 $self->add_utf8_metadata ($section, $field,
839 &unicode::ascii2utf8(\$value));
840}
841
842sub add_utf8_metadata {
843 my $self = shift (@_);
844 my ($section, $field, $value) = @_;
845
846 my $section_ptr = $self->_lookup_section($section);
847 if (!defined $section_ptr) {
848 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
849 return;
850 }
851 if (!defined $value) {
852 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
853 return;
854 }
855 if (!defined $field) {
856 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
857 return;
858 }
859
860 #print STDERR "###$field=$value\n";
861 # double check that the value is utf-8
862 if (!&unicode::check_is_utf8($value)) {
863 print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
864 &unicode::ensure_utf8(\$value);
865 print STDERR " Tried converting to utf8: $value\n";
866 }
867
868 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
869}
870
871
872# methods for dealing with text
873
874# returns the text for a section
875sub get_text {
876 my $self = shift (@_);
877 my ($section) = @_;
878
879 my $section_ptr = $self->_lookup_section($section);
880 if (!defined $section_ptr) {
881 print STDERR "doc::get_text couldn't find section " .
882 "$section\n";
883 return "";
884 }
885
886 return $section_ptr->{'text'};
887}
888
889# returns the (utf-8 encoded) length of the text for a section
890sub get_text_length {
891 my $self = shift (@_);
892 my ($section) = @_;
893
894 my $section_ptr = $self->_lookup_section($section);
895 if (!defined $section_ptr) {
896 print STDERR "doc::get_text_length couldn't find section " .
897 "$section\n";
898 return 0;
899 }
900
901 return length ($section_ptr->{'text'});
902}
903
904sub delete_text {
905 my $self = shift (@_);
906 my ($section) = @_;
907
908 my $section_ptr = $self->_lookup_section($section);
909 if (!defined $section_ptr) {
910 print STDERR "doc::delete_text couldn't find section " .
911 "$section\n";
912 return;
913 }
914
915 $section_ptr->{'text'} = "";
916}
917
918# add_text assumes the text is in (extended) ascii form. For
919# text which has been already converted to the UTF-8 format
920# use add_utf8_text.
921sub add_text {
922 my $self = shift (@_);
923 my ($section, $text) = @_;
924
925 # convert the text to UTF-8 encoded unicode characters
926 # and add the text
927 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
928}
929
930
931# add_utf8_text assumes the text to be added has already
932# been converted to the UTF-8 encoding. For ascii text use
933# add_text
934sub add_utf8_text {
935 my $self = shift (@_);
936 my ($section, $text) = @_;
937
938 my $section_ptr = $self->_lookup_section($section);
939 if (!defined $section_ptr) {
940 print STDERR "doc::add_utf8_text couldn't find section " .
941 "$section\n";
942 return;
943 }
944
945 $section_ptr->{'text'} .= $text;
946}
947
948# returns the Source meta, which is the utf8 filename generated.
949# Added a separate method here for convenience
950sub get_source {
951 my $self = shift (@_);
952 return $self->get_metadata_element ($self->get_top_section(), "Source");
953}
954
955# returns the SourceFile meta, which is the url reference to the URL-encoded
956# version of Source (the utf8 filename). Added a separate method here for convenience
957sub get_sourcefile {
958 my $self = shift (@_);
959 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
960}
961
962# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
963# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
964sub get_assocfile_from_sourcefile {
965 my $self = shift (@_);
966
967 # get the SourceFile meta, which is a *URL* to a file on the filesystem
968 my $top_section = $self->get_top_section();
969 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
970
971 # get the actual filename as it exists on the filesystem which this url refers to
972 $source_file = &unicode::url_to_filename($source_file);
973 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
974 return $assocfilename;
975}
976
977# methods for dealing with associated files
978
979# a file is associated with a document, NOT a section.
980# if section is defined it is noted in the data structure
981# only so that files associated from a particular section
982# may be removed later (using delete_section_assoc_files)
983sub associate_file {
984 my $self = shift (@_);
985 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
986 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
987
988 # remove all associated files with the same name
989 $self->delete_assoc_file ($assoc_filename);
990
991 push (@{$self->{'associated_files'}},
992 [$real_filename, $assoc_filename, $mime_type, $section]);
993}
994
995# returns a list of associated files in the form
996# [[real_filename, assoc_filename, mimetype], ...]
997sub get_assoc_files {
998 my $self = shift (@_);
999
1000 return $self->{'associated_files'};
1001}
1002
1003
1004sub metadata_file {
1005 my $self = shift (@_);
1006 my ($real_filename, $filename) = @_;
1007
1008 push (@{$self->{'metadata_files'}},
1009 [$real_filename, $filename]);
1010}
1011
1012sub get_meta_files {
1013 my $self = shift (@_);
1014
1015 return $self->{'metadata_files'};
1016}
1017
1018sub delete_section_assoc_files {
1019 my $self = shift (@_);
1020 my ($section) = @_;
1021
1022 my $i=0;
1023 while ($i < scalar (@{$self->{'associated_files'}})) {
1024 if (defined $self->{'associated_files'}->[$i]->[3] &&
1025 $self->{'associated_files'}->[$i]->[3] eq $section) {
1026 splice (@{$self->{'associated_files'}}, $i, 1);
1027 } else {
1028 $i++;
1029 }
1030 }
1031}
1032
1033sub delete_assoc_file {
1034 my $self = shift (@_);
1035 my ($assoc_filename) = @_;
1036
1037 my $i=0;
1038 while ($i < scalar (@{$self->{'associated_files'}})) {
1039 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1040 splice (@{$self->{'associated_files'}}, $i, 1);
1041 } else {
1042 $i++;
1043 }
1044 }
1045}
1046
1047sub reset_nextsection_ptr {
1048 my $self = shift (@_);
1049 my ($section) = @_;
1050
1051 my $section_ptr = $self->_lookup_section($section);
1052 $section_ptr->{'next_subsection'} = 1;
1053}
1054
10551;
Note: See TracBrowser for help on using the repository browser.