source: gsdl/trunk/perllib/doc.pm@ 19494

Last change on this file since 19494 was 19494, checked in by davidb, 15 years ago

Supporting routines that exploit the new 'metafiles' structures, introduction to track which metadata.xml file a piece of metadata came from

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 28.5 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47our $OIDcount = 0;
48
49sub new {
50 my $class = shift (@_);
51 my ($source_filename, $doc_type, $rename_method) = @_;
52
53
54 my $self = bless {'associated_files'=>[],
55 'subsection_order'=>[],
56 'next_subsection'=>1,
57 'subsections'=>{},
58 'metadata'=>[],
59 'text'=>"",
60 'OIDtype'=>"hash"}, $class;
61
62 # used to set lastmodified here, but this can screw up the HASH ids, so
63 # the docsave processor now calls set_lastmodified
64
65 $self->{'source_path'} = $source_filename;
66
67 if (defined $source_filename) {
68 $source_filename = &util::filename_within_collection($source_filename);
69 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
70 $self->set_source_filename ($source_filename, $rename_method);
71 }
72
73 $self->set_doc_type ($doc_type) if defined $doc_type;
74
75 return $self;
76}
77# set lastmodified for OAI purposes, added by GRB, moved by kjdon
78sub set_lastmodified {
79 my $self = shift (@_);
80
81 my $source_path = $self->{'source_path'};
82
83 if (defined $source_path && (-e $source_path)) {
84
85 my $file_stat = stat($source_path);
86 my $mtime = $file_stat->mtime;
87 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
88 }
89}
90
91# clone the $self object
92sub duplicate {
93 my $self = shift (@_);
94
95 my $newobj = {};
96
97 foreach my $k (keys %$self) {
98 $newobj->{$k} = &clone ($self->{$k});
99 }
100
101 bless $newobj, ref($self);
102 return $newobj;
103}
104
105sub clone {
106 my ($from) = @_;
107 my $type = ref ($from);
108
109 if ($type eq "HASH") {
110 my $to = {};
111 foreach my $key (keys %$from) {
112 $to->{$key} = &clone ($from->{$key});
113 }
114 return $to;
115 } elsif ($type eq "ARRAY") {
116 my $to = [];
117 foreach my $v (@$from) {
118 push (@$to, &clone ($v));
119 }
120 return $to;
121 } else {
122 return $from;
123 }
124}
125
126sub set_OIDtype {
127 my $self = shift (@_);
128 my ($type, $metadata) = @_;
129
130 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|dirname|assigned)$/) {
131 $self->{'OIDtype'} = $type;
132 } else {
133 $self->{'OIDtype'} = "hash";
134 }
135
136 if ($type =~ /^assigned$/) {
137 if (defined $metadata) {
138 $self->{'OIDmetadata'} = $metadata;
139 } else {
140 $self->{'OIDmetadata'} = "dc.Identifier";
141 }
142 }
143}
144
145sub set_source_filename {
146 my $self = shift (@_);
147 my ($source_filename, $rename_method) = @_;
148
149 # Since the gsdlsourcefilename element goes into the doc.xml it has
150 # to be utf8. However, it should also *represent* the source filename
151 # (in the import directory) which may not be utf8 at all.
152 # For instance, if this meta element (gsdlsourcefilename) will be used
153 # by other applications that parse doc.xml in order to locate
154 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
155 # encode the real filename as this is a binary-to-text encoding meaning
156 # that the resulting string is ASCII (utf8). Decoding will give the original.
157
158# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
159
160 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
161 my ($srcfilename,$dirname,$suffix)
162 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
163# print STDERR "-> $srcfilename -> ";
164 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
165 $source_filename = &util::filename_cat($dirname, $srcfilename);
166# print STDERR "$source_filename\n";
167
168 $self->set_utf8_metadata_element ($self->get_top_section(),
169 "gsdlsourcefilename",
170 $source_filename);
171}
172
173sub set_converted_filename {
174 my $self = shift (@_);
175 my ($converted_filename) = @_;
176
177 # we know the converted filename is utf8
178 $self->set_utf8_metadata_element ($self->get_top_section(),
179 "gsdlconvertedfilename",
180 $converted_filename);
181}
182
183# returns the source_filename as it was provided
184sub get_source_filename {
185 my $self = shift (@_);
186
187 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
188}
189
190# returns converted filename if available else returns source filename
191sub get_filename_for_hashing {
192 my $self = shift (@_);
193
194 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
195
196 if (!defined $filename) {
197 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
198 # if NULPlug processed file, then don't give a filename
199 if (defined $plugin_name && $plugin_name eq "NULPlug") {
200 $filename = undef;
201 } else { # returns the URL encoded source filename!
202 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
203 }
204 }
205 return $filename;
206}
207
208sub set_doc_type {
209 my $self = shift (@_);
210 my ($doc_type) = @_;
211
212 $self->set_metadata_element ($self->get_top_section(),
213 "gsdldoctype",
214 $doc_type);
215}
216
217# returns the gsdldoctype as it was provided
218# the default of "indexed_doc" is used if no document
219# type was provided
220sub get_doc_type {
221 my $self = shift (@_);
222
223 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
224 return $doc_type if (defined $doc_type);
225 return "indexed_doc";
226}
227
228
229# look up the reference to the a particular section
230sub _lookup_section {
231 my $self = shift (@_);
232 my ($section) = @_;
233
234 my ($num);
235 my $sectionref = $self;
236
237 while (defined $section && $section ne "") {
238
239 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
240
241 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
242
243 $section = "" unless defined $section;
244
245
246 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
247 $sectionref = $sectionref->{'subsections'}->{$num};
248 } else {
249 return undef;
250 }
251 }
252
253 return $sectionref;
254}
255
256# calculate OID by hashing the contents of the document
257sub _calc_OID {
258 my $self = shift (@_);
259 my ($filename) = @_;
260
261 my $osexe = &util::get_os_exe();
262
263 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
264 $ENV{'GSDLOS'},"hashfile$osexe");
265
266 my $result = "NULL";
267
268 if (-e "$hashfile_exe") {
269# $result = `\"$hashfile_exe\" \"$filename\"`;
270 $result = `hashfile$osexe \"$filename\"`;
271 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
272 } else {
273 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
274 }
275 return "HASH$result";
276}
277
278# methods dealing with OID, not groups of them.
279
280# if $OID is not provided one is calculated
281sub set_OID {
282 my $self = shift (@_);
283 my ($OID) = @_;
284
285 my $use_hash_oid = 0;
286 # if an OID wasn't provided calculate one
287 if (!defined $OID) {
288 $OID = "NULL";
289 if ($self->{'OIDtype'} =~ /^hash/) {
290 $use_hash_oid = 1;
291 } elsif ($self->{'OIDtype'} eq "incremental") {
292 $OID = "D" . $OIDcount;
293 $OIDcount ++;
294
295 } elsif ($self->{'OIDtype'} eq "dirname") {
296 $OID = 'J';
297 my $filename = $self->get_source_filename();
298 if (defined($filename)) { # && -e $filename) {
299 $OID = &File::Basename::dirname($filename);
300 if (defined $OID) {
301 $OID = 'J'.&File::Basename::basename($OID);
302 $OID =~ s/\.//; #remove any periods
303 } else {
304 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
305 $use_hash_oid = 1;
306 }
307 } else {
308 print STDERR "Failed to find filename, generating hash id\n";
309 $use_hash_oid = 1;
310 }
311
312 } elsif ($self->{'OIDtype'} eq "assigned") {
313 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
314 if (defined $identifier && $identifier ne "") {
315 $OID = $identifier;
316 if ($OID =~ /\./) {
317 print STDERR "Warning, assigned identifier $identifier contains periods (.), removing them\n";
318 $OID =~ s/\.//g; #remove any periods
319 }
320 if ($OID =~ /^[\d]*$/) {
321 print STDERR "Warning, assigned identifier $identifier contains only digits. Prepending 'D'.\n";
322 $OID = "D" . $OID;
323 }
324 } else {
325 # need a hash id
326 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
327 $use_hash_oid = 1;
328 }
329
330 } else {
331 $use_hash_oid = 1;
332 }
333
334 if ($use_hash_oid) {
335 my $hash_on_file = 1;
336 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
337 $hash_on_file = 0;
338 }
339 if ($hash_on_file) {
340 # "hash" OID - feed file to hashfile.exe
341 my $filename = $self->get_filename_for_hashing();
342 # -z: don't want to hash on the file if it is zero size
343 if (defined($filename) && -e $filename && !-z $filename) {
344 $OID = $self->_calc_OID ($filename);
345 } else {
346 $hash_on_file = 0;
347 }
348 }
349 if (!$hash_on_file) {
350 my $filename = &util::get_tmp_filename();
351 if (!open (OUTFILE, ">$filename")) {
352 print STDERR "doc::set_OID could not write to $filename\n";
353 } else {
354 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
355 print OUTFILE $doc_text;
356 close (OUTFILE);
357 }
358 $OID = $self->_calc_OID ($filename);
359 &util::rm ($filename);
360 }
361 }
362 }
363 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
364}
365
366# this uses hashdoc (embedded c thingy) which is faster but still
367# needs a little work to be suffiently stable
368sub ___set_OID {
369 my $self = shift (@_);
370 my ($OID) = @_;
371
372 # if an OID wasn't provided then calculate hash value based on document
373 if (!defined $OID)
374 {
375 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
376 my $hash_len = length($hash_text);
377
378 $OID = &hashdoc::buffer($hash_text,$hash_len);
379 }
380
381 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
382}
383
384# returns the OID for this document
385sub get_OID {
386 my $self = shift (@_);
387 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
388 return $OID if (defined $OID);
389 return "NULL";
390}
391
392sub delete_OID {
393 my $self = shift (@_);
394
395 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
396}
397
398
399# methods for manipulating section names
400
401# returns the name of the top-most section (the top
402# level of the document
403sub get_top_section {
404 my $self = shift (@_);
405
406 return "";
407}
408
409# returns a section
410sub get_parent_section {
411 my $self = shift (@_);
412 my ($section) = @_;
413
414 $section =~ s/(^|\.)\d+$//;
415
416 return $section;
417}
418
419# returns the first child section (or the end child
420# if there isn't any)
421sub get_begin_child {
422 my $self = shift (@_);
423 my ($section) = @_;
424
425 my $section_ptr = $self->_lookup_section($section);
426 return "" unless defined $section_ptr;
427
428 if (defined $section_ptr->{'subsection_order'}->[0]) {
429 return "$section.$section_ptr->{'subsection_order'}->[0]";
430 }
431
432 return $self->get_end_child ($section);
433}
434
435# returns the next child of a parent section
436sub get_next_child {
437 my $self = shift (@_);
438 my ($section) = @_;
439
440 my $parent_section = $self->get_parent_section($section);
441 my $parent_section_ptr = $self->_lookup_section($parent_section);
442 return undef unless defined $parent_section_ptr;
443
444 my ($section_num) = $section =~ /(\d+)$/;
445 return undef unless defined $section_num;
446
447 my $i = 0;
448 my $section_order = $parent_section_ptr->{'subsection_order'};
449 while ($i < scalar(@$section_order)) {
450 last if $section_order->[$i] eq $section_num;
451 $i++;
452 }
453
454 $i++; # the next child
455 if ($i < scalar(@$section_order)) {
456 return $section_order->[$i] if $parent_section eq "";
457 return "$parent_section.$section_order->[$i]";
458 }
459
460 # no more sections in this level
461 return undef;
462}
463
464# returns a reference to a list of children
465sub get_children {
466 my $self = shift (@_);
467 my ($section) = @_;
468
469 my $section_ptr = $self->_lookup_section($section);
470 return [] unless defined $section_ptr;
471
472 my @children = @{$section_ptr->{'subsection_order'}};
473
474 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
475 return \@children;
476}
477
478# returns the child section one past the last one (which
479# is coded as "0")
480sub get_end_child {
481 my $self = shift (@_);
482 my ($section) = @_;
483
484 return $section . ".0" unless $section eq "";
485 return "0";
486}
487
488# returns the next section in book order
489sub get_next_section {
490 my $self = shift (@_);
491 my ($section) = @_;
492
493 return undef unless defined $section;
494
495 my $section_ptr = $self->_lookup_section($section);
496 return undef unless defined $section_ptr;
497
498 # first try to find first child
499 if (defined $section_ptr->{'subsection_order'}->[0]) {
500 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
501 return "$section.$section_ptr->{'subsection_order'}->[0]";
502 }
503
504 do {
505 # try to find sibling
506 my $next_child = $self->get_next_child ($section);
507 return $next_child if (defined $next_child);
508
509 # move up one level
510 $section = $self->get_parent_section ($section);
511 } while $section =~ /\d/;
512
513 return undef;
514}
515
516sub is_leaf_section {
517 my $self = shift (@_);
518 my ($section) = @_;
519
520 my $section_ptr = $self->_lookup_section($section);
521 return 1 unless defined $section_ptr;
522
523 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
524}
525
526# methods for dealing with sections
527
528# returns the name of the inserted section
529sub insert_section {
530 my $self = shift (@_);
531 my ($before_section) = @_;
532
533 # get the child to insert before and its parent section
534 my $parent_section = "";
535 my $before_child = "0";
536 my @before_section = split (/\./, $before_section);
537 if (scalar(@before_section) > 0) {
538 $before_child = pop (@before_section);
539 $parent_section = join (".", @before_section);
540 }
541
542 my $parent_section_ptr = $self->_lookup_section($parent_section);
543 if (!defined $parent_section_ptr) {
544 print STDERR "doc::insert_section couldn't find parent section " .
545 "$parent_section\n";
546 return;
547 }
548
549 # get the next section number
550 my $section_num = $parent_section_ptr->{'next_subsection'}++;
551
552 my $i = 0;
553 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
554 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
555 $i++;
556 }
557
558 # insert the section number into the order list
559 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
560
561 # add this section to the parent section
562 my $section_ptr = {'subsection_order'=>[],
563 'next_subsection'=>1,
564 'subsections'=>{},
565 'metadata'=>[],
566 'text'=>""};
567 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
568
569 # work out the full section number
570 my $section = $parent_section;
571 $section .= "." unless $section eq "";
572 $section .= $section_num;
573
574 return $section;
575}
576
577# creates a pre-named section
578sub create_named_section {
579 my $self = shift (@_);
580 my ($mastersection) = @_;
581
582 my ($num);
583 my $section = $mastersection;
584 my $sectionref = $self;
585
586 while ($section ne "") {
587 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
588 $num =~ s/^0+(\d)/$1/; # remove leading 0s
589 $section = "" unless defined $section;
590
591 if (defined $num) {
592 if (!defined $sectionref->{'subsections'}->{$num}) {
593 push (@{$sectionref->{'subsection_order'}}, $num);
594 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
595 'next_subsection'=>1,
596 'subsections'=>{},
597 'metadata'=>[],
598 'text'=>""};
599 if ($num >= $sectionref->{'next_subsection'}) {
600 $sectionref->{'next_subsection'} = $num + 1;
601 }
602 }
603 $sectionref = $sectionref->{'subsections'}->{$num};
604
605 } else {
606 print STDERR "doc::create_named_section couldn't create section ";
607 print STDERR "$mastersection\n";
608 last;
609 }
610 }
611}
612
613# returns a reference to a list of subsections
614sub list_subsections {
615 my $self = shift (@_);
616 my ($section) = @_;
617
618 my $section_ptr = $self->_lookup_section ($section);
619 if (!defined $section_ptr) {
620 print STDERR "doc::list_subsections couldn't find section $section\n";
621 return [];
622 }
623
624 return [@{$section_ptr->{'subsection_order'}}];
625}
626
627sub delete_section {
628 my $self = shift (@_);
629 my ($section) = @_;
630
631# my $section_ptr = {'subsection_order'=>[],
632# 'next_subsection'=>1,
633# 'subsections'=>{},
634# 'metadata'=>[],
635# 'text'=>""};
636
637 # if this is the top section reset everything
638 if ($section eq "") {
639 $self->{'subsection_order'} = [];
640 $self->{'subsections'} = {};
641 $self->{'metadata'} = [];
642 $self->{'text'} = "";
643 return;
644 }
645
646 # find the parent of the section to delete
647 my $parent_section = "";
648 my $child = "0";
649 my @section = split (/\./, $section);
650 if (scalar(@section) > 0) {
651 $child = pop (@section);
652 $parent_section = join (".", @section);
653 }
654
655 my $parent_section_ptr = $self->_lookup_section($parent_section);
656 if (!defined $parent_section_ptr) {
657 print STDERR "doc::delete_section couldn't find parent section " .
658 "$parent_section\n";
659 return;
660 }
661
662 # remove this section from the subsection_order list
663 my $i = 0;
664 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
665 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
666 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
667 last;
668 }
669 $i++;
670 }
671
672 # remove this section from the subsection hash
673 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
674 undef $parent_section_ptr->{'subsections'}->{$child};
675 }
676}
677
678#--
679# methods for dealing with metadata
680
681# set_metadata_element and get_metadata_element are for metadata
682# which should only have one value. add_meta_data and get_metadata
683# are for metadata which can have more than one value.
684
685# returns the first metadata value which matches field
686
687# This version of get metadata element works much like the one above,
688# except it allows for the namespace portion of a metadata element to
689# be ignored, thus if you are searching for dc.Title, the first piece
690# of matching metadata ending with the name Title (once any namespace
691# is removed) would be returned.
692# 28-11-2003 John Thompson
693sub get_metadata_element {
694 my $self = shift (@_);
695 my ($section, $field, $ignore_namespace) = @_;
696 my ($data);
697
698 $ignore_namespace = 0 unless defined $ignore_namespace;
699
700 my $section_ptr = $self->_lookup_section($section);
701 if (!defined $section_ptr) {
702 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
703 return;
704 }
705
706 # Remove the any namespace if we are being told to ignore them
707 if($ignore_namespace) {
708 $field =~ s/^\w*\.//;
709 }
710
711 foreach $data (@{$section_ptr->{'metadata'}}) {
712
713 my $data_name = $data->[0];
714
715 # Remove the any namespace if we are being told to ignore them
716 if($ignore_namespace) {
717 $data_name =~ s/^\w*\.//;
718 }
719
720 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
721 }
722
723 return undef; # was not found
724}
725
726# returns a list of the form [value1, value2, ...]
727sub get_metadata {
728 my $self = shift (@_);
729 my ($section, $field, $ignore_namespace) = @_;
730 my ($data);
731
732 $ignore_namespace = 0 unless defined $ignore_namespace;
733
734 my $section_ptr = $self->_lookup_section($section);
735 if (!defined $section_ptr) {
736 print STDERR "doc::get_metadata couldn't find section ",
737 $section, "\n";
738 return;
739 }
740
741 # Remove the any namespace if we are being told to ignore them
742 if($ignore_namespace) {
743 $field =~ s/^\w*\.//;
744 }
745
746 my @metadata = ();
747 foreach $data (@{$section_ptr->{'metadata'}}) {
748
749 my $data_name = $data->[0];
750 # Remove the any namespace if we are being told to ignore them
751 if($ignore_namespace) {
752 $data_name =~ s/^\w*\.//;
753 }
754
755 push (@metadata, $data->[1]) if ($data_name eq $field);
756 }
757
758 return \@metadata;
759}
760
761# returns a list of the form [[field,value],[field,value],...]
762sub get_all_metadata {
763 my $self = shift (@_);
764 my ($section) = @_;
765
766 my $section_ptr = $self->_lookup_section($section);
767 if (!defined $section_ptr) {
768 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
769 return;
770 }
771
772 return $section_ptr->{'metadata'};
773}
774
775# $value is optional
776sub delete_metadata {
777 my $self = shift (@_);
778 my ($section, $field, $value) = @_;
779
780 my $section_ptr = $self->_lookup_section($section);
781 if (!defined $section_ptr) {
782 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
783 return;
784 }
785
786 my $i = 0;
787 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
788 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
789 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
790 splice (@{$section_ptr->{'metadata'}}, $i, 1);
791 } else {
792 $i++;
793 }
794 }
795}
796
797sub delete_all_metadata {
798 my $self = shift (@_);
799 my ($section) = @_;
800
801 my $section_ptr = $self->_lookup_section($section);
802 if (!defined $section_ptr) {
803 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
804 return;
805 }
806
807 $section_ptr->{'metadata'} = [];
808}
809
810sub set_metadata_element {
811 my $self = shift (@_);
812 my ($section, $field, $value) = @_;
813
814 $self->set_utf8_metadata_element ($section, $field,
815 &unicode::ascii2utf8(\$value));
816}
817
818# set_utf8_metadata_element assumes the text has already been
819# converted to the UTF-8 encoding.
820sub set_utf8_metadata_element {
821 my $self = shift (@_);
822 my ($section, $field, $value) = @_;
823
824 $self->delete_metadata ($section, $field);
825 $self->add_utf8_metadata ($section, $field, $value);
826}
827
828
829# add_metadata assumes the text is in (extended) ascii form. For
830# text which has already been converted to the UTF-8 format use
831# add_utf8_metadata.
832sub add_metadata {
833 my $self = shift (@_);
834 my ($section, $field, $value) = @_;
835
836 $self->add_utf8_metadata ($section, $field,
837 &unicode::ascii2utf8(\$value));
838}
839
840sub add_utf8_metadata {
841 my $self = shift (@_);
842 my ($section, $field, $value) = @_;
843
844 my $section_ptr = $self->_lookup_section($section);
845 if (!defined $section_ptr) {
846 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
847 return;
848 }
849 if (!defined $value) {
850 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
851 return;
852 }
853 if (!defined $field) {
854 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
855 return;
856 }
857
858 #print STDERR "###$field=$value\n";
859 # double check that the value is utf-8
860 if (!&unicode::check_is_utf8($value)) {
861 print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
862 &unicode::ensure_utf8(\$value);
863 print STDERR " Tried converting to utf8: $value\n";
864 }
865
866 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
867}
868
869
870# methods for dealing with text
871
872# returns the text for a section
873sub get_text {
874 my $self = shift (@_);
875 my ($section) = @_;
876
877 my $section_ptr = $self->_lookup_section($section);
878 if (!defined $section_ptr) {
879 print STDERR "doc::get_text couldn't find section " .
880 "$section\n";
881 return "";
882 }
883
884 return $section_ptr->{'text'};
885}
886
887# returns the (utf-8 encoded) length of the text for a section
888sub get_text_length {
889 my $self = shift (@_);
890 my ($section) = @_;
891
892 my $section_ptr = $self->_lookup_section($section);
893 if (!defined $section_ptr) {
894 print STDERR "doc::get_text_length couldn't find section " .
895 "$section\n";
896 return 0;
897 }
898
899 return length ($section_ptr->{'text'});
900}
901
902sub delete_text {
903 my $self = shift (@_);
904 my ($section) = @_;
905
906 my $section_ptr = $self->_lookup_section($section);
907 if (!defined $section_ptr) {
908 print STDERR "doc::delete_text couldn't find section " .
909 "$section\n";
910 return;
911 }
912
913 $section_ptr->{'text'} = "";
914}
915
916# add_text assumes the text is in (extended) ascii form. For
917# text which has been already converted to the UTF-8 format
918# use add_utf8_text.
919sub add_text {
920 my $self = shift (@_);
921 my ($section, $text) = @_;
922
923 # convert the text to UTF-8 encoded unicode characters
924 # and add the text
925 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
926}
927
928
929# add_utf8_text assumes the text to be added has already
930# been converted to the UTF-8 encoding. For ascii text use
931# add_text
932sub add_utf8_text {
933 my $self = shift (@_);
934 my ($section, $text) = @_;
935
936 my $section_ptr = $self->_lookup_section($section);
937 if (!defined $section_ptr) {
938 print STDERR "doc::add_utf8_text couldn't find section " .
939 "$section\n";
940 return;
941 }
942
943 $section_ptr->{'text'} .= $text;
944}
945
946# returns the Source meta, which is the utf8 filename generated.
947# Added a separate method here for convenience
948sub get_source {
949 my $self = shift (@_);
950 return $self->get_metadata_element ($self->get_top_section(), "Source");
951}
952
953# returns the SourceFile meta, which is the url reference to the URL-encoded
954# version of Source (the utf8 filename). Added a separate method here for convenience
955sub get_sourcefile {
956 my $self = shift (@_);
957 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
958}
959
960# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
961# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
962sub get_assocfile_from_sourcefile {
963 my $self = shift (@_);
964
965 # get the SourceFile meta, which is a *URL* to a file on the filesystem
966 my $top_section = $self->get_top_section();
967 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
968
969 # get the actual filename as it exists on the filesystem which this url refers to
970 $source_file = &unicode::url_to_filename($source_file);
971 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
972 return $assocfilename;
973}
974
975# methods for dealing with associated files
976
977# a file is associated with a document, NOT a section.
978# if section is defined it is noted in the data structure
979# only so that files associated from a particular section
980# may be removed later (using delete_section_assoc_files)
981sub associate_file {
982 my $self = shift (@_);
983 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
984 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
985
986 # remove all associated files with the same name
987 $self->delete_assoc_file ($assoc_filename);
988
989 push (@{$self->{'associated_files'}},
990 [$real_filename, $assoc_filename, $mime_type, $section]);
991}
992
993# returns a list of associated files in the form
994# [[real_filename, assoc_filename, mimetype], ...]
995sub get_assoc_files {
996 my $self = shift (@_);
997
998 return $self->{'associated_files'};
999}
1000
1001
1002sub metadata_file {
1003 my $self = shift (@_);
1004 my ($real_filename, $filename) = @_;
1005
1006 push (@{$self->{'metadata_files'}},
1007 [$real_filename, $filename]);
1008}
1009
1010sub get_meta_files {
1011 my $self = shift (@_);
1012
1013 return $self->{'metadata_files'};
1014}
1015
1016sub delete_section_assoc_files {
1017 my $self = shift (@_);
1018 my ($section) = @_;
1019
1020 my $i=0;
1021 while ($i < scalar (@{$self->{'associated_files'}})) {
1022 if (defined $self->{'associated_files'}->[$i]->[3] &&
1023 $self->{'associated_files'}->[$i]->[3] eq $section) {
1024 splice (@{$self->{'associated_files'}}, $i, 1);
1025 } else {
1026 $i++;
1027 }
1028 }
1029}
1030
1031sub delete_assoc_file {
1032 my $self = shift (@_);
1033 my ($assoc_filename) = @_;
1034
1035 my $i=0;
1036 while ($i < scalar (@{$self->{'associated_files'}})) {
1037 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1038 splice (@{$self->{'associated_files'}}, $i, 1);
1039 } else {
1040 $i++;
1041 }
1042 }
1043}
1044
1045sub reset_nextsection_ptr {
1046 my $self = shift (@_);
1047 my ($section) = @_;
1048
1049 my $section_ptr = $self->_lookup_section($section);
1050 $section_ptr->{'next_subsection'} = 1;
1051}
1052
10531;
Note: See TracBrowser for help on using the repository browser.