source: gsdl/trunk/perllib/doc.pm@ 18430

Last change on this file since 18430 was 18319, checked in by ak19, 15 years ago

Now plugins provide the option of base64 encoding or url encoding filenames that are to be renamed (when copied into the archives dir). Previously renamed files would always be url-encoded. URL-encoding is the default now for most plugins except MP3Plugin and OggVorbisPlugin, where the default is base64 encoding. Base64 encoding filenames upon renaming them was introduced so that more files that browsers try to open in external applications can open them, since url encoding does not seem to be implemented the same everywhere (for instance, windows media player is unable to handle url-encoded wmv filenames when such files are launched in it through the browser).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 28.3 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47my $OIDcount = 0;
48
49sub new {
50 my $class = shift (@_);
51 my ($source_filename, $doc_type, $rename_method) = @_;
52
53
54 my $self = bless {'associated_files'=>[],
55 'subsection_order'=>[],
56 'next_subsection'=>1,
57 'subsections'=>{},
58 'metadata'=>[],
59 'text'=>"",
60 'OIDtype'=>"hash"}, $class;
61
62 # used to set lastmodified here, but this can screw up the HASH ids, so
63 # the docsave processor now calls set_lastmodified
64
65 $self->{'source_path'} = $source_filename;
66
67 if (defined $source_filename) {
68 $source_filename = &util::filename_within_collection($source_filename);
69 print STDERR "******doc.pm::new(): no file rename method provided\n" unless $rename_method;
70 $self->set_source_filename ($source_filename, $rename_method);
71 }
72
73 $self->set_doc_type ($doc_type) if defined $doc_type;
74
75 return $self;
76}
77# set lastmodified for OAI purposes, added by GRB, moved by kjdon
78sub set_lastmodified {
79 my $self = shift (@_);
80
81 my $source_path = $self->{'source_path'};
82
83 if (defined $source_path && (-e $source_path)) {
84
85 my $file_stat = stat($source_path);
86 my $mtime = $file_stat->mtime;
87 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
88 }
89}
90
91# clone the $self object
92sub duplicate {
93 my $self = shift (@_);
94
95 my $newobj = {};
96
97 foreach my $k (keys %$self) {
98 $newobj->{$k} = &clone ($self->{$k});
99 }
100
101 bless $newobj, ref($self);
102 return $newobj;
103}
104
105sub clone {
106 my ($from) = @_;
107 my $type = ref ($from);
108
109 if ($type eq "HASH") {
110 my $to = {};
111 foreach my $key (keys %$from) {
112 $to->{$key} = &clone ($from->{$key});
113 }
114 return $to;
115 } elsif ($type eq "ARRAY") {
116 my $to = [];
117 foreach my $v (@$from) {
118 push (@$to, &clone ($v));
119 }
120 return $to;
121 } else {
122 return $from;
123 }
124}
125
126sub set_OIDtype {
127 my $self = shift (@_);
128 my ($type, $metadata) = @_;
129
130 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|dirname|assigned)$/) {
131 $self->{'OIDtype'} = $type;
132 } else {
133 $self->{'OIDtype'} = "hash";
134 }
135
136 if ($type =~ /^assigned$/) {
137 if (defined $metadata) {
138 $self->{'OIDmetadata'} = $metadata;
139 } else {
140 $self->{'OIDmetadata'} = "dc.Identifier";
141 }
142 }
143}
144
145sub set_source_filename {
146 my $self = shift (@_);
147 my ($source_filename, $rename_method) = @_;
148
149 # Since the gsdlsourcefilename element goes into the doc.xml it has
150 # to be utf8. However, it should also *represent* the source filename
151 # (in the import directory) which may not be utf8 at all.
152 # For instance, if this meta element (gsdlsourcefilename) will be used
153 # by other applications that parse doc.xml in order to locate
154 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
155 # encode the real filename as this is a binary-to-text encoding meaning
156 # that the resulting string is ASCII (utf8). Decoding will give the original.
157
158# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
159
160 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
161 my ($srcfilename,$dirname,$suffix)
162 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
163# print STDERR "-> $srcfilename -> ";
164 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
165 $source_filename = &util::filename_cat($dirname, $srcfilename);
166# print STDERR "$source_filename\n";
167
168 $self->set_utf8_metadata_element ($self->get_top_section(),
169 "gsdlsourcefilename",
170 $source_filename);
171}
172
173sub set_converted_filename {
174 my $self = shift (@_);
175 my ($converted_filename) = @_;
176
177 # we know the converted filename is utf8
178 $self->set_utf8_metadata_element ($self->get_top_section(),
179 "gsdlconvertedfilename",
180 $converted_filename);
181}
182
183# returns the source_filename as it was provided
184sub get_source_filename {
185 my $self = shift (@_);
186
187 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
188}
189
190# returns converted filename if available else returns source filename
191sub get_filename_for_hashing {
192 my $self = shift (@_);
193
194 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
195
196 if (!defined $filename) {
197 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
198 # if NULPlug processed file, then don't give a filename
199 if (defined $plugin_name && $plugin_name eq "NULPlug") {
200 $filename = undef;
201 } else { # returns the URL encoded source filename!
202 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
203 }
204 }
205 return $filename;
206}
207
208sub set_doc_type {
209 my $self = shift (@_);
210 my ($doc_type) = @_;
211
212 $self->set_metadata_element ($self->get_top_section(),
213 "gsdldoctype",
214 $doc_type);
215}
216
217# returns the gsdldoctype as it was provided
218# the default of "indexed_doc" is used if no document
219# type was provided
220sub get_doc_type {
221 my $self = shift (@_);
222
223 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
224 return $doc_type if (defined $doc_type);
225 return "indexed_doc";
226}
227
228
229# look up the reference to the a particular section
230sub _lookup_section {
231 my $self = shift (@_);
232 my ($section) = @_;
233
234 my ($num);
235 my $sectionref = $self;
236
237 while (defined $section && $section ne "") {
238
239 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
240
241 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
242
243 $section = "" unless defined $section;
244
245
246 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
247 $sectionref = $sectionref->{'subsections'}->{$num};
248 } else {
249 return undef;
250 }
251 }
252
253 return $sectionref;
254}
255
256# calculate OID by hashing the contents of the document
257sub _calc_OID {
258 my $self = shift (@_);
259 my ($filename) = @_;
260
261 my $osexe = &util::get_os_exe();
262
263 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
264 $ENV{'GSDLOS'},"hashfile$osexe");
265
266 my $result = "NULL";
267
268 if (-e "$hashfile_exe") {
269# $result = `\"$hashfile_exe\" \"$filename\"`;
270 $result = `hashfile$osexe \"$filename\"`;
271 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
272 } else {
273 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
274 }
275 return "HASH$result";
276}
277
278# methods dealing with OID, not groups of them.
279
280# if $OID is not provided one is calculated
281sub set_OID {
282 my $self = shift (@_);
283 my ($OID) = @_;
284
285 my $use_hash_oid = 0;
286 # if an OID wasn't provided calculate one
287 if (!defined $OID) {
288 $OID = "NULL";
289 if ($self->{'OIDtype'} =~ /^hash/) {
290 $use_hash_oid = 1;
291 } elsif ($self->{'OIDtype'} eq "incremental") {
292 $OID = "D" . $OIDcount;
293 $OIDcount ++;
294
295 } elsif ($self->{'OIDtype'} eq "dirname") {
296 $OID = 'J';
297 my $filename = $self->get_source_filename();
298 if (defined($filename)) { # && -e $filename) {
299 $OID = &File::Basename::dirname($filename);
300 if (defined $OID) {
301 $OID = 'J'.&File::Basename::basename($OID);
302 $OID =~ s/\.//; #remove any periods
303 } else {
304 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
305 $use_hash_oid = 1;
306 }
307 } else {
308 print STDERR "Failed to find filename, generating hash id\n";
309 $use_hash_oid = 1;
310 }
311
312 } elsif ($self->{'OIDtype'} eq "assigned") {
313 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
314 if (defined $identifier && $identifier ne "") {
315 $OID = $identifier;
316 if ($OID =~ /\./) {
317 print STDERR "Warning, assigned identifier $identifier contains periods (.), removing them\n";
318 $OID =~ s/\.//g; #remove any periods
319 }
320 if ($OID =~ /^\d*$/) {
321 print STDERR "Warning, assigned identifier $identifier contains only digits. Prepending 'D'.\n";
322 $OID = "D" . $OID;
323 }
324 if ($OID =~ /^[\d]*$/) {
325 $OID = "D" . $OID;
326 print STDERR "OID only contains numbers, adding a D\n";
327 }
328
329 } else {
330 # need a hash id
331 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
332 $use_hash_oid = 1;
333 }
334
335 } else {
336 $use_hash_oid = 1;
337 }
338
339 if ($use_hash_oid) {
340 my $hash_on_file = 1;
341 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
342 $hash_on_file = 0;
343 }
344 if ($hash_on_file) {
345 # "hash" OID - feed file to hashfile.exe
346 my $filename = $self->get_filename_for_hashing();
347 # -z: don't want to hash on the file if it is zero size
348 if (defined($filename) && -e $filename && !-z $filename) {
349 $OID = $self->_calc_OID ($filename);
350 } else {
351 $hash_on_file = 0;
352 }
353 }
354 if (!$hash_on_file) {
355 my $filename = &util::get_tmp_filename();
356 if (!open (OUTFILE, ">$filename")) {
357 print STDERR "doc::set_OID could not write to $filename\n";
358 } else {
359 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
360 print OUTFILE $doc_text;
361 close (OUTFILE);
362 }
363 $OID = $self->_calc_OID ($filename);
364 &util::rm ($filename);
365 }
366 }
367 }
368 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
369}
370
371# this uses hashdoc (embedded c thingy) which is faster but still
372# needs a little work to be suffiently stable
373sub ___set_OID {
374 my $self = shift (@_);
375 my ($OID) = @_;
376
377 # if an OID wasn't provided then calculate hash value based on document
378 if (!defined $OID)
379 {
380 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
381 my $hash_len = length($hash_text);
382
383 $OID = &hashdoc::buffer($hash_text,$hash_len);
384 }
385
386 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
387}
388
389# returns the OID for this document
390sub get_OID {
391 my $self = shift (@_);
392 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
393 return $OID if (defined $OID);
394 return "NULL";
395}
396
397sub delete_OID {
398 my $self = shift (@_);
399
400 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
401}
402
403
404# methods for manipulating section names
405
406# returns the name of the top-most section (the top
407# level of the document
408sub get_top_section {
409 my $self = shift (@_);
410
411 return "";
412}
413
414# returns a section
415sub get_parent_section {
416 my $self = shift (@_);
417 my ($section) = @_;
418
419 $section =~ s/(^|\.)\d+$//;
420
421 return $section;
422}
423
424# returns the first child section (or the end child
425# if there isn't any)
426sub get_begin_child {
427 my $self = shift (@_);
428 my ($section) = @_;
429
430 my $section_ptr = $self->_lookup_section($section);
431 return "" unless defined $section_ptr;
432
433 if (defined $section_ptr->{'subsection_order'}->[0]) {
434 return "$section.$section_ptr->{'subsection_order'}->[0]";
435 }
436
437 return $self->get_end_child ($section);
438}
439
440# returns the next child of a parent section
441sub get_next_child {
442 my $self = shift (@_);
443 my ($section) = @_;
444
445 my $parent_section = $self->get_parent_section($section);
446 my $parent_section_ptr = $self->_lookup_section($parent_section);
447 return undef unless defined $parent_section_ptr;
448
449 my ($section_num) = $section =~ /(\d+)$/;
450 return undef unless defined $section_num;
451
452 my $i = 0;
453 my $section_order = $parent_section_ptr->{'subsection_order'};
454 while ($i < scalar(@$section_order)) {
455 last if $section_order->[$i] eq $section_num;
456 $i++;
457 }
458
459 $i++; # the next child
460 if ($i < scalar(@$section_order)) {
461 return $section_order->[$i] if $parent_section eq "";
462 return "$parent_section.$section_order->[$i]";
463 }
464
465 # no more sections in this level
466 return undef;
467}
468
469# returns a reference to a list of children
470sub get_children {
471 my $self = shift (@_);
472 my ($section) = @_;
473
474 my $section_ptr = $self->_lookup_section($section);
475 return [] unless defined $section_ptr;
476
477 my @children = @{$section_ptr->{'subsection_order'}};
478
479 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
480 return \@children;
481}
482
483# returns the child section one past the last one (which
484# is coded as "0")
485sub get_end_child {
486 my $self = shift (@_);
487 my ($section) = @_;
488
489 return $section . ".0" unless $section eq "";
490 return "0";
491}
492
493# returns the next section in book order
494sub get_next_section {
495 my $self = shift (@_);
496 my ($section) = @_;
497
498 return undef unless defined $section;
499
500 my $section_ptr = $self->_lookup_section($section);
501 return undef unless defined $section_ptr;
502
503 # first try to find first child
504 if (defined $section_ptr->{'subsection_order'}->[0]) {
505 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
506 return "$section.$section_ptr->{'subsection_order'}->[0]";
507 }
508
509 do {
510 # try to find sibling
511 my $next_child = $self->get_next_child ($section);
512 return $next_child if (defined $next_child);
513
514 # move up one level
515 $section = $self->get_parent_section ($section);
516 } while $section =~ /\d/;
517
518 return undef;
519}
520
521sub is_leaf_section {
522 my $self = shift (@_);
523 my ($section) = @_;
524
525 my $section_ptr = $self->_lookup_section($section);
526 return 1 unless defined $section_ptr;
527
528 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
529}
530
531# methods for dealing with sections
532
533# returns the name of the inserted section
534sub insert_section {
535 my $self = shift (@_);
536 my ($before_section) = @_;
537
538 # get the child to insert before and its parent section
539 my $parent_section = "";
540 my $before_child = "0";
541 my @before_section = split (/\./, $before_section);
542 if (scalar(@before_section) > 0) {
543 $before_child = pop (@before_section);
544 $parent_section = join (".", @before_section);
545 }
546
547 my $parent_section_ptr = $self->_lookup_section($parent_section);
548 if (!defined $parent_section_ptr) {
549 print STDERR "doc::insert_section couldn't find parent section " .
550 "$parent_section\n";
551 return;
552 }
553
554 # get the next section number
555 my $section_num = $parent_section_ptr->{'next_subsection'}++;
556
557 my $i = 0;
558 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
559 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
560 $i++;
561 }
562
563 # insert the section number into the order list
564 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
565
566 # add this section to the parent section
567 my $section_ptr = {'subsection_order'=>[],
568 'next_subsection'=>1,
569 'subsections'=>{},
570 'metadata'=>[],
571 'text'=>""};
572 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
573
574 # work out the full section number
575 my $section = $parent_section;
576 $section .= "." unless $section eq "";
577 $section .= $section_num;
578
579 return $section;
580}
581
582# creates a pre-named section
583sub create_named_section {
584 my $self = shift (@_);
585 my ($mastersection) = @_;
586
587 my ($num);
588 my $section = $mastersection;
589 my $sectionref = $self;
590
591 while ($section ne "") {
592 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
593 $num =~ s/^0+(\d)/$1/; # remove leading 0s
594 $section = "" unless defined $section;
595
596 if (defined $num) {
597 if (!defined $sectionref->{'subsections'}->{$num}) {
598 push (@{$sectionref->{'subsection_order'}}, $num);
599 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
600 'next_subsection'=>1,
601 'subsections'=>{},
602 'metadata'=>[],
603 'text'=>""};
604 if ($num >= $sectionref->{'next_subsection'}) {
605 $sectionref->{'next_subsection'} = $num + 1;
606 }
607 }
608 $sectionref = $sectionref->{'subsections'}->{$num};
609
610 } else {
611 print STDERR "doc::create_named_section couldn't create section ";
612 print STDERR "$mastersection\n";
613 last;
614 }
615 }
616}
617
618# returns a reference to a list of subsections
619sub list_subsections {
620 my $self = shift (@_);
621 my ($section) = @_;
622
623 my $section_ptr = $self->_lookup_section ($section);
624 if (!defined $section_ptr) {
625 print STDERR "doc::list_subsections couldn't find section $section\n";
626 return [];
627 }
628
629 return [@{$section_ptr->{'subsection_order'}}];
630}
631
632sub delete_section {
633 my $self = shift (@_);
634 my ($section) = @_;
635
636# my $section_ptr = {'subsection_order'=>[],
637# 'next_subsection'=>1,
638# 'subsections'=>{},
639# 'metadata'=>[],
640# 'text'=>""};
641
642 # if this is the top section reset everything
643 if ($section eq "") {
644 $self->{'subsection_order'} = [];
645 $self->{'subsections'} = {};
646 $self->{'metadata'} = [];
647 $self->{'text'} = "";
648 return;
649 }
650
651 # find the parent of the section to delete
652 my $parent_section = "";
653 my $child = "0";
654 my @section = split (/\./, $section);
655 if (scalar(@section) > 0) {
656 $child = pop (@section);
657 $parent_section = join (".", @section);
658 }
659
660 my $parent_section_ptr = $self->_lookup_section($parent_section);
661 if (!defined $parent_section_ptr) {
662 print STDERR "doc::delete_section couldn't find parent section " .
663 "$parent_section\n";
664 return;
665 }
666
667 # remove this section from the subsection_order list
668 my $i = 0;
669 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
670 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
671 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
672 last;
673 }
674 $i++;
675 }
676
677 # remove this section from the subsection hash
678 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
679 undef $parent_section_ptr->{'subsections'}->{$child};
680 }
681}
682
683#--
684# methods for dealing with metadata
685
686# set_metadata_element and get_metadata_element are for metadata
687# which should only have one value. add_meta_data and get_metadata
688# are for metadata which can have more than one value.
689
690# returns the first metadata value which matches field
691
692# This version of get metadata element works much like the one above,
693# except it allows for the namespace portion of a metadata element to
694# be ignored, thus if you are searching for dc.Title, the first piece
695# of matching metadata ending with the name Title (once any namespace
696# is removed) would be returned.
697# 28-11-2003 John Thompson
698sub get_metadata_element {
699 my $self = shift (@_);
700 my ($section, $field, $ignore_namespace) = @_;
701 my ($data);
702
703 $ignore_namespace = 0 unless defined $ignore_namespace;
704
705 my $section_ptr = $self->_lookup_section($section);
706 if (!defined $section_ptr) {
707 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
708 return;
709 }
710
711 # Remove the any namespace if we are being told to ignore them
712 if($ignore_namespace) {
713 $field =~ s/^\w*\.//;
714 }
715
716 foreach $data (@{$section_ptr->{'metadata'}}) {
717
718 my $data_name = $data->[0];
719
720 # Remove the any namespace if we are being told to ignore them
721 if($ignore_namespace) {
722 $data_name =~ s/^\w*\.//;
723 }
724
725 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
726 }
727
728 return undef; # was not found
729}
730
731# returns a list of the form [value1, value2, ...]
732sub get_metadata {
733 my $self = shift (@_);
734 my ($section, $field, $ignore_namespace) = @_;
735 my ($data);
736
737 $ignore_namespace = 0 unless defined $ignore_namespace;
738
739 my $section_ptr = $self->_lookup_section($section);
740 if (!defined $section_ptr) {
741 print STDERR "doc::get_metadata couldn't find section ",
742 $section, "\n";
743 return;
744 }
745
746 # Remove the any namespace if we are being told to ignore them
747 if($ignore_namespace) {
748 $field =~ s/^\w*\.//;
749 }
750
751 my @metadata = ();
752 foreach $data (@{$section_ptr->{'metadata'}}) {
753
754 my $data_name = $data->[0];
755 # Remove the any namespace if we are being told to ignore them
756 if($ignore_namespace) {
757 $data_name =~ s/^\w*\.//;
758 }
759
760 push (@metadata, $data->[1]) if ($data_name eq $field);
761 }
762
763 return \@metadata;
764}
765
766# returns a list of the form [[field,value],[field,value],...]
767sub get_all_metadata {
768 my $self = shift (@_);
769 my ($section) = @_;
770
771 my $section_ptr = $self->_lookup_section($section);
772 if (!defined $section_ptr) {
773 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
774 return;
775 }
776
777 return $section_ptr->{'metadata'};
778}
779
780# $value is optional
781sub delete_metadata {
782 my $self = shift (@_);
783 my ($section, $field, $value) = @_;
784
785 my $section_ptr = $self->_lookup_section($section);
786 if (!defined $section_ptr) {
787 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
788 return;
789 }
790
791 my $i = 0;
792 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
793 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
794 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
795 splice (@{$section_ptr->{'metadata'}}, $i, 1);
796 } else {
797 $i++;
798 }
799 }
800}
801
802sub delete_all_metadata {
803 my $self = shift (@_);
804 my ($section) = @_;
805
806 my $section_ptr = $self->_lookup_section($section);
807 if (!defined $section_ptr) {
808 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
809 return;
810 }
811
812 $section_ptr->{'metadata'} = [];
813}
814
815sub set_metadata_element {
816 my $self = shift (@_);
817 my ($section, $field, $value) = @_;
818
819 $self->set_utf8_metadata_element ($section, $field,
820 &unicode::ascii2utf8(\$value));
821}
822
823# set_utf8_metadata_element assumes the text has already been
824# converted to the UTF-8 encoding.
825sub set_utf8_metadata_element {
826 my $self = shift (@_);
827 my ($section, $field, $value) = @_;
828
829 $self->delete_metadata ($section, $field);
830 $self->add_utf8_metadata ($section, $field, $value);
831}
832
833
834# add_metadata assumes the text is in (extended) ascii form. For
835# text which has already been converted to the UTF-8 format use
836# add_utf8_metadata.
837sub add_metadata {
838 my $self = shift (@_);
839 my ($section, $field, $value) = @_;
840
841 $self->add_utf8_metadata ($section, $field,
842 &unicode::ascii2utf8(\$value));
843}
844
845sub add_utf8_metadata {
846 my $self = shift (@_);
847 my ($section, $field, $value) = @_;
848
849 my $section_ptr = $self->_lookup_section($section);
850 if (!defined $section_ptr) {
851 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
852 return;
853 }
854 if (!defined $value) {
855 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
856 return;
857 }
858 if (!defined $field) {
859 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
860 return;
861 }
862
863 #print STDERR "###$field=$value\n";
864 # double check that the value is utf-8
865 if (!&unicode::check_is_utf8($value)) {
866 print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
867 &unicode::ensure_utf8(\$value);
868 print STDERR " Tried converting to utf8: $value\n";
869 }
870
871 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
872}
873
874
875# methods for dealing with text
876
877# returns the text for a section
878sub get_text {
879 my $self = shift (@_);
880 my ($section) = @_;
881
882 my $section_ptr = $self->_lookup_section($section);
883 if (!defined $section_ptr) {
884 print STDERR "doc::get_text couldn't find section " .
885 "$section\n";
886 return "";
887 }
888
889 return $section_ptr->{'text'};
890}
891
892# returns the (utf-8 encoded) length of the text for a section
893sub get_text_length {
894 my $self = shift (@_);
895 my ($section) = @_;
896
897 my $section_ptr = $self->_lookup_section($section);
898 if (!defined $section_ptr) {
899 print STDERR "doc::get_text_length couldn't find section " .
900 "$section\n";
901 return 0;
902 }
903
904 return length ($section_ptr->{'text'});
905}
906
907sub delete_text {
908 my $self = shift (@_);
909 my ($section) = @_;
910
911 my $section_ptr = $self->_lookup_section($section);
912 if (!defined $section_ptr) {
913 print STDERR "doc::delete_text couldn't find section " .
914 "$section\n";
915 return;
916 }
917
918 $section_ptr->{'text'} = "";
919}
920
921# add_text assumes the text is in (extended) ascii form. For
922# text which has been already converted to the UTF-8 format
923# use add_utf8_text.
924sub add_text {
925 my $self = shift (@_);
926 my ($section, $text) = @_;
927
928 # convert the text to UTF-8 encoded unicode characters
929 # and add the text
930 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
931}
932
933
934# add_utf8_text assumes the text to be added has already
935# been converted to the UTF-8 encoding. For ascii text use
936# add_text
937sub add_utf8_text {
938 my $self = shift (@_);
939 my ($section, $text) = @_;
940
941 my $section_ptr = $self->_lookup_section($section);
942 if (!defined $section_ptr) {
943 print STDERR "doc::add_utf8_text couldn't find section " .
944 "$section\n";
945 return;
946 }
947
948 $section_ptr->{'text'} .= $text;
949}
950
951# returns the Source meta, which is the utf8 filename generated.
952# Added a separate method here for convenience
953sub get_source {
954 my $self = shift (@_);
955 return $self->get_metadata_element ($self->get_top_section(), "Source");
956}
957
958# returns the SourceFile meta, which is the url reference to the URL-encoded
959# version of Source (the utf8 filename). Added a separate method here for convenience
960sub get_sourcefile {
961 my $self = shift (@_);
962 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
963}
964
965# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
966# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
967sub get_assocfile_from_sourcefile {
968 my $self = shift (@_);
969
970 # get the SourceFile meta, which is a *URL* to a file on the filesystem
971 my $top_section = $self->get_top_section();
972 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
973
974 # get the actual filename as it exists on the filesystem which this url refers to
975 $source_file = &unicode::url_to_filename($source_file);
976 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
977 return $assocfilename;
978}
979
980# methods for dealing with associated files
981
982# a file is associated with a document, NOT a section.
983# if section is defined it is noted in the data structure
984# only so that files associated from a particular section
985# may be removed later (using delete_section_assoc_files)
986sub associate_file {
987 my $self = shift (@_);
988 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
989 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
990
991 # remove all associated files with the same name
992 $self->delete_assoc_file ($assoc_filename);
993
994 push (@{$self->{'associated_files'}},
995 [$real_filename, $assoc_filename, $mime_type, $section]);
996}
997
998# returns a list of associated files in the form
999# [[real_filename, assoc_filename, mimetype], ...]
1000sub get_assoc_files {
1001 my $self = shift (@_);
1002
1003 return $self->{'associated_files'};
1004}
1005
1006sub delete_section_assoc_files {
1007 my $self = shift (@_);
1008 my ($section) = @_;
1009
1010 my $i=0;
1011 while ($i < scalar (@{$self->{'associated_files'}})) {
1012 if (defined $self->{'associated_files'}->[$i]->[3] &&
1013 $self->{'associated_files'}->[$i]->[3] eq $section) {
1014 splice (@{$self->{'associated_files'}}, $i, 1);
1015 } else {
1016 $i++;
1017 }
1018 }
1019}
1020
1021sub delete_assoc_file {
1022 my $self = shift (@_);
1023 my ($assoc_filename) = @_;
1024
1025 my $i=0;
1026 while ($i < scalar (@{$self->{'associated_files'}})) {
1027 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1028 splice (@{$self->{'associated_files'}}, $i, 1);
1029 } else {
1030 $i++;
1031 }
1032 }
1033}
1034
1035sub reset_nextsection_ptr {
1036 my $self = shift (@_);
1037 my ($section) = @_;
1038
1039 my $section_ptr = $self->_lookup_section($section);
1040 $section_ptr->{'next_subsection'} = 1;
1041}
1042
10431;
Note: See TracBrowser for help on using the repository browser.