source: gsdl/trunk/perllib/doc.pm@ 18508

Last change on this file since 18508 was 18508, checked in by davidb, 15 years ago

Had to move location of where deletion of archive files was done

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 28.3 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47my $OIDcount = 0;
48
49sub new {
50 my $class = shift (@_);
51 my ($source_filename, $doc_type, $rename_method) = @_;
52
53
54 my $self = bless {'associated_files'=>[],
55 'subsection_order'=>[],
56 'next_subsection'=>1,
57 'subsections'=>{},
58 'metadata'=>[],
59 'text'=>"",
60 'OIDtype'=>"hash"}, $class;
61
62 # used to set lastmodified here, but this can screw up the HASH ids, so
63 # the docsave processor now calls set_lastmodified
64
65 $self->{'source_path'} = $source_filename;
66
67 if (defined $source_filename) {
68 $source_filename = &util::filename_within_collection($source_filename);
69 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
70 $self->set_source_filename ($source_filename, $rename_method);
71 }
72
73 $self->set_doc_type ($doc_type) if defined $doc_type;
74
75 return $self;
76}
77# set lastmodified for OAI purposes, added by GRB, moved by kjdon
78sub set_lastmodified {
79 my $self = shift (@_);
80
81 my $source_path = $self->{'source_path'};
82
83 if (defined $source_path && (-e $source_path)) {
84
85 my $file_stat = stat($source_path);
86 my $mtime = $file_stat->mtime;
87 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
88 }
89}
90
91# clone the $self object
92sub duplicate {
93 my $self = shift (@_);
94
95 my $newobj = {};
96
97 foreach my $k (keys %$self) {
98 $newobj->{$k} = &clone ($self->{$k});
99 }
100
101 bless $newobj, ref($self);
102 return $newobj;
103}
104
105sub clone {
106 my ($from) = @_;
107 my $type = ref ($from);
108
109 if ($type eq "HASH") {
110 my $to = {};
111 foreach my $key (keys %$from) {
112 $to->{$key} = &clone ($from->{$key});
113 }
114 return $to;
115 } elsif ($type eq "ARRAY") {
116 my $to = [];
117 foreach my $v (@$from) {
118 push (@$to, &clone ($v));
119 }
120 return $to;
121 } else {
122 return $from;
123 }
124}
125
126sub set_OIDtype {
127 my $self = shift (@_);
128 my ($type, $metadata) = @_;
129
130 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|dirname|assigned)$/) {
131 $self->{'OIDtype'} = $type;
132 } else {
133 $self->{'OIDtype'} = "hash";
134 }
135
136 if ($type =~ /^assigned$/) {
137 if (defined $metadata) {
138 $self->{'OIDmetadata'} = $metadata;
139 } else {
140 $self->{'OIDmetadata'} = "dc.Identifier";
141 }
142 }
143}
144
145sub set_source_filename {
146 my $self = shift (@_);
147 my ($source_filename, $rename_method) = @_;
148
149 # Since the gsdlsourcefilename element goes into the doc.xml it has
150 # to be utf8. However, it should also *represent* the source filename
151 # (in the import directory) which may not be utf8 at all.
152 # For instance, if this meta element (gsdlsourcefilename) will be used
153 # by other applications that parse doc.xml in order to locate
154 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
155 # encode the real filename as this is a binary-to-text encoding meaning
156 # that the resulting string is ASCII (utf8). Decoding will give the original.
157
158# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
159
160 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
161 my ($srcfilename,$dirname,$suffix)
162 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
163# print STDERR "-> $srcfilename -> ";
164 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
165 $source_filename = &util::filename_cat($dirname, $srcfilename);
166# print STDERR "$source_filename\n";
167
168 $self->set_utf8_metadata_element ($self->get_top_section(),
169 "gsdlsourcefilename",
170 $source_filename);
171}
172
173sub set_converted_filename {
174 my $self = shift (@_);
175 my ($converted_filename) = @_;
176
177 # we know the converted filename is utf8
178 $self->set_utf8_metadata_element ($self->get_top_section(),
179 "gsdlconvertedfilename",
180 $converted_filename);
181}
182
183# returns the source_filename as it was provided
184sub get_source_filename {
185 my $self = shift (@_);
186
187 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
188}
189
190# returns converted filename if available else returns source filename
191sub get_filename_for_hashing {
192 my $self = shift (@_);
193
194 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
195
196 if (!defined $filename) {
197 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
198 # if NULPlug processed file, then don't give a filename
199 if (defined $plugin_name && $plugin_name eq "NULPlug") {
200 $filename = undef;
201 } else { # returns the URL encoded source filename!
202 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
203 }
204 }
205 return $filename;
206}
207
208sub set_doc_type {
209 my $self = shift (@_);
210 my ($doc_type) = @_;
211
212 $self->set_metadata_element ($self->get_top_section(),
213 "gsdldoctype",
214 $doc_type);
215}
216
217# returns the gsdldoctype as it was provided
218# the default of "indexed_doc" is used if no document
219# type was provided
220sub get_doc_type {
221 my $self = shift (@_);
222
223 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
224 return $doc_type if (defined $doc_type);
225 return "indexed_doc";
226}
227
228
229# look up the reference to the a particular section
230sub _lookup_section {
231 my $self = shift (@_);
232 my ($section) = @_;
233
234 my ($num);
235 my $sectionref = $self;
236
237 while (defined $section && $section ne "") {
238
239 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
240
241 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
242
243 $section = "" unless defined $section;
244
245
246 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
247 $sectionref = $sectionref->{'subsections'}->{$num};
248 } else {
249 return undef;
250 }
251 }
252
253 return $sectionref;
254}
255
256# calculate OID by hashing the contents of the document
257sub _calc_OID {
258 my $self = shift (@_);
259 my ($filename) = @_;
260
261 my $osexe = &util::get_os_exe();
262
263 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
264 $ENV{'GSDLOS'},"hashfile$osexe");
265
266 my $result = "NULL";
267
268 if (-e "$hashfile_exe") {
269# $result = `\"$hashfile_exe\" \"$filename\"`;
270 $result = `hashfile$osexe \"$filename\"`;
271 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
272 } else {
273 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
274 }
275 return "HASH$result";
276}
277
278# methods dealing with OID, not groups of them.
279
280# if $OID is not provided one is calculated
281sub set_OID {
282 my $self = shift (@_);
283 my ($OID) = @_;
284
285 my $use_hash_oid = 0;
286 # if an OID wasn't provided calculate one
287 if (!defined $OID) {
288 $OID = "NULL";
289 if ($self->{'OIDtype'} =~ /^hash/) {
290 $use_hash_oid = 1;
291 } elsif ($self->{'OIDtype'} eq "incremental") {
292 $OID = "D" . $OIDcount;
293 $OIDcount ++;
294
295 } elsif ($self->{'OIDtype'} eq "dirname") {
296 $OID = 'J';
297 my $filename = $self->get_source_filename();
298 if (defined($filename)) { # && -e $filename) {
299 $OID = &File::Basename::dirname($filename);
300 if (defined $OID) {
301 $OID = 'J'.&File::Basename::basename($OID);
302 $OID =~ s/\.//; #remove any periods
303 } else {
304 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
305 $use_hash_oid = 1;
306 }
307 } else {
308 print STDERR "Failed to find filename, generating hash id\n";
309 $use_hash_oid = 1;
310 }
311
312 } elsif ($self->{'OIDtype'} eq "assigned") {
313 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
314 if (defined $identifier && $identifier ne "") {
315 $OID = $identifier;
316 if ($OID =~ /\./) {
317 print STDERR "Warning, assigned identifier $identifier contains periods (.), removing them\n";
318 $OID =~ s/\.//g; #remove any periods
319 }
320 if ($OID =~ /^\d*$/) {
321 print STDERR "Warning, assigned identifier $identifier contains only digits. Prepending 'D'.\n";
322 $OID = "D" . $OID;
323 }
324 if ($OID =~ /^[\d]*$/) {
325 $OID = "D" . $OID;
326 print STDERR "OID only contains numbers, adding a D\n";
327 }
328
329 } else {
330 # need a hash id
331 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
332 $use_hash_oid = 1;
333 }
334
335 } else {
336 $use_hash_oid = 1;
337 }
338
339 if ($use_hash_oid) {
340 my $hash_on_file = 1;
341 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
342 $hash_on_file = 0;
343 }
344 if ($hash_on_file) {
345 # "hash" OID - feed file to hashfile.exe
346 my $filename = $self->get_filename_for_hashing();
347 # -z: don't want to hash on the file if it is zero size
348 if (defined($filename) && -e $filename && !-z $filename) {
349 $OID = $self->_calc_OID ($filename);
350 } else {
351 $hash_on_file = 0;
352 }
353 }
354 if (!$hash_on_file) {
355 my $filename = &util::get_tmp_filename();
356 if (!open (OUTFILE, ">$filename")) {
357 print STDERR "doc::set_OID could not write to $filename\n";
358 } else {
359 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
360 print OUTFILE $doc_text;
361 close (OUTFILE);
362 }
363 $OID = $self->_calc_OID ($filename);
364 &util::rm ($filename);
365 }
366 }
367 }
368 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
369}
370
371# this uses hashdoc (embedded c thingy) which is faster but still
372# needs a little work to be suffiently stable
373sub ___set_OID {
374 my $self = shift (@_);
375 my ($OID) = @_;
376
377 # if an OID wasn't provided then calculate hash value based on document
378 if (!defined $OID)
379 {
380 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
381 my $hash_len = length($hash_text);
382
383 $OID = &hashdoc::buffer($hash_text,$hash_len);
384 }
385
386 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
387}
388
389# returns the OID for this document
390sub get_OID {
391 my $self = shift (@_);
392 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
393 return $OID if (defined $OID);
394 return "NULL";
395}
396
397sub delete_OID {
398 my $self = shift (@_);
399
400 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
401}
402
403
404# methods for manipulating section names
405
406# returns the name of the top-most section (the top
407# level of the document
408sub get_top_section {
409 my $self = shift (@_);
410
411 return "";
412}
413
414# returns a section
415sub get_parent_section {
416 my $self = shift (@_);
417 my ($section) = @_;
418
419 $section =~ s/(^|\.)\d+$//;
420
421 return $section;
422}
423
424# returns the first child section (or the end child
425# if there isn't any)
426sub get_begin_child {
427 my $self = shift (@_);
428 my ($section) = @_;
429
430 my $section_ptr = $self->_lookup_section($section);
431 return "" unless defined $section_ptr;
432
433 if (defined $section_ptr->{'subsection_order'}->[0]) {
434 return "$section.$section_ptr->{'subsection_order'}->[0]";
435 }
436
437 return $self->get_end_child ($section);
438}
439
440# returns the next child of a parent section
441sub get_next_child {
442 my $self = shift (@_);
443 my ($section) = @_;
444
445 my $parent_section = $self->get_parent_section($section);
446 my $parent_section_ptr = $self->_lookup_section($parent_section);
447 return undef unless defined $parent_section_ptr;
448
449 my ($section_num) = $section =~ /(\d+)$/;
450 return undef unless defined $section_num;
451
452 my $i = 0;
453 my $section_order = $parent_section_ptr->{'subsection_order'};
454 while ($i < scalar(@$section_order)) {
455 last if $section_order->[$i] eq $section_num;
456 $i++;
457 }
458
459 $i++; # the next child
460 if ($i < scalar(@$section_order)) {
461 return $section_order->[$i] if $parent_section eq "";
462 return "$parent_section.$section_order->[$i]";
463 }
464
465 # no more sections in this level
466 return undef;
467}
468
469# returns a reference to a list of children
470sub get_children {
471 my $self = shift (@_);
472 my ($section) = @_;
473
474 my $section_ptr = $self->_lookup_section($section);
475 return [] unless defined $section_ptr;
476
477 my @children = @{$section_ptr->{'subsection_order'}};
478
479 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
480 return \@children;
481}
482
483# returns the child section one past the last one (which
484# is coded as "0")
485sub get_end_child {
486 my $self = shift (@_);
487 my ($section) = @_;
488
489 return $section . ".0" unless $section eq "";
490 return "0";
491}
492
493# returns the next section in book order
494sub get_next_section {
495 my $self = shift (@_);
496 my ($section) = @_;
497
498 return undef unless defined $section;
499
500 my $section_ptr = $self->_lookup_section($section);
501 return undef unless defined $section_ptr;
502
503 # first try to find first child
504 if (defined $section_ptr->{'subsection_order'}->[0]) {
505 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
506 return "$section.$section_ptr->{'subsection_order'}->[0]";
507 }
508
509 do {
510 # try to find sibling
511 my $next_child = $self->get_next_child ($section);
512 return $next_child if (defined $next_child);
513
514 # move up one level
515 $section = $self->get_parent_section ($section);
516 } while $section =~ /\d/;
517
518 return undef;
519}
520
521sub is_leaf_section {
522 my $self = shift (@_);
523 my ($section) = @_;
524
525 my $section_ptr = $self->_lookup_section($section);
526 return 1 unless defined $section_ptr;
527
528 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
529}
530
531# methods for dealing with sections
532
533# returns the name of the inserted section
534sub insert_section {
535 my $self = shift (@_);
536 my ($before_section) = @_;
537
538 # get the child to insert before and its parent section
539 my $parent_section = "";
540 my $before_child = "0";
541 my @before_section = split (/\./, $before_section);
542 if (scalar(@before_section) > 0) {
543 $before_child = pop (@before_section);
544 $parent_section = join (".", @before_section);
545 }
546
547 my $parent_section_ptr = $self->_lookup_section($parent_section);
548 if (!defined $parent_section_ptr) {
549 print STDERR "doc::insert_section couldn't find parent section " .
550 "$parent_section\n";
551 return;
552 }
553
554 # get the next section number
555 my $section_num = $parent_section_ptr->{'next_subsection'}++;
556
557 my $i = 0;
558 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
559 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
560 $i++;
561 }
562
563 # insert the section number into the order list
564 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
565
566 # add this section to the parent section
567 my $section_ptr = {'subsection_order'=>[],
568 'next_subsection'=>1,
569 'subsections'=>{},
570 'metadata'=>[],
571 'text'=>""};
572 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
573
574 # work out the full section number
575 my $section = $parent_section;
576 $section .= "." unless $section eq "";
577 $section .= $section_num;
578
579 return $section;
580}
581
582# creates a pre-named section
583sub create_named_section {
584 my $self = shift (@_);
585 my ($mastersection) = @_;
586
587 my ($num);
588 my $section = $mastersection;
589 my $sectionref = $self;
590
591 while ($section ne "") {
592 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
593 $num =~ s/^0+(\d)/$1/; # remove leading 0s
594 $section = "" unless defined $section;
595
596 if (defined $num) {
597 if (!defined $sectionref->{'subsections'}->{$num}) {
598 push (@{$sectionref->{'subsection_order'}}, $num);
599 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
600 'next_subsection'=>1,
601 'subsections'=>{},
602 'metadata'=>[],
603 'text'=>""};
604 if ($num >= $sectionref->{'next_subsection'}) {
605 $sectionref->{'next_subsection'} = $num + 1;
606 }
607 }
608 $sectionref = $sectionref->{'subsections'}->{$num};
609
610 } else {
611 print STDERR "doc::create_named_section couldn't create section ";
612 print STDERR "$mastersection\n";
613 last;
614 }
615 }
616}
617
618# returns a reference to a list of subsections
619sub list_subsections {
620 my $self = shift (@_);
621 my ($section) = @_;
622
623 my $section_ptr = $self->_lookup_section ($section);
624 if (!defined $section_ptr) {
625 print STDERR "doc::list_subsections couldn't find section $section\n";
626 return [];
627 }
628
629 return [@{$section_ptr->{'subsection_order'}}];
630}
631
632sub delete_section {
633 my $self = shift (@_);
634 my ($section) = @_;
635
636# my $section_ptr = {'subsection_order'=>[],
637# 'next_subsection'=>1,
638# 'subsections'=>{},
639# 'metadata'=>[],
640# 'text'=>""};
641
642 # if this is the top section reset everything
643 if ($section eq "") {
644 $self->{'subsection_order'} = [];
645 $self->{'subsections'} = {};
646 $self->{'metadata'} = [];
647 $self->{'text'} = "";
648 return;
649 }
650
651 # find the parent of the section to delete
652 my $parent_section = "";
653 my $child = "0";
654 my @section = split (/\./, $section);
655 if (scalar(@section) > 0) {
656 $child = pop (@section);
657 $parent_section = join (".", @section);
658 }
659
660 my $parent_section_ptr = $self->_lookup_section($parent_section);
661 if (!defined $parent_section_ptr) {
662 print STDERR "doc::delete_section couldn't find parent section " .
663 "$parent_section\n";
664 return;
665 }
666
667 # remove this section from the subsection_order list
668 my $i = 0;
669 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
670 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
671 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
672 last;
673 }
674 $i++;
675 }
676
677 # remove this section from the subsection hash
678 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
679 undef $parent_section_ptr->{'subsections'}->{$child};
680 }
681}
682
683#--
684# methods for dealing with metadata
685
686# set_metadata_element and get_metadata_element are for metadata
687# which should only have one value. add_meta_data and get_metadata
688# are for metadata which can have more than one value.
689
690# returns the first metadata value which matches field
691
692# This version of get metadata element works much like the one above,
693# except it allows for the namespace portion of a metadata element to
694# be ignored, thus if you are searching for dc.Title, the first piece
695# of matching metadata ending with the name Title (once any namespace
696# is removed) would be returned.
697# 28-11-2003 John Thompson
698sub get_metadata_element {
699 my $self = shift (@_);
700 my ($section, $field, $ignore_namespace) = @_;
701 my ($data);
702
703 $ignore_namespace = 0 unless defined $ignore_namespace;
704
705 my $section_ptr = $self->_lookup_section($section);
706 if (!defined $section_ptr) {
707 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
708 return;
709 }
710
711 # Remove the any namespace if we are being told to ignore them
712 if($ignore_namespace) {
713 $field =~ s/^\w*\.//;
714 }
715
716 foreach $data (@{$section_ptr->{'metadata'}}) {
717
718 my $data_name = $data->[0];
719
720 # Remove the any namespace if we are being told to ignore them
721 if($ignore_namespace) {
722 $data_name =~ s/^\w*\.//;
723 }
724
725 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
726 }
727
728 return undef; # was not found
729}
730
731# returns a list of the form [value1, value2, ...]
732sub get_metadata {
733 my $self = shift (@_);
734 my ($section, $field, $ignore_namespace) = @_;
735 my ($data);
736
737 $ignore_namespace = 0 unless defined $ignore_namespace;
738
739 my $section_ptr = $self->_lookup_section($section);
740 if (!defined $section_ptr) {
741 print STDERR "doc::get_metadata couldn't find section ",
742 $section, "\n";
743 return;
744 }
745
746 # Remove the any namespace if we are being told to ignore them
747 if($ignore_namespace) {
748 $field =~ s/^\w*\.//;
749 }
750
751 my @metadata = ();
752 foreach $data (@{$section_ptr->{'metadata'}}) {
753
754 my $data_name = $data->[0];
755 # Remove the any namespace if we are being told to ignore them
756 if($ignore_namespace) {
757 $data_name =~ s/^\w*\.//;
758 }
759
760 push (@metadata, $data->[1]) if ($data_name eq $field);
761 }
762
763 return \@metadata;
764}
765
766# returns a list of the form [[field,value],[field,value],...]
767sub get_all_metadata {
768 my $self = shift (@_);
769 my ($section) = @_;
770
771 my $section_ptr = $self->_lookup_section($section);
772 if (!defined $section_ptr) {
773 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
774 return;
775 }
776
777 return $section_ptr->{'metadata'};
778}
779
780# $value is optional
781sub delete_metadata {
782 my $self = shift (@_);
783 my ($section, $field, $value) = @_;
784
785 my $section_ptr = $self->_lookup_section($section);
786 if (!defined $section_ptr) {
787 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
788 return;
789 }
790
791 my $i = 0;
792 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
793 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
794 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
795 splice (@{$section_ptr->{'metadata'}}, $i, 1);
796 } else {
797 $i++;
798 }
799 }
800}
801
802sub delete_all_metadata {
803 my $self = shift (@_);
804 my ($section) = @_;
805
806 my $section_ptr = $self->_lookup_section($section);
807 if (!defined $section_ptr) {
808 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
809 return;
810 }
811
812 $section_ptr->{'metadata'} = [];
813}
814
815sub set_metadata_element {
816 my $self = shift (@_);
817 my ($section, $field, $value) = @_;
818
819 $self->set_utf8_metadata_element ($section, $field,
820 &unicode::ascii2utf8(\$value));
821}
822
823# set_utf8_metadata_element assumes the text has already been
824# converted to the UTF-8 encoding.
825sub set_utf8_metadata_element {
826 my $self = shift (@_);
827 my ($section, $field, $value) = @_;
828
829 $self->delete_metadata ($section, $field);
830 $self->add_utf8_metadata ($section, $field, $value);
831}
832
833
834# add_metadata assumes the text is in (extended) ascii form. For
835# text which has already been converted to the UTF-8 format use
836# add_utf8_metadata.
837sub add_metadata {
838 my $self = shift (@_);
839 my ($section, $field, $value) = @_;
840
841 $self->add_utf8_metadata ($section, $field,
842 &unicode::ascii2utf8(\$value));
843}
844
845sub add_utf8_metadata {
846 my $self = shift (@_);
847 my ($section, $field, $value) = @_;
848
849 my $section_ptr = $self->_lookup_section($section);
850 if (!defined $section_ptr) {
851 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
852 return;
853 }
854 if (!defined $value) {
855 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
856 return;
857 }
858 if (!defined $field) {
859 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
860 return;
861 }
862
863 #print STDERR "###$field=$value\n";
864 # double check that the value is utf-8
865 if (!&unicode::check_is_utf8($value)) {
866 print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
867 &unicode::ensure_utf8(\$value);
868 print STDERR " Tried converting to utf8: $value\n";
869 }
870
871 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
872}
873
874
875# methods for dealing with text
876
877# returns the text for a section
878sub get_text {
879 my $self = shift (@_);
880 my ($section) = @_;
881
882 my $section_ptr = $self->_lookup_section($section);
883 if (!defined $section_ptr) {
884 print STDERR "doc::get_text couldn't find section " .
885 "$section\n";
886 return "";
887 }
888
889 return $section_ptr->{'text'};
890}
891
892# returns the (utf-8 encoded) length of the text for a section
893sub get_text_length {
894 my $self = shift (@_);
895 my ($section) = @_;
896
897 my $section_ptr = $self->_lookup_section($section);
898 if (!defined $section_ptr) {
899 print STDERR "doc::get_text_length couldn't find section " .
900 "$section\n";
901 return 0;
902 }
903
904 return length ($section_ptr->{'text'});
905}
906
907sub delete_text {
908 my $self = shift (@_);
909 my ($section) = @_;
910
911 my $section_ptr = $self->_lookup_section($section);
912 if (!defined $section_ptr) {
913 print STDERR "doc::delete_text couldn't find section " .
914 "$section\n";
915 return;
916 }
917
918 $section_ptr->{'text'} = "";
919}
920
921# add_text assumes the text is in (extended) ascii form. For
922# text which has been already converted to the UTF-8 format
923# use add_utf8_text.
924sub add_text {
925 my $self = shift (@_);
926 my ($section, $text) = @_;
927
928 # convert the text to UTF-8 encoded unicode characters
929 # and add the text
930 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
931}
932
933
934# add_utf8_text assumes the text to be added has already
935# been converted to the UTF-8 encoding. For ascii text use
936# add_text
937sub add_utf8_text {
938 my $self = shift (@_);
939 my ($section, $text) = @_;
940
941 my $section_ptr = $self->_lookup_section($section);
942 if (!defined $section_ptr) {
943 print STDERR "doc::add_utf8_text couldn't find section " .
944 "$section\n";
945 return;
946 }
947
948 $section_ptr->{'text'} .= $text;
949}
950
951# returns the Source meta, which is the utf8 filename generated.
952# Added a separate method here for convenience
953sub get_source {
954 my $self = shift (@_);
955 return $self->get_metadata_element ($self->get_top_section(), "Source");
956}
957
958# returns the SourceFile meta, which is the url reference to the URL-encoded
959# version of Source (the utf8 filename). Added a separate method here for convenience
960sub get_sourcefile {
961 my $self = shift (@_);
962 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
963}
964
965# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
966# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
967sub get_assocfile_from_sourcefile {
968 my $self = shift (@_);
969
970 # get the SourceFile meta, which is a *URL* to a file on the filesystem
971 my $top_section = $self->get_top_section();
972 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
973
974 # get the actual filename as it exists on the filesystem which this url refers to
975 $source_file = &unicode::url_to_filename($source_file);
976 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
977 return $assocfilename;
978}
979
980# methods for dealing with associated files
981
982# a file is associated with a document, NOT a section.
983# if section is defined it is noted in the data structure
984# only so that files associated from a particular section
985# may be removed later (using delete_section_assoc_files)
986sub associate_file {
987 my $self = shift (@_);
988 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
989 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
990
991 # remove all associated files with the same name
992 $self->delete_assoc_file ($assoc_filename);
993
994 push (@{$self->{'associated_files'}},
995 [$real_filename, $assoc_filename, $mime_type, $section]);
996}
997
998# returns a list of associated files in the form
999# [[real_filename, assoc_filename, mimetype], ...]
1000sub get_assoc_files {
1001 my $self = shift (@_);
1002
1003 return $self->{'associated_files'};
1004}
1005
1006sub delete_section_assoc_files {
1007 my $self = shift (@_);
1008 my ($section) = @_;
1009
1010 my $i=0;
1011 while ($i < scalar (@{$self->{'associated_files'}})) {
1012 if (defined $self->{'associated_files'}->[$i]->[3] &&
1013 $self->{'associated_files'}->[$i]->[3] eq $section) {
1014 splice (@{$self->{'associated_files'}}, $i, 1);
1015 } else {
1016 $i++;
1017 }
1018 }
1019}
1020
1021sub delete_assoc_file {
1022 my $self = shift (@_);
1023 my ($assoc_filename) = @_;
1024
1025 my $i=0;
1026 while ($i < scalar (@{$self->{'associated_files'}})) {
1027 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1028 splice (@{$self->{'associated_files'}}, $i, 1);
1029 } else {
1030 $i++;
1031 }
1032 }
1033}
1034
1035sub reset_nextsection_ptr {
1036 my $self = shift (@_);
1037 my ($section) = @_;
1038
1039 my $section_ptr = $self->_lookup_section($section);
1040 $section_ptr->{'next_subsection'} = 1;
1041}
1042
10431;
Note: See TracBrowser for help on using the repository browser.