source: main/trunk/greenstone2/perllib/doc.pm@ 23278

Last change on this file since 23278 was 23278, checked in by kjdon, 13 years ago

split out the encoding filename bit from set_source_metadata so that other places can use it

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 30.9 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47our $OIDcount = 0;
48
49# rename_method can be 'url', 'none', 'base64'
50sub new {
51 my $class = shift (@_);
52 my ($source_filename, $doc_type, $rename_method) = @_;
53
54
55 my $self = bless {'associated_files'=>[],
56 'subsection_order'=>[],
57 'next_subsection'=>1,
58 'subsections'=>{},
59 'metadata'=>[],
60 'text'=>"",
61 'OIDtype'=>"hash"}, $class;
62
63 # used to set lastmodified here, but this can screw up the HASH ids, so
64 # the docsave processor now calls set_lastmodified
65
66 $self->{'source_path'} = $source_filename;
67
68 if (defined $source_filename) {
69 $source_filename = &util::filename_within_collection($source_filename);
70 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
71 $self->set_source_filename ($source_filename, $rename_method);
72 }
73
74 $self->set_doc_type ($doc_type) if defined $doc_type;
75
76 return $self;
77}
78# set lastmodified for OAI purposes, added by GRB, moved by kjdon
79sub set_lastmodified {
80 my $self = shift (@_);
81
82 my $source_path = $self->{'source_path'};
83
84 if (defined $source_path && (-e $source_path)) {
85
86 my $file_stat = stat($source_path);
87 my $mtime = $file_stat->mtime;
88 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
89 $wday, $yday, $isdst) = localtime($mtime);
90
91 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
92
93 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
94 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
95 }
96}
97
98# clone the $self object
99sub duplicate {
100 my $self = shift (@_);
101
102 my $newobj = {};
103
104 foreach my $k (keys %$self) {
105 $newobj->{$k} = &clone ($self->{$k});
106 }
107
108 bless $newobj, ref($self);
109 return $newobj;
110}
111
112sub clone {
113 my ($from) = @_;
114 my $type = ref ($from);
115
116 if ($type eq "HASH") {
117 my $to = {};
118 foreach my $key (keys %$from) {
119 $to->{$key} = &clone ($from->{$key});
120 }
121 return $to;
122 } elsif ($type eq "ARRAY") {
123 my $to = [];
124 foreach my $v (@$from) {
125 push (@$to, &clone ($v));
126 }
127 return $to;
128 } else {
129 return $from;
130 }
131}
132
133sub set_OIDtype {
134 my $self = shift (@_);
135 my ($type, $metadata) = @_;
136
137 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|dirname|assigned)$/) {
138 $self->{'OIDtype'} = $type;
139 } else {
140 $self->{'OIDtype'} = "hash";
141 }
142
143 if ($type =~ /^assigned$/) {
144 if (defined $metadata) {
145 $self->{'OIDmetadata'} = $metadata;
146 } else {
147 $self->{'OIDmetadata'} = "dc.Identifier";
148 }
149 }
150}
151
152# rename_method can be 'url', 'none', 'base64'
153sub set_source_filename {
154 my $self = shift (@_);
155 my ($source_filename, $rename_method) = @_;
156
157 # Since the gsdlsourcefilename element goes into the doc.xml it has
158 # to be utf8. However, it should also *represent* the source filename
159 # (in the import directory) which may not be utf8 at all.
160 # For instance, if this meta element (gsdlsourcefilename) will be used
161 # by other applications that parse doc.xml in order to locate
162 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
163 # encode the real filename as this is a binary-to-text encoding meaning
164 # that the resulting string is ASCII (utf8). Decoding will give the original.
165
166# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
167
168 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
169 $source_filename = $self->encode_filename($source_filename, $rename_method);
170# my ($srcfilename,$dirname,$suffix)
171# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
172# print STDERR "-> $srcfilename -> ";
173# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
174# $source_filename = &util::filename_cat($dirname, $srcfilename);
175# print STDERR "$source_filename\n";
176
177 $self->set_utf8_metadata_element ($self->get_top_section(),
178 "gsdlsourcefilename",
179 $source_filename);
180}
181
182sub encode_filename {
183 my $self = shift (@_);
184 my ($source_filename, $rename_method) = @_;
185
186 my ($srcfilename,$dirname,$suffix)
187 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
188# print STDERR "-> $srcfilename -> ";
189 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
190 $source_filename = &util::filename_cat($dirname, $srcfilename);
191
192 return $source_filename;
193}
194
195sub set_converted_filename {
196 my $self = shift (@_);
197 my ($converted_filename) = @_;
198
199 # we know the converted filename is utf8
200 $self->set_utf8_metadata_element ($self->get_top_section(),
201 "gsdlconvertedfilename",
202 $converted_filename);
203}
204
205# returns the source_filename as it was provided
206sub get_unmodified_source_filename {
207 my $self = shift (@_);
208
209 return $self->{'source_path'};
210}
211
212# returns the source_filename with whatever rename_method was given
213sub get_source_filename {
214 my $self = shift (@_);
215
216 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
217}
218
219
220
221# returns converted filename if available else returns source filename
222sub get_filename_for_hashing {
223 my $self = shift (@_);
224
225 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
226
227 if (!defined $filename) {
228 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
229 # if NULPlug processed file, then don't give a filename
230 if (defined $plugin_name && $plugin_name eq "NULPlug") {
231 $filename = undef;
232 } else { # returns the URL encoded source filename!
233 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
234 }
235 }
236 return $filename;
237}
238
239sub set_doc_type {
240 my $self = shift (@_);
241 my ($doc_type) = @_;
242
243 $self->set_metadata_element ($self->get_top_section(),
244 "gsdldoctype",
245 $doc_type);
246}
247
248# returns the gsdldoctype as it was provided
249# the default of "indexed_doc" is used if no document
250# type was provided
251sub get_doc_type {
252 my $self = shift (@_);
253
254 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
255 return $doc_type if (defined $doc_type);
256 return "indexed_doc";
257}
258
259
260# look up the reference to the a particular section
261sub _lookup_section {
262 my $self = shift (@_);
263 my ($section) = @_;
264
265 my ($num);
266 my $sectionref = $self;
267
268 while (defined $section && $section ne "") {
269
270 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
271
272 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
273
274 $section = "" unless defined $section;
275
276
277 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
278 $sectionref = $sectionref->{'subsections'}->{$num};
279 } else {
280 return undef;
281 }
282 }
283
284 return $sectionref;
285}
286
287# calculate OID by hashing the contents of the document
288sub _calc_OID {
289 my $self = shift (@_);
290 my ($filename) = @_;
291
292
293 my $osexe = &util::get_os_exe();
294
295 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
296 $ENV{'GSDLOS'},"hashfile$osexe");
297
298 my $result = "NULL";
299
300
301 if (-e "$hashfile_exe") {
302# $result = `\"$hashfile_exe\" \"$filename\"`;
303# $result = `hashfile$osexe \"$filename\" 2>&1`;
304 $result = `hashfile$osexe \"$filename\"`;
305
306 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
307 } else {
308 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
309 }
310 return "HASH$result";
311}
312
313# methods dealing with OID, not groups of them.
314
315# if $OID is not provided one is calculated
316sub set_OID {
317 my $self = shift (@_);
318 my ($OID) = @_;
319
320 my $use_hash_oid = 0;
321 # if an OID wasn't provided calculate one
322 if (!defined $OID) {
323 $OID = "NULL";
324 if ($self->{'OIDtype'} =~ /^hash/) {
325 $use_hash_oid = 1;
326 } elsif ($self->{'OIDtype'} eq "incremental") {
327 $OID = "D" . $OIDcount;
328 $OIDcount ++;
329
330 } elsif ($self->{'OIDtype'} eq "dirname") {
331 $OID = 'J';
332 my $filename = $self->get_source_filename();
333 if (defined($filename)) { # && -e $filename) {
334 $OID = &File::Basename::dirname($filename);
335 if (defined $OID) {
336 $OID = 'J'.&File::Basename::basename($OID);
337 $OID = &util::tidy_up_oid($OID);
338 } else {
339 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
340 $use_hash_oid = 1;
341 }
342 } else {
343 print STDERR "Failed to find filename, generating hash id\n";
344 $use_hash_oid = 1;
345 }
346
347 } elsif ($self->{'OIDtype'} eq "assigned") {
348 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
349 if (defined $identifier && $identifier ne "") {
350 $OID = $identifier;
351 $OID = &util::tidy_up_oid($OID);
352 } else {
353 # need a hash id
354 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
355 $use_hash_oid = 1;
356 }
357
358 } else {
359 $use_hash_oid = 1;
360 }
361
362 if ($use_hash_oid) {
363 my $hash_on_file = 1;
364 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
365 $hash_on_file = 0;
366 }
367 if ($hash_on_file) {
368 # "hash" OID - feed file to hashfile.exe
369 my $filename = $self->get_filename_for_hashing();
370
371 # -z: don't want to hash on the file if it is zero size
372 if (defined($filename) && -e $filename && !-z $filename) {
373 $OID = $self->_calc_OID ($filename);
374 } else {
375 $hash_on_file = 0;
376 }
377 }
378 if (!$hash_on_file) {
379 my $filename = &util::get_tmp_filename();
380 if (!open (OUTFILE, ">:utf8", $filename)) {
381 print STDERR "doc::set_OID could not write to $filename\n";
382 } else {
383 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
384 print OUTFILE $doc_text;
385 close (OUTFILE);
386 }
387 $OID = $self->_calc_OID ($filename);
388 &util::rm ($filename);
389 }
390 }
391 }
392 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
393}
394
395# this uses hashdoc (embedded c thingy) which is faster but still
396# needs a little work to be suffiently stable
397sub ___set_OID {
398 my $self = shift (@_);
399 my ($OID) = @_;
400
401 # if an OID wasn't provided then calculate hash value based on document
402 if (!defined $OID)
403 {
404 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
405 my $hash_len = length($hash_text);
406
407 $OID = &hashdoc::buffer($hash_text,$hash_len);
408 }
409
410 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
411}
412
413# returns the OID for this document
414sub get_OID {
415 my $self = shift (@_);
416 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
417 return $OID if (defined $OID);
418 return "NULL";
419}
420
421sub delete_OID {
422 my $self = shift (@_);
423
424 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
425}
426
427
428# methods for manipulating section names
429
430# returns the name of the top-most section (the top
431# level of the document
432sub get_top_section {
433 my $self = shift (@_);
434
435 return "";
436}
437
438# returns a section
439sub get_parent_section {
440 my $self = shift (@_);
441 my ($section) = @_;
442
443 $section =~ s/(^|\.)\d+$//;
444
445 return $section;
446}
447
448# returns the first child section (or the end child
449# if there isn't any)
450sub get_begin_child {
451 my $self = shift (@_);
452 my ($section) = @_;
453
454 my $section_ptr = $self->_lookup_section($section);
455 return "" unless defined $section_ptr;
456
457 if (defined $section_ptr->{'subsection_order'}->[0]) {
458 return "$section.$section_ptr->{'subsection_order'}->[0]";
459 }
460
461 return $self->get_end_child ($section);
462}
463
464# returns the next child of a parent section
465sub get_next_child {
466 my $self = shift (@_);
467 my ($section) = @_;
468
469 my $parent_section = $self->get_parent_section($section);
470 my $parent_section_ptr = $self->_lookup_section($parent_section);
471 return undef unless defined $parent_section_ptr;
472
473 my ($section_num) = $section =~ /(\d+)$/;
474 return undef unless defined $section_num;
475
476 my $i = 0;
477 my $section_order = $parent_section_ptr->{'subsection_order'};
478 while ($i < scalar(@$section_order)) {
479 last if $section_order->[$i] eq $section_num;
480 $i++;
481 }
482
483 $i++; # the next child
484 if ($i < scalar(@$section_order)) {
485 return $section_order->[$i] if $parent_section eq "";
486 return "$parent_section.$section_order->[$i]";
487 }
488
489 # no more sections in this level
490 return undef;
491}
492
493# returns a reference to a list of children
494sub get_children {
495 my $self = shift (@_);
496 my ($section) = @_;
497
498 my $section_ptr = $self->_lookup_section($section);
499 return [] unless defined $section_ptr;
500
501 my @children = @{$section_ptr->{'subsection_order'}};
502
503 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
504 return \@children;
505}
506
507# returns the child section one past the last one (which
508# is coded as "0")
509sub get_end_child {
510 my $self = shift (@_);
511 my ($section) = @_;
512
513 return $section . ".0" unless $section eq "";
514 return "0";
515}
516
517# returns the next section in book order
518sub get_next_section {
519 my $self = shift (@_);
520 my ($section) = @_;
521
522 return undef unless defined $section;
523
524 my $section_ptr = $self->_lookup_section($section);
525 return undef unless defined $section_ptr;
526
527 # first try to find first child
528 if (defined $section_ptr->{'subsection_order'}->[0]) {
529 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
530 return "$section.$section_ptr->{'subsection_order'}->[0]";
531 }
532
533 do {
534 # try to find sibling
535 my $next_child = $self->get_next_child ($section);
536 return $next_child if (defined $next_child);
537
538 # move up one level
539 $section = $self->get_parent_section ($section);
540 } while $section =~ /\d/;
541
542 return undef;
543}
544
545sub is_leaf_section {
546 my $self = shift (@_);
547 my ($section) = @_;
548
549 my $section_ptr = $self->_lookup_section($section);
550 return 1 unless defined $section_ptr;
551
552 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
553}
554
555# methods for dealing with sections
556
557# returns the name of the inserted section
558sub insert_section {
559 my $self = shift (@_);
560 my ($before_section) = @_;
561
562 # get the child to insert before and its parent section
563 my $parent_section = "";
564 my $before_child = "0";
565 my @before_section = split (/\./, $before_section);
566 if (scalar(@before_section) > 0) {
567 $before_child = pop (@before_section);
568 $parent_section = join (".", @before_section);
569 }
570
571 my $parent_section_ptr = $self->_lookup_section($parent_section);
572 if (!defined $parent_section_ptr) {
573 print STDERR "doc::insert_section couldn't find parent section " .
574 "$parent_section\n";
575 return;
576 }
577
578 # get the next section number
579 my $section_num = $parent_section_ptr->{'next_subsection'}++;
580
581 my $i = 0;
582 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
583 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
584 $i++;
585 }
586
587 # insert the section number into the order list
588 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
589
590 # add this section to the parent section
591 my $section_ptr = {'subsection_order'=>[],
592 'next_subsection'=>1,
593 'subsections'=>{},
594 'metadata'=>[],
595 'text'=>""};
596 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
597
598 # work out the full section number
599 my $section = $parent_section;
600 $section .= "." unless $section eq "";
601 $section .= $section_num;
602
603 return $section;
604}
605
606# creates a pre-named section
607sub create_named_section {
608 my $self = shift (@_);
609 my ($mastersection) = @_;
610
611 my ($num);
612 my $section = $mastersection;
613 my $sectionref = $self;
614
615 while ($section ne "") {
616 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
617 $num =~ s/^0+(\d)/$1/; # remove leading 0s
618 $section = "" unless defined $section;
619
620 if (defined $num) {
621 if (!defined $sectionref->{'subsections'}->{$num}) {
622 push (@{$sectionref->{'subsection_order'}}, $num);
623 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
624 'next_subsection'=>1,
625 'subsections'=>{},
626 'metadata'=>[],
627 'text'=>""};
628 if ($num >= $sectionref->{'next_subsection'}) {
629 $sectionref->{'next_subsection'} = $num + 1;
630 }
631 }
632 $sectionref = $sectionref->{'subsections'}->{$num};
633
634 } else {
635 print STDERR "doc::create_named_section couldn't create section ";
636 print STDERR "$mastersection\n";
637 last;
638 }
639 }
640}
641
642# returns a reference to a list of subsections
643sub list_subsections {
644 my $self = shift (@_);
645 my ($section) = @_;
646
647 my $section_ptr = $self->_lookup_section ($section);
648 if (!defined $section_ptr) {
649 print STDERR "doc::list_subsections couldn't find section $section\n";
650 return [];
651 }
652
653 return [@{$section_ptr->{'subsection_order'}}];
654}
655
656sub delete_section {
657 my $self = shift (@_);
658 my ($section) = @_;
659
660# my $section_ptr = {'subsection_order'=>[],
661# 'next_subsection'=>1,
662# 'subsections'=>{},
663# 'metadata'=>[],
664# 'text'=>""};
665
666 # if this is the top section reset everything
667 if ($section eq "") {
668 $self->{'subsection_order'} = [];
669 $self->{'subsections'} = {};
670 $self->{'metadata'} = [];
671 $self->{'text'} = "";
672 return;
673 }
674
675 # find the parent of the section to delete
676 my $parent_section = "";
677 my $child = "0";
678 my @section = split (/\./, $section);
679 if (scalar(@section) > 0) {
680 $child = pop (@section);
681 $parent_section = join (".", @section);
682 }
683
684 my $parent_section_ptr = $self->_lookup_section($parent_section);
685 if (!defined $parent_section_ptr) {
686 print STDERR "doc::delete_section couldn't find parent section " .
687 "$parent_section\n";
688 return;
689 }
690
691 # remove this section from the subsection_order list
692 my $i = 0;
693 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
694 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
695 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
696 last;
697 }
698 $i++;
699 }
700
701 # remove this section from the subsection hash
702 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
703 undef $parent_section_ptr->{'subsections'}->{$child};
704 }
705}
706
707#--
708# methods for dealing with metadata
709
710# set_metadata_element and get_metadata_element are for metadata
711# which should only have one value. add_meta_data and get_metadata
712# are for metadata which can have more than one value.
713
714# returns the first metadata value which matches field
715
716# This version of get metadata element works much like the one above,
717# except it allows for the namespace portion of a metadata element to
718# be ignored, thus if you are searching for dc.Title, the first piece
719# of matching metadata ending with the name Title (once any namespace
720# is removed) would be returned.
721# 28-11-2003 John Thompson
722sub get_metadata_element {
723 my $self = shift (@_);
724 my ($section, $field, $ignore_namespace) = @_;
725 my ($data);
726
727 $ignore_namespace = 0 unless defined $ignore_namespace;
728
729 my $section_ptr = $self->_lookup_section($section);
730 if (!defined $section_ptr) {
731 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
732 return;
733 }
734
735 # Remove the namespace if we are being told to ignore them
736 if($ignore_namespace) {
737 $field =~ s/^\w*\.//;
738 }
739
740 foreach $data (@{$section_ptr->{'metadata'}}) {
741
742 my $data_name = $data->[0];
743
744 # Remove the any namespace if we are being told to ignore them
745 if($ignore_namespace) {
746 $data_name =~ s/^\w*\.//;
747 }
748 $data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
749 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
750 }
751
752 return undef; # was not found
753}
754
755# returns a list of the form [value1, value2, ...]
756sub get_metadata {
757 my $self = shift (@_);
758 my ($section, $field, $ignore_namespace) = @_;
759 my ($data);
760
761 $ignore_namespace = 0 unless defined $ignore_namespace;
762
763 my $section_ptr = $self->_lookup_section($section);
764 if (!defined $section_ptr) {
765 print STDERR "doc::get_metadata couldn't find section ",
766 $section, "\n";
767 return;
768 }
769
770 # Remove the any namespace if we are being told to ignore them
771 if($ignore_namespace) {
772 $field =~ s/^\w*\.//;
773 }
774
775 my @metadata = ();
776 foreach $data (@{$section_ptr->{'metadata'}}) {
777
778 my $data_name = $data->[0];
779 # Remove the any namespace if we are being told to ignore them
780 if($ignore_namespace) {
781 $data_name =~ s/^\w*\.//;
782 }
783 $data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
784 push (@metadata, $data->[1]) if ($data_name eq $field);
785 }
786
787 return \@metadata;
788}
789
790# returns a list of the form [[field,value],[field,value],...]
791sub get_all_metadata {
792 my $self = shift (@_);
793 my ($section) = @_;
794
795 my $section_ptr = $self->_lookup_section($section);
796 if (!defined $section_ptr) {
797 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
798 return;
799 }
800
801 return $section_ptr->{'metadata'};
802}
803
804# $value is optional
805sub delete_metadata {
806 my $self = shift (@_);
807 my ($section, $field, $value) = @_;
808
809 my $section_ptr = $self->_lookup_section($section);
810 if (!defined $section_ptr) {
811 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
812 return;
813 }
814
815 my $i = 0;
816 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
817 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
818 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
819 splice (@{$section_ptr->{'metadata'}}, $i, 1);
820 } else {
821 $i++;
822 }
823 }
824}
825
826sub delete_all_metadata {
827 my $self = shift (@_);
828 my ($section) = @_;
829
830 my $section_ptr = $self->_lookup_section($section);
831 if (!defined $section_ptr) {
832 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
833 return;
834 }
835
836 $section_ptr->{'metadata'} = [];
837}
838
839sub set_metadata_element {
840 my $self = shift (@_);
841 my ($section, $field, $value) = @_;
842
843 $self->set_utf8_metadata_element ($section, $field,
844 &unicode::ascii2utf8(\$value));
845}
846
847# set_utf8_metadata_element assumes the text has already been
848# converted to the UTF-8 encoding.
849sub set_utf8_metadata_element {
850 my $self = shift (@_);
851 my ($section, $field, $value) = @_;
852
853 $self->delete_metadata ($section, $field);
854 $self->add_utf8_metadata ($section, $field, $value);
855}
856
857
858# add_metadata assumes the text is in (extended) ascii form. For
859# text which has already been converted to the UTF-8 format use
860# add_utf8_metadata.
861sub add_metadata {
862 my $self = shift (@_);
863 my ($section, $field, $value) = @_;
864
865 $self->add_utf8_metadata ($section, $field,
866 &unicode::ascii2utf8(\$value));
867}
868
869sub add_utf8_metadata {
870 my $self = shift (@_);
871 my ($section, $field, $value) = @_;
872
873# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
874# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
875# print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
876
877 my $section_ptr = $self->_lookup_section($section);
878 if (!defined $section_ptr) {
879 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
880 return;
881 }
882 if (!defined $value) {
883 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
884 return;
885 }
886 if (!defined $field) {
887 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
888 return;
889 }
890
891 #print STDERR "###$field=$value\n";
892
893 # For now, supress this check. Given that text data read in is now
894 # Unicode aware, then the following block of code can (ironically enough)
895 # cause our unicode compliant string to be re-encoded (leading to
896 # a double-encoded UTF-8 string, which we definitely don't want!).
897
898
899 # double check that the value is utf-8
900# if (!&unicode::check_is_utf8($value)) {
901# print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
902# &unicode::ensure_utf8(\$value);
903# print STDERR " Tried converting to utf8: $value\n";
904# }
905
906 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
907}
908
909
910# methods for dealing with text
911
912# returns the text for a section
913sub get_text {
914 my $self = shift (@_);
915 my ($section) = @_;
916
917 my $section_ptr = $self->_lookup_section($section);
918 if (!defined $section_ptr) {
919 print STDERR "doc::get_text couldn't find section " .
920 "$section\n";
921 return "";
922 }
923
924 return $section_ptr->{'text'};
925}
926
927# returns the (utf-8 encoded) length of the text for a section
928sub get_text_length {
929 my $self = shift (@_);
930 my ($section) = @_;
931
932 my $section_ptr = $self->_lookup_section($section);
933 if (!defined $section_ptr) {
934 print STDERR "doc::get_text_length couldn't find section " .
935 "$section\n";
936 return 0;
937 }
938
939 return length ($section_ptr->{'text'});
940}
941
942# returns the total length for all the sections
943sub get_total_text_length {
944 my $self = shift (@_);
945
946 my $section = $self->get_top_section();
947 my $length = 0;
948 while (defined $section) {
949 $length += $self->get_text_length($section);
950 $section = $self->get_next_section($section);
951 }
952 return $length;
953}
954
955sub delete_text {
956 my $self = shift (@_);
957 my ($section) = @_;
958
959 my $section_ptr = $self->_lookup_section($section);
960 if (!defined $section_ptr) {
961 print STDERR "doc::delete_text couldn't find section " .
962 "$section\n";
963 return;
964 }
965
966 $section_ptr->{'text'} = "";
967}
968
969# add_text assumes the text is in (extended) ascii form. For
970# text which has been already converted to the UTF-8 format
971# use add_utf8_text.
972sub add_text {
973 my $self = shift (@_);
974 my ($section, $text) = @_;
975
976 # convert the text to UTF-8 encoded unicode characters
977 # and add the text
978 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
979}
980
981
982# add_utf8_text assumes the text to be added has already
983# been converted to the UTF-8 encoding. For ascii text use
984# add_text
985sub add_utf8_text {
986 my $self = shift (@_);
987 my ($section, $text) = @_;
988
989 my $section_ptr = $self->_lookup_section($section);
990 if (!defined $section_ptr) {
991 print STDERR "doc::add_utf8_text couldn't find section " .
992 "$section\n";
993 return;
994 }
995
996 $section_ptr->{'text'} .= $text;
997}
998
999# returns the Source meta, which is the utf8 filename generated.
1000# Added a separate method here for convenience
1001sub get_source {
1002 my $self = shift (@_);
1003 return $self->get_metadata_element ($self->get_top_section(), "Source");
1004}
1005
1006# returns the SourceFile meta, which is the url reference to the URL-encoded
1007# version of Source (the utf8 filename). Added a separate method here for convenience
1008sub get_sourcefile {
1009 my $self = shift (@_);
1010 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1011}
1012
1013# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1014# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1015sub get_assocfile_from_sourcefile {
1016 my $self = shift (@_);
1017
1018 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1019 my $top_section = $self->get_top_section();
1020 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1021
1022 # get the actual filename as it exists on the filesystem which this url refers to
1023 $source_file = &unicode::url_to_filename($source_file);
1024 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1025 return $assocfilename;
1026}
1027
1028# methods for dealing with associated files
1029
1030# a file is associated with a document, NOT a section.
1031# if section is defined it is noted in the data structure
1032# only so that files associated from a particular section
1033# may be removed later (using delete_section_assoc_files)
1034sub associate_file {
1035 my $self = shift (@_);
1036 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1037 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1038
1039 # remove all associated files with the same name
1040 $self->delete_assoc_file ($assoc_filename);
1041
1042 push (@{$self->{'associated_files'}},
1043 [$real_filename, $assoc_filename, $mime_type, $section]);
1044}
1045
1046# returns a list of associated files in the form
1047# [[real_filename, assoc_filename, mimetype], ...]
1048sub get_assoc_files {
1049 my $self = shift (@_);
1050
1051 return $self->{'associated_files'};
1052}
1053
1054# the following two methods used to keep track of original associated files
1055# for incremental building. eg a txt file used by an item file does not end
1056# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1057# file for incremental build
1058sub associate_source_file {
1059 my $self = shift (@_);
1060 my ($full_filename) = @_;
1061
1062 push (@{$self->{'source_assoc_files'}}, $full_filename);
1063
1064}
1065
1066sub get_source_assoc_files {
1067 my $self = shift (@_);
1068
1069 return $self->{'source_assoc_files'};
1070
1071
1072}
1073sub metadata_file {
1074 my $self = shift (@_);
1075 my ($real_filename, $filename) = @_;
1076
1077 push (@{$self->{'metadata_files'}},
1078 [$real_filename, $filename]);
1079}
1080
1081# used for writing out the archiveinf-doc info database, to list all the metadata files
1082sub get_meta_files {
1083 my $self = shift (@_);
1084
1085 return $self->{'metadata_files'};
1086}
1087
1088sub delete_section_assoc_files {
1089 my $self = shift (@_);
1090 my ($section) = @_;
1091
1092 my $i=0;
1093 while ($i < scalar (@{$self->{'associated_files'}})) {
1094 if (defined $self->{'associated_files'}->[$i]->[3] &&
1095 $self->{'associated_files'}->[$i]->[3] eq $section) {
1096 splice (@{$self->{'associated_files'}}, $i, 1);
1097 } else {
1098 $i++;
1099 }
1100 }
1101}
1102
1103sub delete_assoc_file {
1104 my $self = shift (@_);
1105 my ($assoc_filename) = @_;
1106
1107 my $i=0;
1108 while ($i < scalar (@{$self->{'associated_files'}})) {
1109 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1110 splice (@{$self->{'associated_files'}}, $i, 1);
1111 } else {
1112 $i++;
1113 }
1114 }
1115}
1116
1117sub reset_nextsection_ptr {
1118 my $self = shift (@_);
1119 my ($section) = @_;
1120
1121 my $section_ptr = $self->_lookup_section($section);
1122 $section_ptr->{'next_subsection'} = 1;
1123}
1124
11251;
Note: See TracBrowser for help on using the repository browser.