source: main/trunk/greenstone2/perllib/doc.pm@ 23362

Last change on this file since 23362 was 23362, checked in by davidb, 13 years ago

Additional routines (and few upgraded) to help support Greenstone working with filenames under Windows when then go beyond Latin-1 and start turning up in their DOS abbreviated form (e.g. Test~1.txt)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 32.0 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47our $OIDcount = 0;
48
49# rename_method can be 'url', 'none', 'base64'
50sub new {
51 my $class = shift (@_);
52 my ($source_filename, $doc_type, $rename_method) = @_;
53
54
55 my $self = bless {'associated_files'=>[],
56 'subsection_order'=>[],
57 'next_subsection'=>1,
58 'subsections'=>{},
59 'metadata'=>[],
60 'text'=>"",
61 'OIDtype'=>"hash"}, $class;
62
63 # used to set lastmodified here, but this can screw up the HASH ids, so
64 # the docsave processor now calls set_lastmodified
65
66 $self->set_source_path($source_filename);
67
68 if (defined $source_filename) {
69 $source_filename = &util::filename_within_collection($source_filename);
70 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
71 $self->set_source_filename ($source_filename, $rename_method);
72 }
73
74 $self->set_doc_type ($doc_type) if defined $doc_type;
75
76 return $self;
77}
78
79
80sub set_source_path
81{
82 my $self = shift @_;
83 my ($source_filename) = @_;
84
85 if (defined $source_filename) {
86 # On Windows the source_filename can be in terse DOS format
87 # e.g. test~1.txt
88
89 $self->{'terse_source_path'} = $source_filename;
90
91 if (-e $source_filename) {
92 # See if we can do better for Windows with a filename
93 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
94 require Win32;
95 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
96 }
97 else {
98 # For Unix-based systems, there is no difference between the two
99 $self->{'source_path'} = $source_filename;
100 }
101 }
102 else {
103 print STDERR "Warning: In doc::set_source_path(), file\n";
104 print STDERR " $source_filename\n";
105 print STDERR " does not exist\n";
106
107 # (default) Set it to whatever we were given
108 $self->{'source_path'} = $source_filename;
109 }
110 }
111 else {
112 # Previous code for setting source_path allowed for
113 # it to be undefined, so continue this practice
114 $self->{'terse_source_path'} = undef;
115 $self->{'source_path'} = undef;
116 }
117}
118
119
120sub get_source_path
121{
122 my $self = shift @_;
123
124 return $self->{'terse_source_path'};
125}
126
127# set lastmodified for OAI purposes, added by GRB, moved by kjdon
128sub set_lastmodified {
129 my $self = shift (@_);
130
131 my $source_path = $self->{'terse_source_path'};
132
133 if (defined $source_path && (-e $source_path)) {
134
135 my $file_stat = stat($source_path);
136 my $mtime = $file_stat->mtime;
137 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
138 $wday, $yday, $isdst) = localtime($mtime);
139
140 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
141
142 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
143 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
144 }
145}
146
147# clone the $self object
148sub duplicate {
149 my $self = shift (@_);
150
151 my $newobj = {};
152
153 foreach my $k (keys %$self) {
154 $newobj->{$k} = &clone ($self->{$k});
155 }
156
157 bless $newobj, ref($self);
158 return $newobj;
159}
160
161sub clone {
162 my ($from) = @_;
163 my $type = ref ($from);
164
165 if ($type eq "HASH") {
166 my $to = {};
167 foreach my $key (keys %$from) {
168 $to->{$key} = &clone ($from->{$key});
169 }
170 return $to;
171 } elsif ($type eq "ARRAY") {
172 my $to = [];
173 foreach my $v (@$from) {
174 push (@$to, &clone ($v));
175 }
176 return $to;
177 } else {
178 return $from;
179 }
180}
181
182sub set_OIDtype {
183 my $self = shift (@_);
184 my ($type, $metadata) = @_;
185
186 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|dirname|assigned)$/) {
187 $self->{'OIDtype'} = $type;
188 } else {
189 $self->{'OIDtype'} = "hash";
190 }
191
192 if ($type =~ /^assigned$/) {
193 if (defined $metadata) {
194 $self->{'OIDmetadata'} = $metadata;
195 } else {
196 $self->{'OIDmetadata'} = "dc.Identifier";
197 }
198 }
199}
200
201# rename_method can be 'url', 'none', 'base64'
202sub set_source_filename {
203 my $self = shift (@_);
204 my ($source_filename, $rename_method) = @_;
205
206 # Since the gsdlsourcefilename element goes into the doc.xml it has
207 # to be utf8. However, it should also *represent* the source filename
208 # (in the import directory) which may not be utf8 at all.
209 # For instance, if this meta element (gsdlsourcefilename) will be used
210 # by other applications that parse doc.xml in order to locate
211 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
212 # encode the real filename as this is a binary-to-text encoding meaning
213 # that the resulting string is ASCII (utf8). Decoding will give the original.
214
215# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
216
217 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
218 $source_filename = $self->encode_filename($source_filename, $rename_method);
219# my ($srcfilename,$dirname,$suffix)
220# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
221# print STDERR "-> $srcfilename -> ";
222# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
223# $source_filename = &util::filename_cat($dirname, $srcfilename);
224# print STDERR "$source_filename\n";
225
226 $self->set_utf8_metadata_element ($self->get_top_section(),
227 "gsdlsourcefilename",
228 $source_filename);
229}
230
231sub encode_filename {
232 my $self = shift (@_);
233 my ($source_filename, $rename_method) = @_;
234
235 my ($srcfilename,$dirname,$suffix)
236 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
237# print STDERR "-> $srcfilename -> ";
238 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
239 $source_filename = &util::filename_cat($dirname, $srcfilename);
240
241 return $source_filename;
242}
243
244sub set_converted_filename {
245 my $self = shift (@_);
246 my ($converted_filename) = @_;
247
248 # we know the converted filename is utf8
249 $self->set_utf8_metadata_element ($self->get_top_section(),
250 "gsdlconvertedfilename",
251 $converted_filename);
252}
253
254# returns the source_filename as it was provided
255sub get_unmodified_source_filename {
256 my $self = shift (@_);
257
258 return $self->{'terse_source_path'};
259}
260
261# returns the source_filename with whatever rename_method was given
262sub get_source_filename {
263 my $self = shift (@_);
264
265 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
266}
267
268
269
270# returns converted filename if available else returns source filename
271sub get_filename_for_hashing {
272 my $self = shift (@_);
273
274 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
275
276 if (!defined $filename) {
277 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
278 # if NULPlug processed file, then don't give a filename
279 if (defined $plugin_name && $plugin_name eq "NULPlug") {
280 $filename = undef;
281 } else { # returns the URL encoded source filename!
282 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
283 }
284 }
285 return $filename;
286}
287
288sub set_doc_type {
289 my $self = shift (@_);
290 my ($doc_type) = @_;
291
292 $self->set_metadata_element ($self->get_top_section(),
293 "gsdldoctype",
294 $doc_type);
295}
296
297# returns the gsdldoctype as it was provided
298# the default of "indexed_doc" is used if no document
299# type was provided
300sub get_doc_type {
301 my $self = shift (@_);
302
303 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
304 return $doc_type if (defined $doc_type);
305 return "indexed_doc";
306}
307
308
309# look up the reference to the a particular section
310sub _lookup_section {
311 my $self = shift (@_);
312 my ($section) = @_;
313
314 my ($num);
315 my $sectionref = $self;
316
317 while (defined $section && $section ne "") {
318
319 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
320
321 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
322
323 $section = "" unless defined $section;
324
325
326 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
327 $sectionref = $sectionref->{'subsections'}->{$num};
328 } else {
329 return undef;
330 }
331 }
332
333 return $sectionref;
334}
335
336# calculate OID by hashing the contents of the document
337sub _calc_OID {
338 my $self = shift (@_);
339 my ($filename) = @_;
340
341
342 my $osexe = &util::get_os_exe();
343
344 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
345 $ENV{'GSDLOS'},"hashfile$osexe");
346
347 my $result = "NULL";
348
349
350 if (-e "$hashfile_exe") {
351# $result = `\"$hashfile_exe\" \"$filename\"`;
352# $result = `hashfile$osexe \"$filename\" 2>&1`;
353 $result = `hashfile$osexe \"$filename\"`;
354
355 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
356 } else {
357 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
358 }
359 return "HASH$result";
360}
361
362# methods dealing with OID, not groups of them.
363
364# if $OID is not provided one is calculated
365sub set_OID {
366 my $self = shift (@_);
367 my ($OID) = @_;
368
369 my $use_hash_oid = 0;
370 # if an OID wasn't provided calculate one
371 if (!defined $OID) {
372 $OID = "NULL";
373 if ($self->{'OIDtype'} =~ /^hash/) {
374 $use_hash_oid = 1;
375 } elsif ($self->{'OIDtype'} eq "incremental") {
376 $OID = "D" . $OIDcount;
377 $OIDcount ++;
378
379 } elsif ($self->{'OIDtype'} eq "dirname") {
380 $OID = 'J';
381 my $filename = $self->get_source_filename();
382 if (defined($filename)) { # && -e $filename) {
383 $OID = &File::Basename::dirname($filename);
384 if (defined $OID) {
385 $OID = 'J'.&File::Basename::basename($OID);
386 $OID = &util::tidy_up_oid($OID);
387 } else {
388 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
389 $use_hash_oid = 1;
390 }
391 } else {
392 print STDERR "Failed to find filename, generating hash id\n";
393 $use_hash_oid = 1;
394 }
395
396 } elsif ($self->{'OIDtype'} eq "assigned") {
397 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
398 if (defined $identifier && $identifier ne "") {
399 $OID = $identifier;
400 $OID = &util::tidy_up_oid($OID);
401 } else {
402 # need a hash id
403 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
404 $use_hash_oid = 1;
405 }
406
407 } else {
408 $use_hash_oid = 1;
409 }
410
411 if ($use_hash_oid) {
412 my $hash_on_file = 1;
413 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
414 $hash_on_file = 0;
415 }
416 if ($hash_on_file) {
417 # "hash" OID - feed file to hashfile.exe
418 my $filename = $self->get_filename_for_hashing();
419
420 # -z: don't want to hash on the file if it is zero size
421 if (defined($filename) && -e $filename && !-z $filename) {
422 $OID = $self->_calc_OID ($filename);
423 } else {
424 $hash_on_file = 0;
425 }
426 }
427 if (!$hash_on_file) {
428 my $filename = &util::get_tmp_filename();
429 if (!open (OUTFILE, ">:utf8", $filename)) {
430 print STDERR "doc::set_OID could not write to $filename\n";
431 } else {
432 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
433 print OUTFILE $doc_text;
434 close (OUTFILE);
435 }
436 $OID = $self->_calc_OID ($filename);
437 &util::rm ($filename);
438 }
439 }
440 }
441 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
442}
443
444# this uses hashdoc (embedded c thingy) which is faster but still
445# needs a little work to be suffiently stable
446sub ___set_OID {
447 my $self = shift (@_);
448 my ($OID) = @_;
449
450 # if an OID wasn't provided then calculate hash value based on document
451 if (!defined $OID)
452 {
453 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
454 my $hash_len = length($hash_text);
455
456 $OID = &hashdoc::buffer($hash_text,$hash_len);
457 }
458
459 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
460}
461
462# returns the OID for this document
463sub get_OID {
464 my $self = shift (@_);
465 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
466 return $OID if (defined $OID);
467 return "NULL";
468}
469
470sub delete_OID {
471 my $self = shift (@_);
472
473 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
474}
475
476
477# methods for manipulating section names
478
479# returns the name of the top-most section (the top
480# level of the document
481sub get_top_section {
482 my $self = shift (@_);
483
484 return "";
485}
486
487# returns a section
488sub get_parent_section {
489 my $self = shift (@_);
490 my ($section) = @_;
491
492 $section =~ s/(^|\.)\d+$//;
493
494 return $section;
495}
496
497# returns the first child section (or the end child
498# if there isn't any)
499sub get_begin_child {
500 my $self = shift (@_);
501 my ($section) = @_;
502
503 my $section_ptr = $self->_lookup_section($section);
504 return "" unless defined $section_ptr;
505
506 if (defined $section_ptr->{'subsection_order'}->[0]) {
507 return "$section.$section_ptr->{'subsection_order'}->[0]";
508 }
509
510 return $self->get_end_child ($section);
511}
512
513# returns the next child of a parent section
514sub get_next_child {
515 my $self = shift (@_);
516 my ($section) = @_;
517
518 my $parent_section = $self->get_parent_section($section);
519 my $parent_section_ptr = $self->_lookup_section($parent_section);
520 return undef unless defined $parent_section_ptr;
521
522 my ($section_num) = $section =~ /(\d+)$/;
523 return undef unless defined $section_num;
524
525 my $i = 0;
526 my $section_order = $parent_section_ptr->{'subsection_order'};
527 while ($i < scalar(@$section_order)) {
528 last if $section_order->[$i] eq $section_num;
529 $i++;
530 }
531
532 $i++; # the next child
533 if ($i < scalar(@$section_order)) {
534 return $section_order->[$i] if $parent_section eq "";
535 return "$parent_section.$section_order->[$i]";
536 }
537
538 # no more sections in this level
539 return undef;
540}
541
542# returns a reference to a list of children
543sub get_children {
544 my $self = shift (@_);
545 my ($section) = @_;
546
547 my $section_ptr = $self->_lookup_section($section);
548 return [] unless defined $section_ptr;
549
550 my @children = @{$section_ptr->{'subsection_order'}};
551
552 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
553 return \@children;
554}
555
556# returns the child section one past the last one (which
557# is coded as "0")
558sub get_end_child {
559 my $self = shift (@_);
560 my ($section) = @_;
561
562 return $section . ".0" unless $section eq "";
563 return "0";
564}
565
566# returns the next section in book order
567sub get_next_section {
568 my $self = shift (@_);
569 my ($section) = @_;
570
571 return undef unless defined $section;
572
573 my $section_ptr = $self->_lookup_section($section);
574 return undef unless defined $section_ptr;
575
576 # first try to find first child
577 if (defined $section_ptr->{'subsection_order'}->[0]) {
578 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
579 return "$section.$section_ptr->{'subsection_order'}->[0]";
580 }
581
582 do {
583 # try to find sibling
584 my $next_child = $self->get_next_child ($section);
585 return $next_child if (defined $next_child);
586
587 # move up one level
588 $section = $self->get_parent_section ($section);
589 } while $section =~ /\d/;
590
591 return undef;
592}
593
594sub is_leaf_section {
595 my $self = shift (@_);
596 my ($section) = @_;
597
598 my $section_ptr = $self->_lookup_section($section);
599 return 1 unless defined $section_ptr;
600
601 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
602}
603
604# methods for dealing with sections
605
606# returns the name of the inserted section
607sub insert_section {
608 my $self = shift (@_);
609 my ($before_section) = @_;
610
611 # get the child to insert before and its parent section
612 my $parent_section = "";
613 my $before_child = "0";
614 my @before_section = split (/\./, $before_section);
615 if (scalar(@before_section) > 0) {
616 $before_child = pop (@before_section);
617 $parent_section = join (".", @before_section);
618 }
619
620 my $parent_section_ptr = $self->_lookup_section($parent_section);
621 if (!defined $parent_section_ptr) {
622 print STDERR "doc::insert_section couldn't find parent section " .
623 "$parent_section\n";
624 return;
625 }
626
627 # get the next section number
628 my $section_num = $parent_section_ptr->{'next_subsection'}++;
629
630 my $i = 0;
631 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
632 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
633 $i++;
634 }
635
636 # insert the section number into the order list
637 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
638
639 # add this section to the parent section
640 my $section_ptr = {'subsection_order'=>[],
641 'next_subsection'=>1,
642 'subsections'=>{},
643 'metadata'=>[],
644 'text'=>""};
645 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
646
647 # work out the full section number
648 my $section = $parent_section;
649 $section .= "." unless $section eq "";
650 $section .= $section_num;
651
652 return $section;
653}
654
655# creates a pre-named section
656sub create_named_section {
657 my $self = shift (@_);
658 my ($mastersection) = @_;
659
660 my ($num);
661 my $section = $mastersection;
662 my $sectionref = $self;
663
664 while ($section ne "") {
665 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
666 $num =~ s/^0+(\d)/$1/; # remove leading 0s
667 $section = "" unless defined $section;
668
669 if (defined $num) {
670 if (!defined $sectionref->{'subsections'}->{$num}) {
671 push (@{$sectionref->{'subsection_order'}}, $num);
672 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
673 'next_subsection'=>1,
674 'subsections'=>{},
675 'metadata'=>[],
676 'text'=>""};
677 if ($num >= $sectionref->{'next_subsection'}) {
678 $sectionref->{'next_subsection'} = $num + 1;
679 }
680 }
681 $sectionref = $sectionref->{'subsections'}->{$num};
682
683 } else {
684 print STDERR "doc::create_named_section couldn't create section ";
685 print STDERR "$mastersection\n";
686 last;
687 }
688 }
689}
690
691# returns a reference to a list of subsections
692sub list_subsections {
693 my $self = shift (@_);
694 my ($section) = @_;
695
696 my $section_ptr = $self->_lookup_section ($section);
697 if (!defined $section_ptr) {
698 print STDERR "doc::list_subsections couldn't find section $section\n";
699 return [];
700 }
701
702 return [@{$section_ptr->{'subsection_order'}}];
703}
704
705sub delete_section {
706 my $self = shift (@_);
707 my ($section) = @_;
708
709# my $section_ptr = {'subsection_order'=>[],
710# 'next_subsection'=>1,
711# 'subsections'=>{},
712# 'metadata'=>[],
713# 'text'=>""};
714
715 # if this is the top section reset everything
716 if ($section eq "") {
717 $self->{'subsection_order'} = [];
718 $self->{'subsections'} = {};
719 $self->{'metadata'} = [];
720 $self->{'text'} = "";
721 return;
722 }
723
724 # find the parent of the section to delete
725 my $parent_section = "";
726 my $child = "0";
727 my @section = split (/\./, $section);
728 if (scalar(@section) > 0) {
729 $child = pop (@section);
730 $parent_section = join (".", @section);
731 }
732
733 my $parent_section_ptr = $self->_lookup_section($parent_section);
734 if (!defined $parent_section_ptr) {
735 print STDERR "doc::delete_section couldn't find parent section " .
736 "$parent_section\n";
737 return;
738 }
739
740 # remove this section from the subsection_order list
741 my $i = 0;
742 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
743 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
744 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
745 last;
746 }
747 $i++;
748 }
749
750 # remove this section from the subsection hash
751 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
752 undef $parent_section_ptr->{'subsections'}->{$child};
753 }
754}
755
756#--
757# methods for dealing with metadata
758
759# set_metadata_element and get_metadata_element are for metadata
760# which should only have one value. add_meta_data and get_metadata
761# are for metadata which can have more than one value.
762
763# returns the first metadata value which matches field
764
765# This version of get metadata element works much like the one above,
766# except it allows for the namespace portion of a metadata element to
767# be ignored, thus if you are searching for dc.Title, the first piece
768# of matching metadata ending with the name Title (once any namespace
769# is removed) would be returned.
770# 28-11-2003 John Thompson
771sub get_metadata_element {
772 my $self = shift (@_);
773 my ($section, $field, $ignore_namespace) = @_;
774 my ($data);
775
776 $ignore_namespace = 0 unless defined $ignore_namespace;
777
778 my $section_ptr = $self->_lookup_section($section);
779 if (!defined $section_ptr) {
780 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
781 return;
782 }
783
784 # Remove the namespace if we are being told to ignore them
785 if($ignore_namespace) {
786 $field =~ s/^\w*\.//;
787 }
788
789 foreach $data (@{$section_ptr->{'metadata'}}) {
790
791 my $data_name = $data->[0];
792
793 # Remove the any namespace if we are being told to ignore them
794 if($ignore_namespace) {
795 $data_name =~ s/^\w*\.//;
796 }
797 $data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
798 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
799 }
800
801 return undef; # was not found
802}
803
804# returns a list of the form [value1, value2, ...]
805sub get_metadata {
806 my $self = shift (@_);
807 my ($section, $field, $ignore_namespace) = @_;
808 my ($data);
809
810 $ignore_namespace = 0 unless defined $ignore_namespace;
811
812 my $section_ptr = $self->_lookup_section($section);
813 if (!defined $section_ptr) {
814 print STDERR "doc::get_metadata couldn't find section ",
815 $section, "\n";
816 return;
817 }
818
819 # Remove the any namespace if we are being told to ignore them
820 if($ignore_namespace) {
821 $field =~ s/^\w*\.//;
822 }
823
824 my @metadata = ();
825 foreach $data (@{$section_ptr->{'metadata'}}) {
826
827 my $data_name = $data->[0];
828 # Remove the any namespace if we are being told to ignore them
829 if($ignore_namespace) {
830 $data_name =~ s/^\w*\.//;
831 }
832 $data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
833 push (@metadata, $data->[1]) if ($data_name eq $field);
834 }
835
836 return \@metadata;
837}
838
839# returns a list of the form [[field,value],[field,value],...]
840sub get_all_metadata {
841 my $self = shift (@_);
842 my ($section) = @_;
843
844 my $section_ptr = $self->_lookup_section($section);
845 if (!defined $section_ptr) {
846 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
847 return;
848 }
849
850 return $section_ptr->{'metadata'};
851}
852
853# $value is optional
854sub delete_metadata {
855 my $self = shift (@_);
856 my ($section, $field, $value) = @_;
857
858 my $section_ptr = $self->_lookup_section($section);
859 if (!defined $section_ptr) {
860 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
861 return;
862 }
863
864 my $i = 0;
865 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
866 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
867 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
868 splice (@{$section_ptr->{'metadata'}}, $i, 1);
869 } else {
870 $i++;
871 }
872 }
873}
874
875sub delete_all_metadata {
876 my $self = shift (@_);
877 my ($section) = @_;
878
879 my $section_ptr = $self->_lookup_section($section);
880 if (!defined $section_ptr) {
881 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
882 return;
883 }
884
885 $section_ptr->{'metadata'} = [];
886}
887
888sub set_metadata_element {
889 my $self = shift (@_);
890 my ($section, $field, $value) = @_;
891
892 $self->set_utf8_metadata_element ($section, $field,
893 &unicode::ascii2utf8(\$value));
894}
895
896# set_utf8_metadata_element assumes the text has already been
897# converted to the UTF-8 encoding.
898sub set_utf8_metadata_element {
899 my $self = shift (@_);
900 my ($section, $field, $value) = @_;
901
902 $self->delete_metadata ($section, $field);
903 $self->add_utf8_metadata ($section, $field, $value);
904}
905
906
907# add_metadata assumes the text is in (extended) ascii form. For
908# text which has already been converted to the UTF-8 format use
909# add_utf8_metadata.
910sub add_metadata {
911 my $self = shift (@_);
912 my ($section, $field, $value) = @_;
913
914 $self->add_utf8_metadata ($section, $field,
915 &unicode::ascii2utf8(\$value));
916}
917
918sub add_utf8_metadata {
919 my $self = shift (@_);
920 my ($section, $field, $value) = @_;
921
922# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
923# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
924# print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
925
926 my $section_ptr = $self->_lookup_section($section);
927 if (!defined $section_ptr) {
928 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
929 return;
930 }
931 if (!defined $value) {
932 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
933 return;
934 }
935 if (!defined $field) {
936 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
937 return;
938 }
939
940 #print STDERR "###$field=$value\n";
941
942 # For now, supress this check. Given that text data read in is now
943 # Unicode aware, then the following block of code can (ironically enough)
944 # cause our unicode compliant string to be re-encoded (leading to
945 # a double-encoded UTF-8 string, which we definitely don't want!).
946
947
948 # double check that the value is utf-8
949# if (!&unicode::check_is_utf8($value)) {
950# print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
951# &unicode::ensure_utf8(\$value);
952# print STDERR " Tried converting to utf8: $value\n";
953# }
954
955 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
956}
957
958
959# methods for dealing with text
960
961# returns the text for a section
962sub get_text {
963 my $self = shift (@_);
964 my ($section) = @_;
965
966 my $section_ptr = $self->_lookup_section($section);
967 if (!defined $section_ptr) {
968 print STDERR "doc::get_text couldn't find section " .
969 "$section\n";
970 return "";
971 }
972
973 return $section_ptr->{'text'};
974}
975
976# returns the (utf-8 encoded) length of the text for a section
977sub get_text_length {
978 my $self = shift (@_);
979 my ($section) = @_;
980
981 my $section_ptr = $self->_lookup_section($section);
982 if (!defined $section_ptr) {
983 print STDERR "doc::get_text_length couldn't find section " .
984 "$section\n";
985 return 0;
986 }
987
988 return length ($section_ptr->{'text'});
989}
990
991# returns the total length for all the sections
992sub get_total_text_length {
993 my $self = shift (@_);
994
995 my $section = $self->get_top_section();
996 my $length = 0;
997 while (defined $section) {
998 $length += $self->get_text_length($section);
999 $section = $self->get_next_section($section);
1000 }
1001 return $length;
1002}
1003
1004sub delete_text {
1005 my $self = shift (@_);
1006 my ($section) = @_;
1007
1008 my $section_ptr = $self->_lookup_section($section);
1009 if (!defined $section_ptr) {
1010 print STDERR "doc::delete_text couldn't find section " .
1011 "$section\n";
1012 return;
1013 }
1014
1015 $section_ptr->{'text'} = "";
1016}
1017
1018# add_text assumes the text is in (extended) ascii form. For
1019# text which has been already converted to the UTF-8 format
1020# use add_utf8_text.
1021sub add_text {
1022 my $self = shift (@_);
1023 my ($section, $text) = @_;
1024
1025 # convert the text to UTF-8 encoded unicode characters
1026 # and add the text
1027 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1028}
1029
1030
1031# add_utf8_text assumes the text to be added has already
1032# been converted to the UTF-8 encoding. For ascii text use
1033# add_text
1034sub add_utf8_text {
1035 my $self = shift (@_);
1036 my ($section, $text) = @_;
1037
1038 my $section_ptr = $self->_lookup_section($section);
1039 if (!defined $section_ptr) {
1040 print STDERR "doc::add_utf8_text couldn't find section " .
1041 "$section\n";
1042 return;
1043 }
1044
1045 $section_ptr->{'text'} .= $text;
1046}
1047
1048# returns the Source meta, which is the utf8 filename generated.
1049# Added a separate method here for convenience
1050sub get_source {
1051 my $self = shift (@_);
1052 return $self->get_metadata_element ($self->get_top_section(), "Source");
1053}
1054
1055# returns the SourceFile meta, which is the url reference to the URL-encoded
1056# version of Source (the utf8 filename). Added a separate method here for convenience
1057sub get_sourcefile {
1058 my $self = shift (@_);
1059 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1060}
1061
1062# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1063# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1064sub get_assocfile_from_sourcefile {
1065 my $self = shift (@_);
1066
1067 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1068 my $top_section = $self->get_top_section();
1069 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1070
1071 # get the actual filename as it exists on the filesystem which this url refers to
1072 $source_file = &unicode::url_to_filename($source_file);
1073 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1074 return $assocfilename;
1075}
1076
1077# methods for dealing with associated files
1078
1079# a file is associated with a document, NOT a section.
1080# if section is defined it is noted in the data structure
1081# only so that files associated from a particular section
1082# may be removed later (using delete_section_assoc_files)
1083sub associate_file {
1084 my $self = shift (@_);
1085 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1086 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1087
1088 # remove all associated files with the same name
1089 $self->delete_assoc_file ($assoc_filename);
1090
1091 push (@{$self->{'associated_files'}},
1092 [$real_filename, $assoc_filename, $mime_type, $section]);
1093}
1094
1095# returns a list of associated files in the form
1096# [[real_filename, assoc_filename, mimetype], ...]
1097sub get_assoc_files {
1098 my $self = shift (@_);
1099
1100 return $self->{'associated_files'};
1101}
1102
1103# the following two methods used to keep track of original associated files
1104# for incremental building. eg a txt file used by an item file does not end
1105# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1106# file for incremental build
1107sub associate_source_file {
1108 my $self = shift (@_);
1109 my ($full_filename) = @_;
1110
1111 push (@{$self->{'source_assoc_files'}}, $full_filename);
1112
1113}
1114
1115sub get_source_assoc_files {
1116 my $self = shift (@_);
1117
1118 return $self->{'source_assoc_files'};
1119
1120
1121}
1122sub metadata_file {
1123 my $self = shift (@_);
1124 my ($real_filename, $filename) = @_;
1125
1126 push (@{$self->{'metadata_files'}},
1127 [$real_filename, $filename]);
1128}
1129
1130# used for writing out the archiveinf-doc info database, to list all the metadata files
1131sub get_meta_files {
1132 my $self = shift (@_);
1133
1134 return $self->{'metadata_files'};
1135}
1136
1137sub delete_section_assoc_files {
1138 my $self = shift (@_);
1139 my ($section) = @_;
1140
1141 my $i=0;
1142 while ($i < scalar (@{$self->{'associated_files'}})) {
1143 if (defined $self->{'associated_files'}->[$i]->[3] &&
1144 $self->{'associated_files'}->[$i]->[3] eq $section) {
1145 splice (@{$self->{'associated_files'}}, $i, 1);
1146 } else {
1147 $i++;
1148 }
1149 }
1150}
1151
1152sub delete_assoc_file {
1153 my $self = shift (@_);
1154 my ($assoc_filename) = @_;
1155
1156 my $i=0;
1157 while ($i < scalar (@{$self->{'associated_files'}})) {
1158 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1159 splice (@{$self->{'associated_files'}}, $i, 1);
1160 } else {
1161 $i++;
1162 }
1163 }
1164}
1165
1166sub reset_nextsection_ptr {
1167 my $self = shift (@_);
1168 my ($section) = @_;
1169
1170 my $section_ptr = $self->_lookup_section($section);
1171 $section_ptr->{'next_subsection'} = 1;
1172}
1173
11741;
Note: See TracBrowser for help on using the repository browser.