source: main/trunk/greenstone2/perllib/doc.pm@ 23827

Last change on this file since 23827 was 23827, checked in by sjm84, 13 years ago

Phase four of commiting the files changed to extend the DSpace exporting capabilities to include more than just dublin core metadata

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 33.5 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47our $OIDcount = 0;
48
49# rename_method can be 'url', 'none', 'base64'
50sub new {
51 my $class = shift (@_);
52 my ($source_filename, $doc_type, $rename_method) = @_;
53
54
55 my $self = bless {'associated_files'=>[],
56 'subsection_order'=>[],
57 'next_subsection'=>1,
58 'subsections'=>{},
59 'metadata'=>[],
60 'text'=>"",
61 'OIDtype'=>"hash"}, $class;
62
63 # used to set lastmodified here, but this can screw up the HASH ids, so
64 # the docsave processor now calls set_lastmodified
65
66 $self->set_source_path($source_filename);
67
68 if (defined $source_filename) {
69 $source_filename = &util::filename_within_collection($source_filename);
70 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
71 $self->set_source_filename ($source_filename, $rename_method);
72 }
73
74 $self->set_doc_type ($doc_type) if defined $doc_type;
75
76 return $self;
77}
78
79
80sub set_source_path
81{
82 my $self = shift @_;
83 my ($source_filename) = @_;
84
85 if (defined $source_filename) {
86 # On Windows the source_filename can be in terse DOS format
87 # e.g. test~1.txt
88
89 $self->{'terse_source_path'} = $source_filename;
90
91 if (-e $source_filename) {
92 # See if we can do better for Windows with a filename
93 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
94 require Win32;
95 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
96 }
97 else {
98 # For Unix-based systems, there is no difference between the two
99 $self->{'source_path'} = $source_filename;
100 }
101 }
102 else {
103 print STDERR "Warning: In doc::set_source_path(), file\n";
104 print STDERR " $source_filename\n";
105 print STDERR " does not exist\n";
106
107 # (default) Set it to whatever we were given
108 $self->{'source_path'} = $source_filename;
109 }
110 }
111 else {
112 # Previous code for setting source_path allowed for
113 # it to be undefined, so continue this practice
114 $self->{'terse_source_path'} = undef;
115 $self->{'source_path'} = undef;
116 }
117}
118
119
120sub get_source_path
121{
122 my $self = shift @_;
123
124 return $self->{'terse_source_path'};
125}
126
127# set lastmodified for OAI purposes, added by GRB, moved by kjdon
128sub set_lastmodified {
129 my $self = shift (@_);
130
131 my $source_path = $self->{'terse_source_path'};
132
133 if (defined $source_path && (-e $source_path)) {
134
135 my $file_stat = stat($source_path);
136 my $mtime = $file_stat->mtime;
137 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
138 $wday, $yday, $isdst) = localtime($mtime);
139
140 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
141
142 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
143 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
144 }
145}
146
147# clone the $self object
148sub duplicate {
149 my $self = shift (@_);
150
151 my $newobj = {};
152
153 foreach my $k (keys %$self) {
154 $newobj->{$k} = &clone ($self->{$k});
155 }
156
157 bless $newobj, ref($self);
158 return $newobj;
159}
160
161sub clone {
162 my ($from) = @_;
163 my $type = ref ($from);
164
165 if ($type eq "HASH") {
166 my $to = {};
167 foreach my $key (keys %$from) {
168 $to->{$key} = &clone ($from->{$key});
169 }
170 return $to;
171 } elsif ($type eq "ARRAY") {
172 my $to = [];
173 foreach my $v (@$from) {
174 push (@$to, &clone ($v));
175 }
176 return $to;
177 } else {
178 return $from;
179 }
180}
181
182sub set_OIDtype {
183 my $self = shift (@_);
184 my ($type, $metadata) = @_;
185
186 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|dirname|assigned)$/) {
187 $self->{'OIDtype'} = $type;
188 } else {
189 $self->{'OIDtype'} = "hash";
190 }
191
192 if ($type =~ /^assigned$/) {
193 if (defined $metadata) {
194 $self->{'OIDmetadata'} = $metadata;
195 } else {
196 $self->{'OIDmetadata'} = "dc.Identifier";
197 }
198 }
199}
200
201# rename_method can be 'url', 'none', 'base64'
202sub set_source_filename {
203 my $self = shift (@_);
204 my ($source_filename, $rename_method) = @_;
205
206 # Since the gsdlsourcefilename element goes into the doc.xml it has
207 # to be utf8. However, it should also *represent* the source filename
208 # (in the import directory) which may not be utf8 at all.
209 # For instance, if this meta element (gsdlsourcefilename) will be used
210 # by other applications that parse doc.xml in order to locate
211 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
212 # encode the real filename as this is a binary-to-text encoding meaning
213 # that the resulting string is ASCII (utf8). Decoding will give the original.
214
215# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
216
217 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
218 $source_filename = $self->encode_filename($source_filename, $rename_method);
219# my ($srcfilename,$dirname,$suffix)
220# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
221# print STDERR "-> $srcfilename -> ";
222# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
223# $source_filename = &util::filename_cat($dirname, $srcfilename);
224# print STDERR "$source_filename\n";
225
226 $self->set_utf8_metadata_element ($self->get_top_section(),
227 "gsdlsourcefilename",
228 $source_filename);
229}
230
231sub encode_filename {
232 my $self = shift (@_);
233 my ($source_filename, $rename_method) = @_;
234
235 my ($srcfilename,$dirname,$suffix)
236 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
237# print STDERR "-> $srcfilename -> ";
238 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
239 $source_filename = &util::filename_cat($dirname, $srcfilename);
240
241 return $source_filename;
242}
243
244sub set_converted_filename {
245 my $self = shift (@_);
246 my ($converted_filename) = @_;
247
248 # we know the converted filename is utf8
249 $self->set_utf8_metadata_element ($self->get_top_section(),
250 "gsdlconvertedfilename",
251 $converted_filename);
252}
253
254# returns the source_filename as it was provided
255sub get_unmodified_source_filename {
256 my $self = shift (@_);
257
258 return $self->{'terse_source_path'};
259}
260
261# returns the source_filename with whatever rename_method was given
262sub get_source_filename {
263 my $self = shift (@_);
264
265 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
266}
267
268
269
270# returns converted filename if available else returns source filename
271sub get_filename_for_hashing {
272 my $self = shift (@_);
273
274 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
275
276 if (!defined $filename) {
277 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
278 # if NULPlug processed file, then don't give a filename
279 if (defined $plugin_name && $plugin_name eq "NULPlug") {
280 $filename = undef;
281 } else { # returns the URL encoded source filename!
282 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
283 }
284 }
285
286 if (!&util::filename_is_absolute($filename)) {
287 $filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename);
288 }
289
290 return $filename;
291}
292
293sub set_doc_type {
294 my $self = shift (@_);
295 my ($doc_type) = @_;
296
297 $self->set_metadata_element ($self->get_top_section(),
298 "gsdldoctype",
299 $doc_type);
300}
301
302# returns the gsdldoctype as it was provided
303# the default of "indexed_doc" is used if no document
304# type was provided
305sub get_doc_type {
306 my $self = shift (@_);
307
308 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
309 return $doc_type if (defined $doc_type);
310 return "indexed_doc";
311}
312
313
314# look up the reference to the a particular section
315sub _lookup_section {
316 my $self = shift (@_);
317 my ($section) = @_;
318
319 my ($num);
320 my $sectionref = $self;
321
322 while (defined $section && $section ne "") {
323
324 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
325
326 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
327
328 $section = "" unless defined $section;
329
330
331 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
332 $sectionref = $sectionref->{'subsections'}->{$num};
333 } else {
334 return undef;
335 }
336 }
337
338 return $sectionref;
339}
340
341# calculate OID by hashing the contents of the document
342sub _calc_OID {
343 my $self = shift (@_);
344 my ($filename) = @_;
345
346
347 my $osexe = &util::get_os_exe();
348
349 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
350 $ENV{'GSDLOS'},"hashfile$osexe");
351
352 my $result = "NULL";
353
354
355 if (-e "$hashfile_exe") {
356# $result = `\"$hashfile_exe\" \"$filename\"`;
357# $result = `hashfile$osexe \"$filename\" 2>&1`;
358 $result = `hashfile$osexe \"$filename\"`;
359
360 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
361 } else {
362 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
363 }
364 return "HASH$result";
365}
366
367# methods dealing with OID, not groups of them.
368
369# if $OID is not provided one is calculated
370sub set_OID {
371 my $self = shift (@_);
372 my ($OID) = @_;
373
374 my $use_hash_oid = 0;
375 # if an OID wasn't provided calculate one
376 if (!defined $OID) {
377 $OID = "NULL";
378 if ($self->{'OIDtype'} =~ /^hash/) {
379 $use_hash_oid = 1;
380 } elsif ($self->{'OIDtype'} eq "incremental") {
381 $OID = "D" . $OIDcount;
382 $OIDcount ++;
383
384 } elsif ($self->{'OIDtype'} eq "dirname") {
385 $OID = 'J';
386 my $filename = $self->get_source_filename();
387 if (defined($filename)) { # && -e $filename) {
388 $OID = &File::Basename::dirname($filename);
389 if (defined $OID) {
390 $OID = 'J'.&File::Basename::basename($OID);
391 $OID = &util::tidy_up_oid($OID);
392 } else {
393 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
394 $use_hash_oid = 1;
395 }
396 } else {
397 print STDERR "Failed to find filename, generating hash id\n";
398 $use_hash_oid = 1;
399 }
400
401 } elsif ($self->{'OIDtype'} eq "assigned") {
402 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
403 if (defined $identifier && $identifier ne "") {
404 $OID = $identifier;
405 $OID = &util::tidy_up_oid($OID);
406 } else {
407 # need a hash id
408 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
409 $use_hash_oid = 1;
410 }
411
412 } else {
413 $use_hash_oid = 1;
414 }
415
416 if ($use_hash_oid) {
417 my $hash_on_file = 1;
418 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
419 $hash_on_file = 0;
420 }
421 if ($hash_on_file) {
422 # "hash" OID - feed file to hashfile.exe
423 my $filename = $self->get_filename_for_hashing();
424
425 # -z: don't want to hash on the file if it is zero size
426 if (defined($filename) && -e $filename && !-z $filename) {
427 $OID = $self->_calc_OID ($filename);
428 } else {
429 $hash_on_file = 0;
430 }
431 }
432 if (!$hash_on_file) {
433 my $filename = &util::get_tmp_filename();
434 if (!open (OUTFILE, ">:utf8", $filename)) {
435 print STDERR "doc::set_OID could not write to $filename\n";
436 } else {
437 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
438 print OUTFILE $doc_text;
439 close (OUTFILE);
440 }
441 $OID = $self->_calc_OID ($filename);
442 &util::rm ($filename);
443 }
444 }
445 }
446 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
447}
448
449# this uses hashdoc (embedded c thingy) which is faster but still
450# needs a little work to be suffiently stable
451sub ___set_OID {
452 my $self = shift (@_);
453 my ($OID) = @_;
454
455 # if an OID wasn't provided then calculate hash value based on document
456 if (!defined $OID)
457 {
458 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
459 my $hash_len = length($hash_text);
460
461 $OID = &hashdoc::buffer($hash_text,$hash_len);
462 }
463
464 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
465}
466
467# returns the OID for this document
468sub get_OID {
469 my $self = shift (@_);
470 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
471 return $OID if (defined $OID);
472 return "NULL";
473}
474
475sub delete_OID {
476 my $self = shift (@_);
477
478 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
479}
480
481
482# methods for manipulating section names
483
484# returns the name of the top-most section (the top
485# level of the document
486sub get_top_section {
487 my $self = shift (@_);
488
489 return "";
490}
491
492# returns a section
493sub get_parent_section {
494 my $self = shift (@_);
495 my ($section) = @_;
496
497 $section =~ s/(^|\.)\d+$//;
498
499 return $section;
500}
501
502# returns the first child section (or the end child
503# if there isn't any)
504sub get_begin_child {
505 my $self = shift (@_);
506 my ($section) = @_;
507
508 my $section_ptr = $self->_lookup_section($section);
509 return "" unless defined $section_ptr;
510
511 if (defined $section_ptr->{'subsection_order'}->[0]) {
512 return "$section.$section_ptr->{'subsection_order'}->[0]";
513 }
514
515 return $self->get_end_child ($section);
516}
517
518# returns the next child of a parent section
519sub get_next_child {
520 my $self = shift (@_);
521 my ($section) = @_;
522
523 my $parent_section = $self->get_parent_section($section);
524 my $parent_section_ptr = $self->_lookup_section($parent_section);
525 return undef unless defined $parent_section_ptr;
526
527 my ($section_num) = $section =~ /(\d+)$/;
528 return undef unless defined $section_num;
529
530 my $i = 0;
531 my $section_order = $parent_section_ptr->{'subsection_order'};
532 while ($i < scalar(@$section_order)) {
533 last if $section_order->[$i] eq $section_num;
534 $i++;
535 }
536
537 $i++; # the next child
538 if ($i < scalar(@$section_order)) {
539 return $section_order->[$i] if $parent_section eq "";
540 return "$parent_section.$section_order->[$i]";
541 }
542
543 # no more sections in this level
544 return undef;
545}
546
547# returns a reference to a list of children
548sub get_children {
549 my $self = shift (@_);
550 my ($section) = @_;
551
552 my $section_ptr = $self->_lookup_section($section);
553 return [] unless defined $section_ptr;
554
555 my @children = @{$section_ptr->{'subsection_order'}};
556
557 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
558 return \@children;
559}
560
561# returns the child section one past the last one (which
562# is coded as "0")
563sub get_end_child {
564 my $self = shift (@_);
565 my ($section) = @_;
566
567 return $section . ".0" unless $section eq "";
568 return "0";
569}
570
571# returns the next section in book order
572sub get_next_section {
573 my $self = shift (@_);
574 my ($section) = @_;
575
576 return undef unless defined $section;
577
578 my $section_ptr = $self->_lookup_section($section);
579 return undef unless defined $section_ptr;
580
581 # first try to find first child
582 if (defined $section_ptr->{'subsection_order'}->[0]) {
583 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
584 return "$section.$section_ptr->{'subsection_order'}->[0]";
585 }
586
587 do {
588 # try to find sibling
589 my $next_child = $self->get_next_child ($section);
590 return $next_child if (defined $next_child);
591
592 # move up one level
593 $section = $self->get_parent_section ($section);
594 } while $section =~ /\d/;
595
596 return undef;
597}
598
599sub is_leaf_section {
600 my $self = shift (@_);
601 my ($section) = @_;
602
603 my $section_ptr = $self->_lookup_section($section);
604 return 1 unless defined $section_ptr;
605
606 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
607}
608
609# methods for dealing with sections
610
611# returns the name of the inserted section
612sub insert_section {
613 my $self = shift (@_);
614 my ($before_section) = @_;
615
616 # get the child to insert before and its parent section
617 my $parent_section = "";
618 my $before_child = "0";
619 my @before_section = split (/\./, $before_section);
620 if (scalar(@before_section) > 0) {
621 $before_child = pop (@before_section);
622 $parent_section = join (".", @before_section);
623 }
624
625 my $parent_section_ptr = $self->_lookup_section($parent_section);
626 if (!defined $parent_section_ptr) {
627 print STDERR "doc::insert_section couldn't find parent section " .
628 "$parent_section\n";
629 return;
630 }
631
632 # get the next section number
633 my $section_num = $parent_section_ptr->{'next_subsection'}++;
634
635 my $i = 0;
636 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
637 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
638 $i++;
639 }
640
641 # insert the section number into the order list
642 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
643
644 # add this section to the parent section
645 my $section_ptr = {'subsection_order'=>[],
646 'next_subsection'=>1,
647 'subsections'=>{},
648 'metadata'=>[],
649 'text'=>""};
650 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
651
652 # work out the full section number
653 my $section = $parent_section;
654 $section .= "." unless $section eq "";
655 $section .= $section_num;
656
657 return $section;
658}
659
660# creates a pre-named section
661sub create_named_section {
662 my $self = shift (@_);
663 my ($mastersection) = @_;
664
665 my ($num);
666 my $section = $mastersection;
667 my $sectionref = $self;
668
669 while ($section ne "") {
670 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
671 $num =~ s/^0+(\d)/$1/; # remove leading 0s
672 $section = "" unless defined $section;
673
674 if (defined $num) {
675 if (!defined $sectionref->{'subsections'}->{$num}) {
676 push (@{$sectionref->{'subsection_order'}}, $num);
677 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
678 'next_subsection'=>1,
679 'subsections'=>{},
680 'metadata'=>[],
681 'text'=>""};
682 if ($num >= $sectionref->{'next_subsection'}) {
683 $sectionref->{'next_subsection'} = $num + 1;
684 }
685 }
686 $sectionref = $sectionref->{'subsections'}->{$num};
687
688 } else {
689 print STDERR "doc::create_named_section couldn't create section ";
690 print STDERR "$mastersection\n";
691 last;
692 }
693 }
694}
695
696# returns a reference to a list of subsections
697sub list_subsections {
698 my $self = shift (@_);
699 my ($section) = @_;
700
701 my $section_ptr = $self->_lookup_section ($section);
702 if (!defined $section_ptr) {
703 print STDERR "doc::list_subsections couldn't find section $section\n";
704 return [];
705 }
706
707 return [@{$section_ptr->{'subsection_order'}}];
708}
709
710sub delete_section {
711 my $self = shift (@_);
712 my ($section) = @_;
713
714# my $section_ptr = {'subsection_order'=>[],
715# 'next_subsection'=>1,
716# 'subsections'=>{},
717# 'metadata'=>[],
718# 'text'=>""};
719
720 # if this is the top section reset everything
721 if ($section eq "") {
722 $self->{'subsection_order'} = [];
723 $self->{'subsections'} = {};
724 $self->{'metadata'} = [];
725 $self->{'text'} = "";
726 return;
727 }
728
729 # find the parent of the section to delete
730 my $parent_section = "";
731 my $child = "0";
732 my @section = split (/\./, $section);
733 if (scalar(@section) > 0) {
734 $child = pop (@section);
735 $parent_section = join (".", @section);
736 }
737
738 my $parent_section_ptr = $self->_lookup_section($parent_section);
739 if (!defined $parent_section_ptr) {
740 print STDERR "doc::delete_section couldn't find parent section " .
741 "$parent_section\n";
742 return;
743 }
744
745 # remove this section from the subsection_order list
746 my $i = 0;
747 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
748 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
749 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
750 last;
751 }
752 $i++;
753 }
754
755 # remove this section from the subsection hash
756 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
757 undef $parent_section_ptr->{'subsections'}->{$child};
758 }
759}
760
761#--
762# methods for dealing with metadata
763
764# set_metadata_element and get_metadata_element are for metadata
765# which should only have one value. add_meta_data and get_metadata
766# are for metadata which can have more than one value.
767
768# returns the first metadata value which matches field
769
770# This version of get metadata element works much like the one above,
771# except it allows for the namespace portion of a metadata element to
772# be ignored, thus if you are searching for dc.Title, the first piece
773# of matching metadata ending with the name Title (once any namespace
774# is removed) would be returned.
775# 28-11-2003 John Thompson
776sub get_metadata_element {
777 my $self = shift (@_);
778 my ($section, $field, $ignore_namespace) = @_;
779 my ($data);
780
781 $ignore_namespace = 0 unless defined $ignore_namespace;
782
783 my $section_ptr = $self->_lookup_section($section);
784 if (!defined $section_ptr) {
785 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
786 return;
787 }
788
789 # Remove the namespace if we are being told to ignore them
790 if($ignore_namespace) {
791 $field =~ s/^\w*\.//;
792 }
793
794 foreach $data (@{$section_ptr->{'metadata'}}) {
795
796 my $data_name = $data->[0];
797
798 # Remove the any namespace if we are being told to ignore them
799 if($ignore_namespace) {
800 $data_name =~ s/^\w*\.//;
801 }
802 $data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
803 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
804 }
805
806 return undef; # was not found
807}
808
809# returns a list of the form [value1, value2, ...]
810sub get_metadata {
811 my $self = shift (@_);
812 my ($section, $field, $ignore_namespace) = @_;
813 my ($data);
814
815 $ignore_namespace = 0 unless defined $ignore_namespace;
816
817 my $section_ptr = $self->_lookup_section($section);
818 if (!defined $section_ptr) {
819 print STDERR "doc::get_metadata couldn't find section ",
820 $section, "\n";
821 return;
822 }
823
824 # Remove the any namespace if we are being told to ignore them
825 if($ignore_namespace) {
826 $field =~ s/^\w*\.//;
827 }
828
829 my @metadata = ();
830 foreach $data (@{$section_ptr->{'metadata'}}) {
831
832 my $data_name = $data->[0];
833 # Remove the any namespace if we are being told to ignore them
834 if($ignore_namespace) {
835 $data_name =~ s/^\w*\.//;
836 }
837 $data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
838 push (@metadata, $data->[1]) if ($data_name eq $field);
839 }
840
841 return \@metadata;
842}
843
844sub get_metadata_hashmap {
845 my $self = shift (@_);
846 my ($section, $opt_namespace) = @_;
847
848 my $section_ptr = $self->_lookup_section($section);
849 if (!defined $section_ptr) {
850 print STDERR "doc::get_metadata couldn't find section ",
851 $section, "\n";
852 return;
853 }
854
855 my $metadata_hashmap = {};
856 foreach my $data (@{$section_ptr->{'metadata'}}) {
857 my $metaname = $data->[0];
858
859 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
860 if (!defined $metadata_hashmap->{$metaname}) {
861 $metadata_hashmap->{$metaname} = [];
862 }
863 my $metaval_list = $metadata_hashmap->{$metaname};
864 push(@$metaval_list, $data->[1]);
865 }
866 }
867
868 return $metadata_hashmap;
869}
870
871# returns a list of the form [[field,value],[field,value],...]
872sub get_all_metadata {
873 my $self = shift (@_);
874 my ($section) = @_;
875
876 my $section_ptr = $self->_lookup_section($section);
877 if (!defined $section_ptr) {
878 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
879 return;
880 }
881
882 return $section_ptr->{'metadata'};
883}
884
885# $value is optional
886sub delete_metadata {
887 my $self = shift (@_);
888 my ($section, $field, $value) = @_;
889
890 my $section_ptr = $self->_lookup_section($section);
891 if (!defined $section_ptr) {
892 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
893 return;
894 }
895
896 my $i = 0;
897 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
898 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
899 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
900 splice (@{$section_ptr->{'metadata'}}, $i, 1);
901 } else {
902 $i++;
903 }
904 }
905}
906
907sub delete_all_metadata {
908 my $self = shift (@_);
909 my ($section) = @_;
910
911 my $section_ptr = $self->_lookup_section($section);
912 if (!defined $section_ptr) {
913 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
914 return;
915 }
916
917 $section_ptr->{'metadata'} = [];
918}
919
920sub set_metadata_element {
921 my $self = shift (@_);
922 my ($section, $field, $value) = @_;
923
924 $self->set_utf8_metadata_element ($section, $field,
925 &unicode::ascii2utf8(\$value));
926}
927
928# set_utf8_metadata_element assumes the text has already been
929# converted to the UTF-8 encoding.
930sub set_utf8_metadata_element {
931 my $self = shift (@_);
932 my ($section, $field, $value) = @_;
933
934 $self->delete_metadata ($section, $field);
935 $self->add_utf8_metadata ($section, $field, $value);
936}
937
938
939# add_metadata assumes the text is in (extended) ascii form. For
940# text which has already been converted to the UTF-8 format use
941# add_utf8_metadata.
942sub add_metadata {
943 my $self = shift (@_);
944 my ($section, $field, $value) = @_;
945
946 $self->add_utf8_metadata ($section, $field,
947 &unicode::ascii2utf8(\$value));
948}
949
950sub add_utf8_metadata {
951 my $self = shift (@_);
952 my ($section, $field, $value) = @_;
953
954# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
955# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
956# print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
957
958 my $section_ptr = $self->_lookup_section($section);
959 if (!defined $section_ptr) {
960 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
961 return;
962 }
963 if (!defined $value) {
964 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
965 return;
966 }
967 if (!defined $field) {
968 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
969 return;
970 }
971
972 #print STDERR "###$field=$value\n";
973
974 # For now, supress this check. Given that text data read in is now
975 # Unicode aware, then the following block of code can (ironically enough)
976 # cause our unicode compliant string to be re-encoded (leading to
977 # a double-encoded UTF-8 string, which we definitely don't want!).
978
979
980 # double check that the value is utf-8
981# if (!&unicode::check_is_utf8($value)) {
982# print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
983# &unicode::ensure_utf8(\$value);
984# print STDERR " Tried converting to utf8: $value\n";
985# }
986
987 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
988}
989
990
991# methods for dealing with text
992
993# returns the text for a section
994sub get_text {
995 my $self = shift (@_);
996 my ($section) = @_;
997
998 my $section_ptr = $self->_lookup_section($section);
999 if (!defined $section_ptr) {
1000 print STDERR "doc::get_text couldn't find section " .
1001 "$section\n";
1002 return "";
1003 }
1004
1005 return $section_ptr->{'text'};
1006}
1007
1008# returns the (utf-8 encoded) length of the text for a section
1009sub get_text_length {
1010 my $self = shift (@_);
1011 my ($section) = @_;
1012
1013 my $section_ptr = $self->_lookup_section($section);
1014 if (!defined $section_ptr) {
1015 print STDERR "doc::get_text_length couldn't find section " .
1016 "$section\n";
1017 return 0;
1018 }
1019
1020 return length ($section_ptr->{'text'});
1021}
1022
1023# returns the total length for all the sections
1024sub get_total_text_length {
1025 my $self = shift (@_);
1026
1027 my $section = $self->get_top_section();
1028 my $length = 0;
1029 while (defined $section) {
1030 $length += $self->get_text_length($section);
1031 $section = $self->get_next_section($section);
1032 }
1033 return $length;
1034}
1035
1036sub delete_text {
1037 my $self = shift (@_);
1038 my ($section) = @_;
1039
1040 my $section_ptr = $self->_lookup_section($section);
1041 if (!defined $section_ptr) {
1042 print STDERR "doc::delete_text couldn't find section " .
1043 "$section\n";
1044 return;
1045 }
1046
1047 $section_ptr->{'text'} = "";
1048}
1049
1050# add_text assumes the text is in (extended) ascii form. For
1051# text which has been already converted to the UTF-8 format
1052# use add_utf8_text.
1053sub add_text {
1054 my $self = shift (@_);
1055 my ($section, $text) = @_;
1056
1057 # convert the text to UTF-8 encoded unicode characters
1058 # and add the text
1059 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1060}
1061
1062
1063# add_utf8_text assumes the text to be added has already
1064# been converted to the UTF-8 encoding. For ascii text use
1065# add_text
1066sub add_utf8_text {
1067 my $self = shift (@_);
1068 my ($section, $text) = @_;
1069
1070 my $section_ptr = $self->_lookup_section($section);
1071 if (!defined $section_ptr) {
1072 print STDERR "doc::add_utf8_text couldn't find section " .
1073 "$section\n";
1074 return;
1075 }
1076
1077 $section_ptr->{'text'} .= $text;
1078}
1079
1080# returns the Source meta, which is the utf8 filename generated.
1081# Added a separate method here for convenience
1082sub get_source {
1083 my $self = shift (@_);
1084 return $self->get_metadata_element ($self->get_top_section(), "Source");
1085}
1086
1087# returns the SourceFile meta, which is the url reference to the URL-encoded
1088# version of Source (the utf8 filename). Added a separate method here for convenience
1089sub get_sourcefile {
1090 my $self = shift (@_);
1091 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1092}
1093
1094# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1095# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1096sub get_assocfile_from_sourcefile {
1097 my $self = shift (@_);
1098
1099 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1100 my $top_section = $self->get_top_section();
1101 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1102
1103 # get the actual filename as it exists on the filesystem which this url refers to
1104 $source_file = &unicode::url_to_filename($source_file);
1105 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1106 return $assocfilename;
1107}
1108
1109# methods for dealing with associated files
1110
1111# a file is associated with a document, NOT a section.
1112# if section is defined it is noted in the data structure
1113# only so that files associated from a particular section
1114# may be removed later (using delete_section_assoc_files)
1115sub associate_file {
1116 my $self = shift (@_);
1117 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1118 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1119
1120 # remove all associated files with the same name
1121 $self->delete_assoc_file ($assoc_filename);
1122
1123 # Too harsh a requirement
1124 # Definitely get HTML docs, for example, with some missing
1125 # support files
1126# if (!&util::fd_exists($real_filename)) {
1127# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1128# exit -1;
1129# }
1130
1131# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1132# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1133## my $utf8_filename = Encode::encode("utf8",$filename);
1134
1135 push (@{$self->{'associated_files'}},
1136 [$real_filename, $assoc_filename, $mime_type, $section]);
1137}
1138
1139# returns a list of associated files in the form
1140# [[real_filename, assoc_filename, mimetype], ...]
1141sub get_assoc_files {
1142 my $self = shift (@_);
1143
1144 return $self->{'associated_files'};
1145}
1146
1147# the following two methods used to keep track of original associated files
1148# for incremental building. eg a txt file used by an item file does not end
1149# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1150# file for incremental build
1151sub associate_source_file {
1152 my $self = shift (@_);
1153 my ($full_filename) = @_;
1154
1155 push (@{$self->{'source_assoc_files'}}, $full_filename);
1156
1157}
1158
1159sub get_source_assoc_files {
1160 my $self = shift (@_);
1161
1162 return $self->{'source_assoc_files'};
1163
1164
1165}
1166sub metadata_file {
1167 my $self = shift (@_);
1168 my ($real_filename, $filename) = @_;
1169
1170 push (@{$self->{'metadata_files'}},
1171 [$real_filename, $filename]);
1172}
1173
1174# used for writing out the archiveinf-doc info database, to list all the metadata files
1175sub get_meta_files {
1176 my $self = shift (@_);
1177
1178 return $self->{'metadata_files'};
1179}
1180
1181sub delete_section_assoc_files {
1182 my $self = shift (@_);
1183 my ($section) = @_;
1184
1185 my $i=0;
1186 while ($i < scalar (@{$self->{'associated_files'}})) {
1187 if (defined $self->{'associated_files'}->[$i]->[3] &&
1188 $self->{'associated_files'}->[$i]->[3] eq $section) {
1189 splice (@{$self->{'associated_files'}}, $i, 1);
1190 } else {
1191 $i++;
1192 }
1193 }
1194}
1195
1196sub delete_assoc_file {
1197 my $self = shift (@_);
1198 my ($assoc_filename) = @_;
1199
1200 my $i=0;
1201 while ($i < scalar (@{$self->{'associated_files'}})) {
1202 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1203 splice (@{$self->{'associated_files'}}, $i, 1);
1204 } else {
1205 $i++;
1206 }
1207 }
1208}
1209
1210sub reset_nextsection_ptr {
1211 my $self = shift (@_);
1212 my ($section) = @_;
1213
1214 my $section_ptr = $self->_lookup_section($section);
1215 $section_ptr->{'next_subsection'} = 1;
1216}
1217
12181;
Note: See TracBrowser for help on using the repository browser.