source: main/trunk/greenstone2/perllib/doc.pm@ 21862

Last change on this file since 21862 was 21862, checked in by davidb, 14 years ago

Documents now have 'lastmodifieddate' added as metadata in yyyymmdd format

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 29.3 KB
RevLine 
[537]1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
[8894]10# This program is free software; you can redistr te it and/or modify
[537]11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[1241]26# base class to hold documents
[4]27
28package doc;
[3834]29eval {require bytes};
[4]30
[832]31BEGIN {
[1241]32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
[832]34}
35
[15894]36use strict;
[1241]37use unicode;
38use util;
39use ghtml;
[8220]40use File::stat;
[1241]41##use hashdoc;
[13172]42use docprint;
[1241]43
[4]44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
[18528]47our $OIDcount = 0;
[2327]48
[20412]49# rename_method can be 'url', 'none', 'base64'
[4]50sub new {
51 my $class = shift (@_);
[18319]52 my ($source_filename, $doc_type, $rename_method) = @_;
[1374]53
[13770]54
[1241]55 my $self = bless {'associated_files'=>[],
56 'subsection_order'=>[],
57 'next_subsection'=>1,
58 'subsections'=>{},
59 'metadata'=>[],
[2327]60 'text'=>"",
61 'OIDtype'=>"hash"}, $class;
[4]62
[10217]63 # used to set lastmodified here, but this can screw up the HASH ids, so
64 # the docsave processor now calls set_lastmodified
[13770]65
66 $self->{'source_path'} = $source_filename;
[10217]67
[7929]68 if (defined $source_filename) {
[15874]69 $source_filename = &util::filename_within_collection($source_filename);
[18508]70 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
[18319]71 $self->set_source_filename ($source_filename, $rename_method);
[7929]72 }
73
[1374]74 $self->set_doc_type ($doc_type) if defined $doc_type;
[485]75
[4]76 return $self;
77}
[10217]78# set lastmodified for OAI purposes, added by GRB, moved by kjdon
79sub set_lastmodified {
80 my $self = shift (@_);
[4]81
[13770]82 my $source_path = $self->{'source_path'};
83
84 if (defined $source_path && (-e $source_path)) {
85
86 my $file_stat = stat($source_path);
[10217]87 my $mtime = $file_stat->mtime;
[21862]88 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
89 $wday, $yday, $isdst) = localtime($mtime);
90
91 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
92
[10217]93 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
[21862]94 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
[10217]95 }
96}
97
[1241]98# clone the $self object
99sub duplicate {
100 my $self = shift (@_);
101
102 my $newobj = {};
103
[8716]104 foreach my $k (keys %$self) {
[1241]105 $newobj->{$k} = &clone ($self->{$k});
106 }
107
108 bless $newobj, ref($self);
109 return $newobj;
110}
111
112sub clone {
113 my ($from) = @_;
114 my $type = ref ($from);
115
116 if ($type eq "HASH") {
117 my $to = {};
[8716]118 foreach my $key (keys %$from) {
[1241]119 $to->{$key} = &clone ($from->{$key});
120 }
121 return $to;
122 } elsif ($type eq "ARRAY") {
123 my $to = [];
[8716]124 foreach my $v (@$from) {
[1241]125 push (@$to, &clone ($v));
126 }
127 return $to;
128 } else {
129 return $from;
130 }
131}
132
[2327]133sub set_OIDtype {
134 my $self = shift (@_);
[12268]135 my ($type, $metadata) = @_;
[1241]136
[17025]137 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|dirname|assigned)$/) {
[2327]138 $self->{'OIDtype'} = $type;
139 } else {
140 $self->{'OIDtype'} = "hash";
141 }
[16792]142
[12268]143 if ($type =~ /^assigned$/) {
144 if (defined $metadata) {
145 $self->{'OIDmetadata'} = $metadata;
146 } else {
147 $self->{'OIDmetadata'} = "dc.Identifier";
148 }
149 }
[2327]150}
151
[20412]152# rename_method can be 'url', 'none', 'base64'
[1241]153sub set_source_filename {
154 my $self = shift (@_);
[18319]155 my ($source_filename, $rename_method) = @_;
[1241]156
[16578]157 # Since the gsdlsourcefilename element goes into the doc.xml it has
158 # to be utf8. However, it should also *represent* the source filename
159 # (in the import directory) which may not be utf8 at all.
[18319]160 # For instance, if this meta element (gsdlsourcefilename) will be used
161 # by other applications that parse doc.xml in order to locate
162 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
163 # encode the real filename as this is a binary-to-text encoding meaning
164 # that the resulting string is ASCII (utf8). Decoding will give the original.
[16578]165
[18319]166# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
167
[16670]168 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
169 my ($srcfilename,$dirname,$suffix)
170 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
[16792]171# print STDERR "-> $srcfilename -> ";
[18319]172 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
[16670]173 $source_filename = &util::filename_cat($dirname, $srcfilename);
[16792]174# print STDERR "$source_filename\n";
[16670]175
[16578]176 $self->set_utf8_metadata_element ($self->get_top_section(),
[1241]177 "gsdlsourcefilename",
178 $source_filename);
179}
180
[7569]181sub set_converted_filename {
182 my $self = shift (@_);
183 my ($converted_filename) = @_;
184
[16578]185 # we know the converted filename is utf8
186 $self->set_utf8_metadata_element ($self->get_top_section(),
[7569]187 "gsdlconvertedfilename",
188 $converted_filename);
189}
190
[1241]191# returns the source_filename as it was provided
[19829]192sub get_unmodified_source_filename {
193 my $self = shift (@_);
194
195 return $self->{'source_path'};
196}
197
198# returns the source_filename with whatever rename_method was given
[1241]199sub get_source_filename {
200 my $self = shift (@_);
201
202 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
203}
204
[19829]205
206
[7569]207# returns converted filename if available else returns source filename
208sub get_filename_for_hashing {
209 my $self = shift (@_);
210
211 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
212
213 if (!defined $filename) {
[11097]214 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
[10980]215 # if NULPlug processed file, then don't give a filename
[11097]216 if (defined $plugin_name && $plugin_name eq "NULPlug") {
[10980]217 $filename = undef;
[16670]218 } else { # returns the URL encoded source filename!
[10980]219 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
220 }
[7569]221 }
222 return $filename;
223}
224
[1241]225sub set_doc_type {
226 my $self = shift (@_);
227 my ($doc_type) = @_;
228
229 $self->set_metadata_element ($self->get_top_section(),
230 "gsdldoctype",
231 $doc_type);
232}
233
[10217]234# returns the gsdldoctype as it was provided
[1241]235# the default of "indexed_doc" is used if no document
236# type was provided
237sub get_doc_type {
238 my $self = shift (@_);
239
240 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
241 return $doc_type if (defined $doc_type);
242 return "indexed_doc";
243}
244
245
246# look up the reference to the a particular section
247sub _lookup_section {
248 my $self = shift (@_);
249 my ($section) = @_;
250
251 my ($num);
252 my $sectionref = $self;
253
254 while (defined $section && $section ne "") {
[12327]255
[1241]256 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
[12327]257
258 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
259
[1241]260 $section = "" unless defined $section;
261
[12327]262
[1241]263 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
264 $sectionref = $sectionref->{'subsections'}->{$num};
265 } else {
266 return undef;
267 }
268 }
269
270 return $sectionref;
271}
272
[2327]273# calculate OID by hashing the contents of the document
[1241]274sub _calc_OID {
275 my $self = shift (@_);
276 my ($filename) = @_;
277
278 my $osexe = &util::get_os_exe();
279
280 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
281 $ENV{'GSDLOS'},"hashfile$osexe");
[8504]282
[1241]283 my $result = "NULL";
[1679]284
[1241]285 if (-e "$hashfile_exe") {
[1679]286# $result = `\"$hashfile_exe\" \"$filename\"`;
[21862]287# $result = `hashfile$osexe \"$filename\" 2>&1`;
[1679]288 $result = `hashfile$osexe \"$filename\"`;
[21862]289
[1241]290 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
291 } else {
292 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
293 }
294 return "HASH$result";
295}
296
297# methods dealing with OID, not groups of them.
298
[2327]299# if $OID is not provided one is calculated
[1241]300sub set_OID {
301 my $self = shift (@_);
302 my ($OID) = @_;
[8504]303
[8797]304 my $use_hash_oid = 0;
[17057]305 # if an OID wasn't provided calculate one
[1241]306 if (!defined $OID) {
307 $OID = "NULL";
[17025]308 if ($self->{'OIDtype'} =~ /^hash/) {
[8797]309 $use_hash_oid = 1;
310 } elsif ($self->{'OIDtype'} eq "incremental") {
[2327]311 $OID = "D" . $OIDcount;
312 $OIDcount ++;
[8716]313
314 } elsif ($self->{'OIDtype'} eq "dirname") {
315 $OID = 'J';
316 my $filename = $self->get_source_filename();
[8797]317 if (defined($filename)) { # && -e $filename) {
[8716]318 $OID = &File::Basename::dirname($filename);
319 if (defined $OID) {
320 $OID = 'J'.&File::Basename::basename($OID);
[19617]321 $OID = &util::tidy_up_oid($OID);
[8716]322 } else {
[8797]323 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
324 $use_hash_oid = 1;
[8716]325 }
326 } else {
[8797]327 print STDERR "Failed to find filename, generating hash id\n";
328 $use_hash_oid = 1;
[8716]329 }
[2327]330
[8797]331 } elsif ($self->{'OIDtype'} eq "assigned") {
[12268]332 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
[8797]333 if (defined $identifier && $identifier ne "") {
[17025]334 $OID = $identifier;
[19617]335 $OID = &util::tidy_up_oid($OID);
[8797]336 } else {
337 # need a hash id
[12268]338 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
[8797]339 $use_hash_oid = 1;
340 }
341
[2327]342 } else {
[8797]343 $use_hash_oid = 1;
344 }
345
346 if ($use_hash_oid) {
[17025]347 my $hash_on_file = 1;
348 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
349 $hash_on_file = 0;
350 }
351 if ($hash_on_file) {
352 # "hash" OID - feed file to hashfile.exe
353 my $filename = $self->get_filename_for_hashing();
354 # -z: don't want to hash on the file if it is zero size
355 if (defined($filename) && -e $filename && !-z $filename) {
356 $OID = $self->_calc_OID ($filename);
357 } else {
358 $hash_on_file = 0;
359 }
360 }
361 if (!$hash_on_file) {
362 my $filename = &util::get_tmp_filename();
[2327]363 if (!open (OUTFILE, ">$filename")) {
364 print STDERR "doc::set_OID could not write to $filename\n";
365 } else {
[13172]366 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
367 print OUTFILE $doc_text;
[2327]368 close (OUTFILE);
369 }
370 $OID = $self->_calc_OID ($filename);
371 &util::rm ($filename);
[1374]372 }
[1241]373 }
374 }
375 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
376}
377
378# this uses hashdoc (embedded c thingy) which is faster but still
379# needs a little work to be suffiently stable
380sub ___set_OID {
381 my $self = shift (@_);
382 my ($OID) = @_;
383
384 # if an OID wasn't provided then calculate hash value based on document
385 if (!defined $OID)
386 {
[13172]387 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
[1241]388 my $hash_len = length($hash_text);
389
390 $OID = &hashdoc::buffer($hash_text,$hash_len);
391 }
392
393 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
394}
395
396# returns the OID for this document
397sub get_OID {
398 my $self = shift (@_);
399 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
400 return $OID if (defined $OID);
401 return "NULL";
402}
403
404sub delete_OID {
405 my $self = shift (@_);
406
407 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
408}
409
410
411# methods for manipulating section names
412
413# returns the name of the top-most section (the top
414# level of the document
415sub get_top_section {
416 my $self = shift (@_);
417
418 return "";
419}
420
421# returns a section
422sub get_parent_section {
423 my $self = shift (@_);
424 my ($section) = @_;
425
426 $section =~ s/(^|\.)\d+$//;
427
428 return $section;
429}
430
431# returns the first child section (or the end child
432# if there isn't any)
433sub get_begin_child {
434 my $self = shift (@_);
435 my ($section) = @_;
436
437 my $section_ptr = $self->_lookup_section($section);
438 return "" unless defined $section_ptr;
439
440 if (defined $section_ptr->{'subsection_order'}->[0]) {
441 return "$section.$section_ptr->{'subsection_order'}->[0]";
442 }
443
444 return $self->get_end_child ($section);
445}
446
447# returns the next child of a parent section
448sub get_next_child {
449 my $self = shift (@_);
450 my ($section) = @_;
451
452 my $parent_section = $self->get_parent_section($section);
453 my $parent_section_ptr = $self->_lookup_section($parent_section);
454 return undef unless defined $parent_section_ptr;
455
456 my ($section_num) = $section =~ /(\d+)$/;
457 return undef unless defined $section_num;
458
459 my $i = 0;
460 my $section_order = $parent_section_ptr->{'subsection_order'};
461 while ($i < scalar(@$section_order)) {
462 last if $section_order->[$i] eq $section_num;
463 $i++;
464 }
465
466 $i++; # the next child
467 if ($i < scalar(@$section_order)) {
468 return $section_order->[$i] if $parent_section eq "";
469 return "$parent_section.$section_order->[$i]";
470 }
471
472 # no more sections in this level
473 return undef;
474}
475
476# returns a reference to a list of children
477sub get_children {
478 my $self = shift (@_);
479 my ($section) = @_;
480
481 my $section_ptr = $self->_lookup_section($section);
482 return [] unless defined $section_ptr;
483
484 my @children = @{$section_ptr->{'subsection_order'}};
485
486 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
487 return \@children;
488}
489
490# returns the child section one past the last one (which
491# is coded as "0")
492sub get_end_child {
493 my $self = shift (@_);
494 my ($section) = @_;
495
496 return $section . ".0" unless $section eq "";
497 return "0";
498}
499
500# returns the next section in book order
501sub get_next_section {
502 my $self = shift (@_);
503 my ($section) = @_;
504
505 return undef unless defined $section;
506
507 my $section_ptr = $self->_lookup_section($section);
508 return undef unless defined $section_ptr;
509
510 # first try to find first child
511 if (defined $section_ptr->{'subsection_order'}->[0]) {
512 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
513 return "$section.$section_ptr->{'subsection_order'}->[0]";
514 }
515
516 do {
517 # try to find sibling
518 my $next_child = $self->get_next_child ($section);
519 return $next_child if (defined $next_child);
520
521 # move up one level
522 $section = $self->get_parent_section ($section);
523 } while $section =~ /\d/;
524
525 return undef;
526}
527
528sub is_leaf_section {
529 my $self = shift (@_);
530 my ($section) = @_;
531
532 my $section_ptr = $self->_lookup_section($section);
533 return 1 unless defined $section_ptr;
534
535 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
536}
537
538# methods for dealing with sections
539
540# returns the name of the inserted section
541sub insert_section {
542 my $self = shift (@_);
543 my ($before_section) = @_;
544
545 # get the child to insert before and its parent section
546 my $parent_section = "";
547 my $before_child = "0";
548 my @before_section = split (/\./, $before_section);
549 if (scalar(@before_section) > 0) {
550 $before_child = pop (@before_section);
551 $parent_section = join (".", @before_section);
552 }
553
554 my $parent_section_ptr = $self->_lookup_section($parent_section);
555 if (!defined $parent_section_ptr) {
556 print STDERR "doc::insert_section couldn't find parent section " .
557 "$parent_section\n";
558 return;
559 }
560
561 # get the next section number
562 my $section_num = $parent_section_ptr->{'next_subsection'}++;
563
564 my $i = 0;
565 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
566 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
567 $i++;
568 }
569
570 # insert the section number into the order list
571 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
572
573 # add this section to the parent section
574 my $section_ptr = {'subsection_order'=>[],
575 'next_subsection'=>1,
576 'subsections'=>{},
577 'metadata'=>[],
578 'text'=>""};
579 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
580
581 # work out the full section number
582 my $section = $parent_section;
583 $section .= "." unless $section eq "";
584 $section .= $section_num;
585
586 return $section;
587}
588
589# creates a pre-named section
590sub create_named_section {
591 my $self = shift (@_);
592 my ($mastersection) = @_;
593
594 my ($num);
595 my $section = $mastersection;
596 my $sectionref = $self;
597
598 while ($section ne "") {
599 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
600 $num =~ s/^0+(\d)/$1/; # remove leading 0s
601 $section = "" unless defined $section;
602
603 if (defined $num) {
604 if (!defined $sectionref->{'subsections'}->{$num}) {
605 push (@{$sectionref->{'subsection_order'}}, $num);
606 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
607 'next_subsection'=>1,
608 'subsections'=>{},
609 'metadata'=>[],
610 'text'=>""};
611 if ($num >= $sectionref->{'next_subsection'}) {
612 $sectionref->{'next_subsection'} = $num + 1;
613 }
614 }
615 $sectionref = $sectionref->{'subsections'}->{$num};
616
617 } else {
618 print STDERR "doc::create_named_section couldn't create section ";
619 print STDERR "$mastersection\n";
620 last;
621 }
622 }
623}
624
625# returns a reference to a list of subsections
626sub list_subsections {
627 my $self = shift (@_);
628 my ($section) = @_;
629
630 my $section_ptr = $self->_lookup_section ($section);
631 if (!defined $section_ptr) {
632 print STDERR "doc::list_subsections couldn't find section $section\n";
633 return [];
634 }
635
636 return [@{$section_ptr->{'subsection_order'}}];
637}
638
639sub delete_section {
640 my $self = shift (@_);
641 my ($section) = @_;
642
643# my $section_ptr = {'subsection_order'=>[],
644# 'next_subsection'=>1,
645# 'subsections'=>{},
646# 'metadata'=>[],
647# 'text'=>""};
648
649 # if this is the top section reset everything
650 if ($section eq "") {
651 $self->{'subsection_order'} = [];
652 $self->{'subsections'} = {};
653 $self->{'metadata'} = [];
654 $self->{'text'} = "";
655 return;
656 }
657
658 # find the parent of the section to delete
659 my $parent_section = "";
660 my $child = "0";
661 my @section = split (/\./, $section);
662 if (scalar(@section) > 0) {
663 $child = pop (@section);
664 $parent_section = join (".", @section);
665 }
666
667 my $parent_section_ptr = $self->_lookup_section($parent_section);
668 if (!defined $parent_section_ptr) {
669 print STDERR "doc::delete_section couldn't find parent section " .
670 "$parent_section\n";
671 return;
672 }
673
674 # remove this section from the subsection_order list
675 my $i = 0;
676 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
677 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
678 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
679 last;
680 }
681 $i++;
682 }
683
684 # remove this section from the subsection hash
685 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
686 undef $parent_section_ptr->{'subsections'}->{$child};
687 }
688}
689
690#--
[4]691# methods for dealing with metadata
692
693# set_metadata_element and get_metadata_element are for metadata
694# which should only have one value. add_meta_data and get_metadata
695# are for metadata which can have more than one value.
696
[1241]697# returns the first metadata value which matches field
[6111]698
699# This version of get metadata element works much like the one above,
700# except it allows for the namespace portion of a metadata element to
701# be ignored, thus if you are searching for dc.Title, the first piece
702# of matching metadata ending with the name Title (once any namespace
703# is removed) would be returned.
704# 28-11-2003 John Thompson
[1241]705sub get_metadata_element {
706 my $self = shift (@_);
[6111]707 my ($section, $field, $ignore_namespace) = @_;
[1241]708 my ($data);
709
[6111]710 $ignore_namespace = 0 unless defined $ignore_namespace;
711
[1241]712 my $section_ptr = $self->_lookup_section($section);
713 if (!defined $section_ptr) {
[8716]714 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
[1241]715 return;
716 }
717
[20417]718 # Remove the namespace if we are being told to ignore them
[6111]719 if($ignore_namespace) {
720 $field =~ s/^\w*\.//;
721 }
722
[1241]723 foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]724
725 my $data_name = $data->[0];
[14966]726
[6111]727 # Remove the any namespace if we are being told to ignore them
728 if($ignore_namespace) {
729 $data_name =~ s/^\w*\.//;
730 }
731
732 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
[1241]733 }
734
735 return undef; # was not found
736}
737
738# returns a list of the form [value1, value2, ...]
739sub get_metadata {
740 my $self = shift (@_);
[6111]741 my ($section, $field, $ignore_namespace) = @_;
[1241]742 my ($data);
743
[6111]744 $ignore_namespace = 0 unless defined $ignore_namespace;
745
[1241]746 my $section_ptr = $self->_lookup_section($section);
747 if (!defined $section_ptr) {
[8716]748 print STDERR "doc::get_metadata couldn't find section ",
749 $section, "\n";
[1241]750 return;
751 }
752
[6111]753 # Remove the any namespace if we are being told to ignore them
754 if($ignore_namespace) {
755 $field =~ s/^\w*\.//;
756 }
757
[1241]758 my @metadata = ();
759 foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]760
761 my $data_name = $data->[0];
762 # Remove the any namespace if we are being told to ignore them
763 if($ignore_namespace) {
764 $data_name =~ s/^\w*\.//;
765 }
766
767 push (@metadata, $data->[1]) if ($data_name eq $field);
[1241]768 }
[9241]769
[1241]770 return \@metadata;
771}
772
773# returns a list of the form [[field,value],[field,value],...]
774sub get_all_metadata {
775 my $self = shift (@_);
776 my ($section) = @_;
777
778 my $section_ptr = $self->_lookup_section($section);
779 if (!defined $section_ptr) {
[8716]780 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
[1241]781 return;
782 }
783
784 return $section_ptr->{'metadata'};
785}
786
787# $value is optional
788sub delete_metadata {
789 my $self = shift (@_);
790 my ($section, $field, $value) = @_;
791
792 my $section_ptr = $self->_lookup_section($section);
793 if (!defined $section_ptr) {
[8716]794 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
[1241]795 return;
796 }
797
798 my $i = 0;
799 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
800 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
801 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
802 splice (@{$section_ptr->{'metadata'}}, $i, 1);
803 } else {
804 $i++;
805 }
806 }
807}
808
809sub delete_all_metadata {
810 my $self = shift (@_);
811 my ($section) = @_;
812
813 my $section_ptr = $self->_lookup_section($section);
814 if (!defined $section_ptr) {
[8716]815 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
[1241]816 return;
817 }
818
819 $section_ptr->{'metadata'} = [];
820}
821
[4]822sub set_metadata_element {
823 my $self = shift (@_);
824 my ($section, $field, $value) = @_;
825
[97]826 $self->set_utf8_metadata_element ($section, $field,
[1870]827 &unicode::ascii2utf8(\$value));
[73]828}
829
830# set_utf8_metadata_element assumes the text has already been
831# converted to the UTF-8 encoding.
832sub set_utf8_metadata_element {
833 my $self = shift (@_);
834 my ($section, $field, $value) = @_;
835
[4]836 $self->delete_metadata ($section, $field);
[73]837 $self->add_utf8_metadata ($section, $field, $value);
[4]838}
839
840
[73]841# add_metadata assumes the text is in (extended) ascii form. For
[8220]842# text which has already been converted to the UTF-8 format use
[73]843# add_utf8_metadata.
[4]844sub add_metadata {
845 my $self = shift (@_);
846 my ($section, $field, $value) = @_;
847
[97]848 $self->add_utf8_metadata ($section, $field,
[1870]849 &unicode::ascii2utf8(\$value));
[73]850}
851
852sub add_utf8_metadata {
853 my $self = shift (@_);
854 my ($section, $field, $value) = @_;
855
[4]856 my $section_ptr = $self->_lookup_section($section);
857 if (!defined $section_ptr) {
[8716]858 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
[4]859 return;
860 }
[1732]861 if (!defined $value) {
862 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
863 return;
864 }
865 if (!defined $field) {
866 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
867 return;
868 }
[4]869
[8894]870 #print STDERR "###$field=$value\n";
[7798]871 # double check that the value is utf-8
[16578]872 if (!&unicode::check_is_utf8($value)) {
[16670]873 print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
[16578]874 &unicode::ensure_utf8(\$value);
[16670]875 print STDERR " Tried converting to utf8: $value\n";
[7798]876 }
877
[4]878 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
879}
880
881
882# methods for dealing with text
883
[1241]884# returns the text for a section
885sub get_text {
886 my $self = shift (@_);
887 my ($section) = @_;
888
889 my $section_ptr = $self->_lookup_section($section);
890 if (!defined $section_ptr) {
891 print STDERR "doc::get_text couldn't find section " .
892 "$section\n";
893 return "";
894 }
895
896 return $section_ptr->{'text'};
897}
898
899# returns the (utf-8 encoded) length of the text for a section
900sub get_text_length {
901 my $self = shift (@_);
902 my ($section) = @_;
903
904 my $section_ptr = $self->_lookup_section($section);
905 if (!defined $section_ptr) {
906 print STDERR "doc::get_text_length couldn't find section " .
907 "$section\n";
908 return 0;
909 }
910
911 return length ($section_ptr->{'text'});
912}
913
914sub delete_text {
915 my $self = shift (@_);
916 my ($section) = @_;
917
918 my $section_ptr = $self->_lookup_section($section);
919 if (!defined $section_ptr) {
920 print STDERR "doc::delete_text couldn't find section " .
921 "$section\n";
922 return;
923 }
924
925 $section_ptr->{'text'} = "";
926}
927
[73]928# add_text assumes the text is in (extended) ascii form. For
929# text which has been already converted to the UTF-8 format
930# use add_utf8_text.
[4]931sub add_text {
932 my $self = shift (@_);
933 my ($section, $text) = @_;
934
[73]935 # convert the text to UTF-8 encoded unicode characters
936 # and add the text
[1870]937 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
[73]938}
939
940
941# add_utf8_text assumes the text to be added has already
942# been converted to the UTF-8 encoding. For ascii text use
943# add_text
944sub add_utf8_text {
945 my $self = shift (@_);
946 my ($section, $text) = @_;
947
[4]948 my $section_ptr = $self->_lookup_section($section);
949 if (!defined $section_ptr) {
[73]950 print STDERR "doc::add_utf8_text couldn't find section " .
[4]951 "$section\n";
952 return;
953 }
954
955 $section_ptr->{'text'} .= $text;
956}
957
[16950]958# returns the Source meta, which is the utf8 filename generated.
959# Added a separate method here for convenience
960sub get_source {
[16924]961 my $self = shift (@_);
[16950]962 return $self->get_metadata_element ($self->get_top_section(), "Source");
963}
964
965# returns the SourceFile meta, which is the url reference to the URL-encoded
966# version of Source (the utf8 filename). Added a separate method here for convenience
967sub get_sourcefile {
968 my $self = shift (@_);
969 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
970}
971
972# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
973# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
974sub get_assocfile_from_sourcefile {
975 my $self = shift (@_);
[16924]976
977 # get the SourceFile meta, which is a *URL* to a file on the filesystem
978 my $top_section = $self->get_top_section();
979 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
[4]980
[16924]981 # get the actual filename as it exists on the filesystem which this url refers to
[16928]982 $source_file = &unicode::url_to_filename($source_file);
[16924]983 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
984 return $assocfilename;
985}
986
[1241]987# methods for dealing with associated files
988
989# a file is associated with a document, NOT a section.
990# if section is defined it is noted in the data structure
991# only so that files associated from a particular section
992# may be removed later (using delete_section_assoc_files)
993sub associate_file {
994 my $self = shift (@_);
995 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
996 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
997
998 # remove all associated files with the same name
999 $self->delete_assoc_file ($assoc_filename);
[8504]1000
[1241]1001 push (@{$self->{'associated_files'}},
1002 [$real_filename, $assoc_filename, $mime_type, $section]);
1003}
1004
1005# returns a list of associated files in the form
1006# [[real_filename, assoc_filename, mimetype], ...]
1007sub get_assoc_files {
1008 my $self = shift (@_);
1009
1010 return $self->{'associated_files'};
1011}
1012
[20775]1013# the following two methods used to keep track of original associated files
1014# for incremental building. eg a txt file used by an item file does not end
1015# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1016# file for incremental build
1017sub associate_source_file {
1018 my $self = shift (@_);
1019 my ($full_filename) = @_;
[19494]1020
[20775]1021 push (@{$self->{'source_assoc_files'}}, $full_filename);
1022
1023}
1024
1025sub get_source_assoc_files {
1026 my $self = shift (@_);
1027
1028 return $self->{'source_assoc_files'};
1029
1030
1031}
[19494]1032sub metadata_file {
1033 my $self = shift (@_);
1034 my ($real_filename, $filename) = @_;
1035
1036 push (@{$self->{'metadata_files'}},
1037 [$real_filename, $filename]);
1038}
1039
[21566]1040# used for writing out the archiveinf-doc info database, to list all the metadata files
[19494]1041sub get_meta_files {
1042 my $self = shift (@_);
1043
1044 return $self->{'metadata_files'};
1045}
1046
[1241]1047sub delete_section_assoc_files {
1048 my $self = shift (@_);
1049 my ($section) = @_;
1050
1051 my $i=0;
1052 while ($i < scalar (@{$self->{'associated_files'}})) {
1053 if (defined $self->{'associated_files'}->[$i]->[3] &&
1054 $self->{'associated_files'}->[$i]->[3] eq $section) {
1055 splice (@{$self->{'associated_files'}}, $i, 1);
1056 } else {
1057 $i++;
1058 }
1059 }
1060}
1061
1062sub delete_assoc_file {
1063 my $self = shift (@_);
1064 my ($assoc_filename) = @_;
1065
1066 my $i=0;
1067 while ($i < scalar (@{$self->{'associated_files'}})) {
1068 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1069 splice (@{$self->{'associated_files'}}, $i, 1);
1070 } else {
1071 $i++;
1072 }
1073 }
1074}
1075
1076sub reset_nextsection_ptr {
1077 my $self = shift (@_);
1078 my ($section) = @_;
1079
1080 my $section_ptr = $self->_lookup_section($section);
1081 $section_ptr->{'next_subsection'} = 1;
1082}
1083
[4]10841;
Note: See TracBrowser for help on using the repository browser.