source: gsdl/trunk/perllib/doc.pm@ 17025

Last change on this file since 17025 was 17025, checked in by kjdon, 16 years ago

modified set_OID to take hash_on_file and hash_on_gs_xml as oid types.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 27.8 KB
RevLine 
[537]1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
[8894]10# This program is free software; you can redistr te it and/or modify
[537]11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[1241]26# base class to hold documents
[4]27
28package doc;
[3834]29eval {require bytes};
[4]30
[832]31BEGIN {
[1241]32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
[832]34}
35
[15894]36use strict;
[1241]37use unicode;
38use util;
39use ghtml;
[8220]40use File::stat;
[1241]41##use hashdoc;
[13172]42use docprint;
[1241]43
[4]44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
[2327]47my $OIDcount = 0;
48
[4]49sub new {
50 my $class = shift (@_);
[1374]51 my ($source_filename, $doc_type) = @_;
52
[13770]53
[1241]54 my $self = bless {'associated_files'=>[],
55 'subsection_order'=>[],
56 'next_subsection'=>1,
57 'subsections'=>{},
58 'metadata'=>[],
[2327]59 'text'=>"",
60 'OIDtype'=>"hash"}, $class;
[4]61
[10217]62 # used to set lastmodified here, but this can screw up the HASH ids, so
63 # the docsave processor now calls set_lastmodified
[13770]64
65 $self->{'source_path'} = $source_filename;
[10217]66
[7929]67 if (defined $source_filename) {
[15874]68 $source_filename = &util::filename_within_collection($source_filename);
[7929]69 $self->set_source_filename ($source_filename);
70 }
71
[1374]72 $self->set_doc_type ($doc_type) if defined $doc_type;
[485]73
[4]74 return $self;
75}
[10217]76# set lastmodified for OAI purposes, added by GRB, moved by kjdon
77sub set_lastmodified {
78 my $self = shift (@_);
[4]79
[13770]80 my $source_path = $self->{'source_path'};
81
82 if (defined $source_path && (-e $source_path)) {
83
84 my $file_stat = stat($source_path);
[10217]85 my $mtime = $file_stat->mtime;
86 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
87 }
88}
89
[1241]90# clone the $self object
91sub duplicate {
92 my $self = shift (@_);
93
94 my $newobj = {};
95
[8716]96 foreach my $k (keys %$self) {
[1241]97 $newobj->{$k} = &clone ($self->{$k});
98 }
99
100 bless $newobj, ref($self);
101 return $newobj;
102}
103
104sub clone {
105 my ($from) = @_;
106 my $type = ref ($from);
107
108 if ($type eq "HASH") {
109 my $to = {};
[8716]110 foreach my $key (keys %$from) {
[1241]111 $to->{$key} = &clone ($from->{$key});
112 }
113 return $to;
114 } elsif ($type eq "ARRAY") {
115 my $to = [];
[8716]116 foreach my $v (@$from) {
[1241]117 push (@$to, &clone ($v));
118 }
119 return $to;
120 } else {
121 return $from;
122 }
123}
124
[2327]125sub set_OIDtype {
126 my $self = shift (@_);
[12268]127 my ($type, $metadata) = @_;
[1241]128
[17025]129 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|dirname|assigned)$/) {
[2327]130 $self->{'OIDtype'} = $type;
131 } else {
132 $self->{'OIDtype'} = "hash";
133 }
[16792]134
[12268]135 if ($type =~ /^assigned$/) {
136 if (defined $metadata) {
137 $self->{'OIDmetadata'} = $metadata;
138 } else {
139 $self->{'OIDmetadata'} = "dc.Identifier";
140 }
141 }
[2327]142}
143
[1241]144sub set_source_filename {
145 my $self = shift (@_);
146 my ($source_filename) = @_;
147
[16578]148 # Since the gsdlsourcefilename element goes into the doc.xml it has
149 # to be utf8. However, it should also *represent* the source filename
150 # (in the import directory) which may not be utf8 at all.
151 # For instance, if this meta element (gsdlsourcefilename) will be
152 # used by other applications that parse doc.xml in order to locate
[16670]153 # gsdlsourcefilename. Therefore, the solution is to URLencode the real
154 # filename as this is a binary-to-text encoding meaning that the
155 # resulting string is ASCII (utf8).
[16578]156
[16792]157# print STDERR "******URLencoding the gsdl_source_filename $source_filename ";
[16578]158
[16670]159 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
160 my ($srcfilename,$dirname,$suffix)
161 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
[16792]162# print STDERR "-> $srcfilename -> ";
[16670]163 $srcfilename = &unicode::url_encode($srcfilename.$suffix);
164 $source_filename = &util::filename_cat($dirname, $srcfilename);
[16792]165# print STDERR "$source_filename\n";
[16670]166
[16578]167 $self->set_utf8_metadata_element ($self->get_top_section(),
[1241]168 "gsdlsourcefilename",
169 $source_filename);
170}
171
[7569]172sub set_converted_filename {
173 my $self = shift (@_);
174 my ($converted_filename) = @_;
175
[16578]176 # we know the converted filename is utf8
177 $self->set_utf8_metadata_element ($self->get_top_section(),
[7569]178 "gsdlconvertedfilename",
179 $converted_filename);
180}
181
[1241]182# returns the source_filename as it was provided
183sub get_source_filename {
184 my $self = shift (@_);
185
186 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
187}
188
[7569]189# returns converted filename if available else returns source filename
190sub get_filename_for_hashing {
191 my $self = shift (@_);
192
193 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
194
195 if (!defined $filename) {
[11097]196 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
[10980]197 # if NULPlug processed file, then don't give a filename
[11097]198 if (defined $plugin_name && $plugin_name eq "NULPlug") {
[10980]199 $filename = undef;
[16670]200 } else { # returns the URL encoded source filename!
[10980]201 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
202 }
[7569]203 }
204 return $filename;
205}
206
[1241]207sub set_doc_type {
208 my $self = shift (@_);
209 my ($doc_type) = @_;
210
211 $self->set_metadata_element ($self->get_top_section(),
212 "gsdldoctype",
213 $doc_type);
214}
215
[10217]216# returns the gsdldoctype as it was provided
[1241]217# the default of "indexed_doc" is used if no document
218# type was provided
219sub get_doc_type {
220 my $self = shift (@_);
221
222 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
223 return $doc_type if (defined $doc_type);
224 return "indexed_doc";
225}
226
227
228# look up the reference to the a particular section
229sub _lookup_section {
230 my $self = shift (@_);
231 my ($section) = @_;
232
233 my ($num);
234 my $sectionref = $self;
235
236 while (defined $section && $section ne "") {
[12327]237
[1241]238 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
[12327]239
240 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
241
[1241]242 $section = "" unless defined $section;
243
[12327]244
[1241]245 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
246 $sectionref = $sectionref->{'subsections'}->{$num};
247 } else {
248 return undef;
249 }
250 }
251
252 return $sectionref;
253}
254
[2327]255# calculate OID by hashing the contents of the document
[1241]256sub _calc_OID {
257 my $self = shift (@_);
258 my ($filename) = @_;
259
260 my $osexe = &util::get_os_exe();
261
262 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
263 $ENV{'GSDLOS'},"hashfile$osexe");
[8504]264
[1241]265 my $result = "NULL";
[1679]266
[1241]267 if (-e "$hashfile_exe") {
[1679]268# $result = `\"$hashfile_exe\" \"$filename\"`;
269 $result = `hashfile$osexe \"$filename\"`;
[1241]270 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
271 } else {
272 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
273 }
274 return "HASH$result";
275}
276
277# methods dealing with OID, not groups of them.
278
[2327]279# if $OID is not provided one is calculated
[1241]280sub set_OID {
281 my $self = shift (@_);
282 my ($OID) = @_;
[8504]283
[8797]284 my $use_hash_oid = 0;
[2327]285 # if an OID wasn't provided claculate one
[1241]286 if (!defined $OID) {
287 $OID = "NULL";
[17025]288 if ($self->{'OIDtype'} =~ /^hash/) {
[8797]289 $use_hash_oid = 1;
290 } elsif ($self->{'OIDtype'} eq "incremental") {
[2327]291 $OID = "D" . $OIDcount;
292 $OIDcount ++;
[8716]293
294 } elsif ($self->{'OIDtype'} eq "dirname") {
295 $OID = 'J';
296 my $filename = $self->get_source_filename();
[8797]297 if (defined($filename)) { # && -e $filename) {
[8716]298 $OID = &File::Basename::dirname($filename);
299 if (defined $OID) {
300 $OID = 'J'.&File::Basename::basename($OID);
[8797]301 $OID =~ s/\.//; #remove any periods
[8716]302 } else {
[8797]303 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
304 $use_hash_oid = 1;
[8716]305 }
306 } else {
[8797]307 print STDERR "Failed to find filename, generating hash id\n";
308 $use_hash_oid = 1;
[8716]309 }
[2327]310
[8797]311 } elsif ($self->{'OIDtype'} eq "assigned") {
[12268]312 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
[8797]313 if (defined $identifier && $identifier ne "") {
[17025]314 $OID = $identifier;
[8797]315 $OID =~ s/\.//; #remove any periods
[17025]316 if ($OID =~ /^[\d]*$/) {
317 $OID = "D" . $OID;
318 print STDERR "OID only contains numbers, adding a D\n";
319 }
320
[8797]321 } else {
322 # need a hash id
[12268]323 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
[8797]324 $use_hash_oid = 1;
325 }
326
[2327]327 } else {
[8797]328 $use_hash_oid = 1;
329 }
330
331 if ($use_hash_oid) {
[17025]332 my $hash_on_file = 1;
333 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
334 $hash_on_file = 0;
335 }
336 if ($hash_on_file) {
337 # "hash" OID - feed file to hashfile.exe
338 my $filename = $self->get_filename_for_hashing();
339 # -z: don't want to hash on the file if it is zero size
340 if (defined($filename) && -e $filename && !-z $filename) {
341 $OID = $self->_calc_OID ($filename);
342 } else {
343 $hash_on_file = 0;
344 }
345 }
346 if (!$hash_on_file) {
347 my $filename = &util::get_tmp_filename();
[2327]348 if (!open (OUTFILE, ">$filename")) {
349 print STDERR "doc::set_OID could not write to $filename\n";
350 } else {
[13172]351 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
352 print OUTFILE $doc_text;
[2327]353 close (OUTFILE);
354 }
355 $OID = $self->_calc_OID ($filename);
356 &util::rm ($filename);
[1374]357 }
[1241]358 }
359 }
360 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
361}
362
363# this uses hashdoc (embedded c thingy) which is faster but still
364# needs a little work to be suffiently stable
365sub ___set_OID {
366 my $self = shift (@_);
367 my ($OID) = @_;
368
369 # if an OID wasn't provided then calculate hash value based on document
370 if (!defined $OID)
371 {
[13172]372 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
[1241]373 my $hash_len = length($hash_text);
374
375 $OID = &hashdoc::buffer($hash_text,$hash_len);
376 }
377
378 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
379}
380
381# returns the OID for this document
382sub get_OID {
383 my $self = shift (@_);
384 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
385 return $OID if (defined $OID);
386 return "NULL";
387}
388
389sub delete_OID {
390 my $self = shift (@_);
391
392 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
393}
394
395
396# methods for manipulating section names
397
398# returns the name of the top-most section (the top
399# level of the document
400sub get_top_section {
401 my $self = shift (@_);
402
403 return "";
404}
405
406# returns a section
407sub get_parent_section {
408 my $self = shift (@_);
409 my ($section) = @_;
410
411 $section =~ s/(^|\.)\d+$//;
412
413 return $section;
414}
415
416# returns the first child section (or the end child
417# if there isn't any)
418sub get_begin_child {
419 my $self = shift (@_);
420 my ($section) = @_;
421
422 my $section_ptr = $self->_lookup_section($section);
423 return "" unless defined $section_ptr;
424
425 if (defined $section_ptr->{'subsection_order'}->[0]) {
426 return "$section.$section_ptr->{'subsection_order'}->[0]";
427 }
428
429 return $self->get_end_child ($section);
430}
431
432# returns the next child of a parent section
433sub get_next_child {
434 my $self = shift (@_);
435 my ($section) = @_;
436
437 my $parent_section = $self->get_parent_section($section);
438 my $parent_section_ptr = $self->_lookup_section($parent_section);
439 return undef unless defined $parent_section_ptr;
440
441 my ($section_num) = $section =~ /(\d+)$/;
442 return undef unless defined $section_num;
443
444 my $i = 0;
445 my $section_order = $parent_section_ptr->{'subsection_order'};
446 while ($i < scalar(@$section_order)) {
447 last if $section_order->[$i] eq $section_num;
448 $i++;
449 }
450
451 $i++; # the next child
452 if ($i < scalar(@$section_order)) {
453 return $section_order->[$i] if $parent_section eq "";
454 return "$parent_section.$section_order->[$i]";
455 }
456
457 # no more sections in this level
458 return undef;
459}
460
461# returns a reference to a list of children
462sub get_children {
463 my $self = shift (@_);
464 my ($section) = @_;
465
466 my $section_ptr = $self->_lookup_section($section);
467 return [] unless defined $section_ptr;
468
469 my @children = @{$section_ptr->{'subsection_order'}};
470
471 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
472 return \@children;
473}
474
475# returns the child section one past the last one (which
476# is coded as "0")
477sub get_end_child {
478 my $self = shift (@_);
479 my ($section) = @_;
480
481 return $section . ".0" unless $section eq "";
482 return "0";
483}
484
485# returns the next section in book order
486sub get_next_section {
487 my $self = shift (@_);
488 my ($section) = @_;
489
490 return undef unless defined $section;
491
492 my $section_ptr = $self->_lookup_section($section);
493 return undef unless defined $section_ptr;
494
495 # first try to find first child
496 if (defined $section_ptr->{'subsection_order'}->[0]) {
497 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
498 return "$section.$section_ptr->{'subsection_order'}->[0]";
499 }
500
501 do {
502 # try to find sibling
503 my $next_child = $self->get_next_child ($section);
504 return $next_child if (defined $next_child);
505
506 # move up one level
507 $section = $self->get_parent_section ($section);
508 } while $section =~ /\d/;
509
510 return undef;
511}
512
513sub is_leaf_section {
514 my $self = shift (@_);
515 my ($section) = @_;
516
517 my $section_ptr = $self->_lookup_section($section);
518 return 1 unless defined $section_ptr;
519
520 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
521}
522
523# methods for dealing with sections
524
525# returns the name of the inserted section
526sub insert_section {
527 my $self = shift (@_);
528 my ($before_section) = @_;
529
530 # get the child to insert before and its parent section
531 my $parent_section = "";
532 my $before_child = "0";
533 my @before_section = split (/\./, $before_section);
534 if (scalar(@before_section) > 0) {
535 $before_child = pop (@before_section);
536 $parent_section = join (".", @before_section);
537 }
538
539 my $parent_section_ptr = $self->_lookup_section($parent_section);
540 if (!defined $parent_section_ptr) {
541 print STDERR "doc::insert_section couldn't find parent section " .
542 "$parent_section\n";
543 return;
544 }
545
546 # get the next section number
547 my $section_num = $parent_section_ptr->{'next_subsection'}++;
548
549 my $i = 0;
550 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
551 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
552 $i++;
553 }
554
555 # insert the section number into the order list
556 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
557
558 # add this section to the parent section
559 my $section_ptr = {'subsection_order'=>[],
560 'next_subsection'=>1,
561 'subsections'=>{},
562 'metadata'=>[],
563 'text'=>""};
564 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
565
566 # work out the full section number
567 my $section = $parent_section;
568 $section .= "." unless $section eq "";
569 $section .= $section_num;
570
571 return $section;
572}
573
574# creates a pre-named section
575sub create_named_section {
576 my $self = shift (@_);
577 my ($mastersection) = @_;
578
579 my ($num);
580 my $section = $mastersection;
581 my $sectionref = $self;
582
583 while ($section ne "") {
584 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
585 $num =~ s/^0+(\d)/$1/; # remove leading 0s
586 $section = "" unless defined $section;
587
588 if (defined $num) {
589 if (!defined $sectionref->{'subsections'}->{$num}) {
590 push (@{$sectionref->{'subsection_order'}}, $num);
591 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
592 'next_subsection'=>1,
593 'subsections'=>{},
594 'metadata'=>[],
595 'text'=>""};
596 if ($num >= $sectionref->{'next_subsection'}) {
597 $sectionref->{'next_subsection'} = $num + 1;
598 }
599 }
600 $sectionref = $sectionref->{'subsections'}->{$num};
601
602 } else {
603 print STDERR "doc::create_named_section couldn't create section ";
604 print STDERR "$mastersection\n";
605 last;
606 }
607 }
608}
609
610# returns a reference to a list of subsections
611sub list_subsections {
612 my $self = shift (@_);
613 my ($section) = @_;
614
615 my $section_ptr = $self->_lookup_section ($section);
616 if (!defined $section_ptr) {
617 print STDERR "doc::list_subsections couldn't find section $section\n";
618 return [];
619 }
620
621 return [@{$section_ptr->{'subsection_order'}}];
622}
623
624sub delete_section {
625 my $self = shift (@_);
626 my ($section) = @_;
627
628# my $section_ptr = {'subsection_order'=>[],
629# 'next_subsection'=>1,
630# 'subsections'=>{},
631# 'metadata'=>[],
632# 'text'=>""};
633
634 # if this is the top section reset everything
635 if ($section eq "") {
636 $self->{'subsection_order'} = [];
637 $self->{'subsections'} = {};
638 $self->{'metadata'} = [];
639 $self->{'text'} = "";
640 return;
641 }
642
643 # find the parent of the section to delete
644 my $parent_section = "";
645 my $child = "0";
646 my @section = split (/\./, $section);
647 if (scalar(@section) > 0) {
648 $child = pop (@section);
649 $parent_section = join (".", @section);
650 }
651
652 my $parent_section_ptr = $self->_lookup_section($parent_section);
653 if (!defined $parent_section_ptr) {
654 print STDERR "doc::delete_section couldn't find parent section " .
655 "$parent_section\n";
656 return;
657 }
658
659 # remove this section from the subsection_order list
660 my $i = 0;
661 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
662 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
663 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
664 last;
665 }
666 $i++;
667 }
668
669 # remove this section from the subsection hash
670 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
671 undef $parent_section_ptr->{'subsections'}->{$child};
672 }
673}
674
675#--
[4]676# methods for dealing with metadata
677
678# set_metadata_element and get_metadata_element are for metadata
679# which should only have one value. add_meta_data and get_metadata
680# are for metadata which can have more than one value.
681
[1241]682# returns the first metadata value which matches field
[6111]683
684# This version of get metadata element works much like the one above,
685# except it allows for the namespace portion of a metadata element to
686# be ignored, thus if you are searching for dc.Title, the first piece
687# of matching metadata ending with the name Title (once any namespace
688# is removed) would be returned.
689# 28-11-2003 John Thompson
[1241]690sub get_metadata_element {
691 my $self = shift (@_);
[6111]692 my ($section, $field, $ignore_namespace) = @_;
[1241]693 my ($data);
694
[6111]695 $ignore_namespace = 0 unless defined $ignore_namespace;
696
[1241]697 my $section_ptr = $self->_lookup_section($section);
698 if (!defined $section_ptr) {
[8716]699 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
[1241]700 return;
701 }
702
[6111]703 # Remove the any namespace if we are being told to ignore them
704 if($ignore_namespace) {
705 $field =~ s/^\w*\.//;
706 }
707
[1241]708 foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]709
710 my $data_name = $data->[0];
[14966]711
[6111]712 # Remove the any namespace if we are being told to ignore them
713 if($ignore_namespace) {
714 $data_name =~ s/^\w*\.//;
715 }
716
717 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
[1241]718 }
719
720 return undef; # was not found
721}
722
723# returns a list of the form [value1, value2, ...]
724sub get_metadata {
725 my $self = shift (@_);
[6111]726 my ($section, $field, $ignore_namespace) = @_;
[1241]727 my ($data);
728
[6111]729 $ignore_namespace = 0 unless defined $ignore_namespace;
730
[1241]731 my $section_ptr = $self->_lookup_section($section);
732 if (!defined $section_ptr) {
[8716]733 print STDERR "doc::get_metadata couldn't find section ",
734 $section, "\n";
[1241]735 return;
736 }
737
[6111]738 # Remove the any namespace if we are being told to ignore them
739 if($ignore_namespace) {
740 $field =~ s/^\w*\.//;
741 }
742
[1241]743 my @metadata = ();
744 foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]745
746 my $data_name = $data->[0];
747 # Remove the any namespace if we are being told to ignore them
748 if($ignore_namespace) {
749 $data_name =~ s/^\w*\.//;
750 }
751
752 push (@metadata, $data->[1]) if ($data_name eq $field);
[1241]753 }
[9241]754
[1241]755 return \@metadata;
756}
757
758# returns a list of the form [[field,value],[field,value],...]
759sub get_all_metadata {
760 my $self = shift (@_);
761 my ($section) = @_;
762
763 my $section_ptr = $self->_lookup_section($section);
764 if (!defined $section_ptr) {
[8716]765 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
[1241]766 return;
767 }
768
769 return $section_ptr->{'metadata'};
770}
771
772# $value is optional
773sub delete_metadata {
774 my $self = shift (@_);
775 my ($section, $field, $value) = @_;
776
777 my $section_ptr = $self->_lookup_section($section);
778 if (!defined $section_ptr) {
[8716]779 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
[1241]780 return;
781 }
782
783 my $i = 0;
784 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
785 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
786 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
787 splice (@{$section_ptr->{'metadata'}}, $i, 1);
788 } else {
789 $i++;
790 }
791 }
792}
793
794sub delete_all_metadata {
795 my $self = shift (@_);
796 my ($section) = @_;
797
798 my $section_ptr = $self->_lookup_section($section);
799 if (!defined $section_ptr) {
[8716]800 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
[1241]801 return;
802 }
803
804 $section_ptr->{'metadata'} = [];
805}
806
[4]807sub set_metadata_element {
808 my $self = shift (@_);
809 my ($section, $field, $value) = @_;
810
[97]811 $self->set_utf8_metadata_element ($section, $field,
[1870]812 &unicode::ascii2utf8(\$value));
[73]813}
814
815# set_utf8_metadata_element assumes the text has already been
816# converted to the UTF-8 encoding.
817sub set_utf8_metadata_element {
818 my $self = shift (@_);
819 my ($section, $field, $value) = @_;
820
[4]821 $self->delete_metadata ($section, $field);
[73]822 $self->add_utf8_metadata ($section, $field, $value);
[4]823}
824
825
[73]826# add_metadata assumes the text is in (extended) ascii form. For
[8220]827# text which has already been converted to the UTF-8 format use
[73]828# add_utf8_metadata.
[4]829sub add_metadata {
830 my $self = shift (@_);
831 my ($section, $field, $value) = @_;
832
[97]833 $self->add_utf8_metadata ($section, $field,
[1870]834 &unicode::ascii2utf8(\$value));
[73]835}
836
837sub add_utf8_metadata {
838 my $self = shift (@_);
839 my ($section, $field, $value) = @_;
840
[4]841 my $section_ptr = $self->_lookup_section($section);
842 if (!defined $section_ptr) {
[8716]843 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
[4]844 return;
845 }
[1732]846 if (!defined $value) {
847 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
848 return;
849 }
850 if (!defined $field) {
851 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
852 return;
853 }
[4]854
[8894]855 #print STDERR "###$field=$value\n";
[7798]856 # double check that the value is utf-8
[16578]857 if (!&unicode::check_is_utf8($value)) {
[16670]858 print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
[16578]859 &unicode::ensure_utf8(\$value);
[16670]860 print STDERR " Tried converting to utf8: $value\n";
[7798]861 }
862
[4]863 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
864}
865
866
867# methods for dealing with text
868
[1241]869# returns the text for a section
870sub get_text {
871 my $self = shift (@_);
872 my ($section) = @_;
873
874 my $section_ptr = $self->_lookup_section($section);
875 if (!defined $section_ptr) {
876 print STDERR "doc::get_text couldn't find section " .
877 "$section\n";
878 return "";
879 }
880
881 return $section_ptr->{'text'};
882}
883
884# returns the (utf-8 encoded) length of the text for a section
885sub get_text_length {
886 my $self = shift (@_);
887 my ($section) = @_;
888
889 my $section_ptr = $self->_lookup_section($section);
890 if (!defined $section_ptr) {
891 print STDERR "doc::get_text_length couldn't find section " .
892 "$section\n";
893 return 0;
894 }
895
896 return length ($section_ptr->{'text'});
897}
898
899sub delete_text {
900 my $self = shift (@_);
901 my ($section) = @_;
902
903 my $section_ptr = $self->_lookup_section($section);
904 if (!defined $section_ptr) {
905 print STDERR "doc::delete_text couldn't find section " .
906 "$section\n";
907 return;
908 }
909
910 $section_ptr->{'text'} = "";
911}
912
[73]913# add_text assumes the text is in (extended) ascii form. For
914# text which has been already converted to the UTF-8 format
915# use add_utf8_text.
[4]916sub add_text {
917 my $self = shift (@_);
918 my ($section, $text) = @_;
919
[73]920 # convert the text to UTF-8 encoded unicode characters
921 # and add the text
[1870]922 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
[73]923}
924
925
926# add_utf8_text assumes the text to be added has already
927# been converted to the UTF-8 encoding. For ascii text use
928# add_text
929sub add_utf8_text {
930 my $self = shift (@_);
931 my ($section, $text) = @_;
932
[4]933 my $section_ptr = $self->_lookup_section($section);
934 if (!defined $section_ptr) {
[73]935 print STDERR "doc::add_utf8_text couldn't find section " .
[4]936 "$section\n";
937 return;
938 }
939
940 $section_ptr->{'text'} .= $text;
941}
942
[16950]943# returns the Source meta, which is the utf8 filename generated.
944# Added a separate method here for convenience
945sub get_source {
[16924]946 my $self = shift (@_);
[16950]947 return $self->get_metadata_element ($self->get_top_section(), "Source");
948}
949
950# returns the SourceFile meta, which is the url reference to the URL-encoded
951# version of Source (the utf8 filename). Added a separate method here for convenience
952sub get_sourcefile {
953 my $self = shift (@_);
954 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
955}
956
957# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
958# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
959sub get_assocfile_from_sourcefile {
960 my $self = shift (@_);
[16924]961
962 # get the SourceFile meta, which is a *URL* to a file on the filesystem
963 my $top_section = $self->get_top_section();
964 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
[4]965
[16924]966 # get the actual filename as it exists on the filesystem which this url refers to
[16928]967 $source_file = &unicode::url_to_filename($source_file);
[16924]968 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
969 return $assocfilename;
970}
971
[1241]972# methods for dealing with associated files
973
974# a file is associated with a document, NOT a section.
975# if section is defined it is noted in the data structure
976# only so that files associated from a particular section
977# may be removed later (using delete_section_assoc_files)
978sub associate_file {
979 my $self = shift (@_);
980 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
981 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
982
983 # remove all associated files with the same name
984 $self->delete_assoc_file ($assoc_filename);
[8504]985
[1241]986 push (@{$self->{'associated_files'}},
987 [$real_filename, $assoc_filename, $mime_type, $section]);
988}
989
990# returns a list of associated files in the form
991# [[real_filename, assoc_filename, mimetype], ...]
992sub get_assoc_files {
993 my $self = shift (@_);
994
995 return $self->{'associated_files'};
996}
997
998sub delete_section_assoc_files {
999 my $self = shift (@_);
1000 my ($section) = @_;
1001
1002 my $i=0;
1003 while ($i < scalar (@{$self->{'associated_files'}})) {
1004 if (defined $self->{'associated_files'}->[$i]->[3] &&
1005 $self->{'associated_files'}->[$i]->[3] eq $section) {
1006 splice (@{$self->{'associated_files'}}, $i, 1);
1007 } else {
1008 $i++;
1009 }
1010 }
1011}
1012
1013sub delete_assoc_file {
1014 my $self = shift (@_);
1015 my ($assoc_filename) = @_;
1016
1017 my $i=0;
1018 while ($i < scalar (@{$self->{'associated_files'}})) {
1019 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1020 splice (@{$self->{'associated_files'}}, $i, 1);
1021 } else {
1022 $i++;
1023 }
1024 }
1025}
1026
1027sub reset_nextsection_ptr {
1028 my $self = shift (@_);
1029 my ($section) = @_;
1030
1031 my $section_ptr = $self->_lookup_section($section);
1032 $section_ptr->{'next_subsection'} = 1;
1033}
1034
[4]10351;
Note: See TracBrowser for help on using the repository browser.