source: gsdl/trunk/perllib/doc.pm@ 16578

Last change on this file since 16578 was 16578, checked in by ak19, 16 years ago
  1. Base64 encoded gsdlsourcefilename to preserve original filename. 2. Both subroutines set_source_filename and set_converted_filename now call set_utf8_metadata_element. 3. Subroutine add_utf8_metadata checks to see that the meta is utf8 (or else tries to make it utf8) before adding it in.
  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 26.7 KB
RevLine 
[537]1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
[8894]10# This program is free software; you can redistr te it and/or modify
[537]11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[1241]26# base class to hold documents
[4]27
28package doc;
[3834]29eval {require bytes};
[4]30
[832]31BEGIN {
[1241]32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
[832]34}
35
[15894]36use strict;
[1241]37use unicode;
38use util;
39use ghtml;
[8220]40use File::stat;
[16578]41use MIME::Base64;
[1241]42##use hashdoc;
[13172]43use docprint;
[1241]44
[4]45# the document type may be indexed_doc, nonindexed_doc, or
46# classification
47
[2327]48my $OIDcount = 0;
49
[4]50sub new {
51 my $class = shift (@_);
[1374]52 my ($source_filename, $doc_type) = @_;
53
[13770]54
[1241]55 my $self = bless {'associated_files'=>[],
56 'subsection_order'=>[],
57 'next_subsection'=>1,
58 'subsections'=>{},
59 'metadata'=>[],
[2327]60 'text'=>"",
61 'OIDtype'=>"hash"}, $class;
[4]62
[10217]63 # used to set lastmodified here, but this can screw up the HASH ids, so
64 # the docsave processor now calls set_lastmodified
[13770]65
66 $self->{'source_path'} = $source_filename;
[10217]67
[7929]68 if (defined $source_filename) {
[15874]69 $source_filename = &util::filename_within_collection($source_filename);
[7929]70 $self->set_source_filename ($source_filename);
71 }
72
[1374]73 $self->set_doc_type ($doc_type) if defined $doc_type;
[485]74
[4]75 return $self;
76}
[10217]77# set lastmodified for OAI purposes, added by GRB, moved by kjdon
78sub set_lastmodified {
79 my $self = shift (@_);
[4]80
[13770]81 my $source_path = $self->{'source_path'};
82
83 if (defined $source_path && (-e $source_path)) {
84
85 my $file_stat = stat($source_path);
[10217]86 my $mtime = $file_stat->mtime;
87 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
88 }
89}
90
[1241]91# clone the $self object
92sub duplicate {
93 my $self = shift (@_);
94
95 my $newobj = {};
96
[8716]97 foreach my $k (keys %$self) {
[1241]98 $newobj->{$k} = &clone ($self->{$k});
99 }
100
101 bless $newobj, ref($self);
102 return $newobj;
103}
104
105sub clone {
106 my ($from) = @_;
107 my $type = ref ($from);
108
109 if ($type eq "HASH") {
110 my $to = {};
[8716]111 foreach my $key (keys %$from) {
[1241]112 $to->{$key} = &clone ($from->{$key});
113 }
114 return $to;
115 } elsif ($type eq "ARRAY") {
116 my $to = [];
[8716]117 foreach my $v (@$from) {
[1241]118 push (@$to, &clone ($v));
119 }
120 return $to;
121 } else {
122 return $from;
123 }
124}
125
[2327]126sub set_OIDtype {
127 my $self = shift (@_);
[12268]128 my ($type, $metadata) = @_;
[1241]129
[10608]130 if (defined $type && $type =~ /^(hash|incremental|dirname|assigned)$/) {
[2327]131 $self->{'OIDtype'} = $type;
132 } else {
133 $self->{'OIDtype'} = "hash";
134 }
[12268]135 if ($type =~ /^assigned$/) {
136 if (defined $metadata) {
137 $self->{'OIDmetadata'} = $metadata;
138 } else {
139 $self->{'OIDmetadata'} = "dc.Identifier";
140 }
141 }
[2327]142}
143
[1241]144sub set_source_filename {
145 my $self = shift (@_);
146 my ($source_filename) = @_;
147
[16578]148 # Since the gsdlsourcefilename element goes into the doc.xml it has
149 # to be utf8. However, it should also *represent* the source filename
150 # (in the import directory) which may not be utf8 at all.
151 # For instance, if this meta element (gsdlsourcefilename) will be
152 # used by other applications that parse doc.xml in order to locate
153 # gsdlsourcefilename. Therefore, the solution is to encode the real
154 # filename into Base64 encoding which is a binary-to-text encoding,
155 # meaning that the resulting string is ASCII (utf8). See also
156 # MIME#Content-Transfer-Encoding in Wiki.
157
158 print STDERR "******Base64 encoding the non-utf8 gsdl_source_filename $source_filename ";
159
160 # remove "import/" from path, base 64 encode just the source_filename
161 # (remove trailing spaces and newlines) and prefix "import/" again.
162 my $import_dir = "import";
163 my $srcfilename = $source_filename;
164 $srcfilename =~ s/^$import_dir(?:\\|\/)+(.*)$/$1/;
165
166 print STDERR "-> $srcfilename -> ";
167 $srcfilename = &MIME::Base64::encode_base64($srcfilename);
168 $srcfilename =~ s/\s*//sg; # for some reason it adds spaces not just at end but also in middle
169
170 $source_filename = &util::filename_cat($import_dir, $srcfilename);
171 print STDERR "$source_filename\n";
172
173 $self->set_utf8_metadata_element ($self->get_top_section(),
[1241]174 "gsdlsourcefilename",
175 $source_filename);
176}
177
[7569]178sub set_converted_filename {
179 my $self = shift (@_);
180 my ($converted_filename) = @_;
181
[16578]182 # we know the converted filename is utf8
183 $self->set_utf8_metadata_element ($self->get_top_section(),
[7569]184 "gsdlconvertedfilename",
185 $converted_filename);
186}
187
188
[1241]189# returns the source_filename as it was provided
190sub get_source_filename {
191 my $self = shift (@_);
192
193 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
194}
195
[7569]196# returns converted filename if available else returns source filename
197sub get_filename_for_hashing {
198 my $self = shift (@_);
199
200 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
201
202 if (!defined $filename) {
[11097]203 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
[10980]204 # if NULPlug processed file, then don't give a filename
[11097]205 if (defined $plugin_name && $plugin_name eq "NULPlug") {
[10980]206 $filename = undef;
207 } else {
208 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
209 }
[7569]210 }
211 return $filename;
212}
213
[1241]214sub set_doc_type {
215 my $self = shift (@_);
216 my ($doc_type) = @_;
217
218 $self->set_metadata_element ($self->get_top_section(),
219 "gsdldoctype",
220 $doc_type);
221}
222
[10217]223# returns the gsdldoctype as it was provided
[1241]224# the default of "indexed_doc" is used if no document
225# type was provided
226sub get_doc_type {
227 my $self = shift (@_);
228
229 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
230 return $doc_type if (defined $doc_type);
231 return "indexed_doc";
232}
233
234
235# look up the reference to the a particular section
236sub _lookup_section {
237 my $self = shift (@_);
238 my ($section) = @_;
239
240 my ($num);
241 my $sectionref = $self;
242
243 while (defined $section && $section ne "") {
[12327]244
[1241]245 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
[12327]246
247 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
248
[1241]249 $section = "" unless defined $section;
250
[12327]251
[1241]252 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
253 $sectionref = $sectionref->{'subsections'}->{$num};
254 } else {
255 return undef;
256 }
257 }
258
259 return $sectionref;
260}
261
[2327]262# calculate OID by hashing the contents of the document
[1241]263sub _calc_OID {
264 my $self = shift (@_);
265 my ($filename) = @_;
266
267 my $osexe = &util::get_os_exe();
268
269 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
270 $ENV{'GSDLOS'},"hashfile$osexe");
[8504]271
[1241]272 my $result = "NULL";
[1679]273
[1241]274 if (-e "$hashfile_exe") {
[1679]275# $result = `\"$hashfile_exe\" \"$filename\"`;
276 $result = `hashfile$osexe \"$filename\"`;
[1241]277 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
278
279 } else {
280 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
281 }
282 return "HASH$result";
283}
284
285# methods dealing with OID, not groups of them.
286
[2327]287# if $OID is not provided one is calculated
[1241]288sub set_OID {
289 my $self = shift (@_);
290 my ($OID) = @_;
[8504]291
[8797]292 my $use_hash_oid = 0;
[2327]293 # if an OID wasn't provided claculate one
[1241]294 if (!defined $OID) {
295 $OID = "NULL";
[8797]296 if ($self->{'OIDtype'} eq "hash") {
297 $use_hash_oid = 1;
298 } elsif ($self->{'OIDtype'} eq "incremental") {
[2327]299 $OID = "D" . $OIDcount;
300 $OIDcount ++;
[8716]301
302 } elsif ($self->{'OIDtype'} eq "dirname") {
303 $OID = 'J';
304 my $filename = $self->get_source_filename();
[8797]305 if (defined($filename)) { # && -e $filename) {
[8716]306 $OID = &File::Basename::dirname($filename);
307 if (defined $OID) {
308 $OID = 'J'.&File::Basename::basename($OID);
[8797]309 $OID =~ s/\.//; #remove any periods
[8716]310 } else {
[8797]311 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
312 $use_hash_oid = 1;
[8716]313 }
314 } else {
[8797]315 print STDERR "Failed to find filename, generating hash id\n";
316 $use_hash_oid = 1;
[8716]317 }
[2327]318
[8797]319 } elsif ($self->{'OIDtype'} eq "assigned") {
[12268]320 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
[8797]321 if (defined $identifier && $identifier ne "") {
[12268]322 $OID = "D" . $identifier;
[8797]323 $OID =~ s/\.//; #remove any periods
324 } else {
325 # need a hash id
[12268]326 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
[8797]327 $use_hash_oid = 1;
328 }
329
[2327]330 } else {
[8797]331 $use_hash_oid = 1;
332 }
333
334 if ($use_hash_oid) {
335
[2327]336 # "hash" OID - feed file to hashfile.exe
[7569]337 #my $filename = $self->get_source_filename();
338 # we want to use the converted file for hashing if available
339 # cos its quicker
340 my $filename = $self->get_filename_for_hashing();
[9038]341 # -z: don't want to hash on the file if it is zero size
342 if (defined($filename) && -e $filename && !-z $filename) {
[2327]343 $OID = $self->_calc_OID ($filename);
344 } else {
345 $filename = &util::get_tmp_filename();
346 if (!open (OUTFILE, ">$filename")) {
347 print STDERR "doc::set_OID could not write to $filename\n";
348 } else {
[13172]349 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
350 print OUTFILE $doc_text;
[2327]351 close (OUTFILE);
352 }
353 $OID = $self->_calc_OID ($filename);
354 &util::rm ($filename);
[1374]355 }
[1241]356 }
357 }
358 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
359}
360
361# this uses hashdoc (embedded c thingy) which is faster but still
362# needs a little work to be suffiently stable
363sub ___set_OID {
364 my $self = shift (@_);
365 my ($OID) = @_;
366
367 # if an OID wasn't provided then calculate hash value based on document
368 if (!defined $OID)
369 {
[13172]370 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
[1241]371 my $hash_len = length($hash_text);
372
373 $OID = &hashdoc::buffer($hash_text,$hash_len);
374 }
375
376 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
377}
378
379# returns the OID for this document
380sub get_OID {
381 my $self = shift (@_);
382 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
383 return $OID if (defined $OID);
384 return "NULL";
385}
386
387sub delete_OID {
388 my $self = shift (@_);
389
390 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
391}
392
393
394# methods for manipulating section names
395
396# returns the name of the top-most section (the top
397# level of the document
398sub get_top_section {
399 my $self = shift (@_);
400
401 return "";
402}
403
404# returns a section
405sub get_parent_section {
406 my $self = shift (@_);
407 my ($section) = @_;
408
409 $section =~ s/(^|\.)\d+$//;
410
411 return $section;
412}
413
414# returns the first child section (or the end child
415# if there isn't any)
416sub get_begin_child {
417 my $self = shift (@_);
418 my ($section) = @_;
419
420 my $section_ptr = $self->_lookup_section($section);
421 return "" unless defined $section_ptr;
422
423 if (defined $section_ptr->{'subsection_order'}->[0]) {
424 return "$section.$section_ptr->{'subsection_order'}->[0]";
425 }
426
427 return $self->get_end_child ($section);
428}
429
430# returns the next child of a parent section
431sub get_next_child {
432 my $self = shift (@_);
433 my ($section) = @_;
434
435 my $parent_section = $self->get_parent_section($section);
436 my $parent_section_ptr = $self->_lookup_section($parent_section);
437 return undef unless defined $parent_section_ptr;
438
439 my ($section_num) = $section =~ /(\d+)$/;
440 return undef unless defined $section_num;
441
442 my $i = 0;
443 my $section_order = $parent_section_ptr->{'subsection_order'};
444 while ($i < scalar(@$section_order)) {
445 last if $section_order->[$i] eq $section_num;
446 $i++;
447 }
448
449 $i++; # the next child
450 if ($i < scalar(@$section_order)) {
451 return $section_order->[$i] if $parent_section eq "";
452 return "$parent_section.$section_order->[$i]";
453 }
454
455 # no more sections in this level
456 return undef;
457}
458
459# returns a reference to a list of children
460sub get_children {
461 my $self = shift (@_);
462 my ($section) = @_;
463
464 my $section_ptr = $self->_lookup_section($section);
465 return [] unless defined $section_ptr;
466
467 my @children = @{$section_ptr->{'subsection_order'}};
468
469 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
470 return \@children;
471}
472
473# returns the child section one past the last one (which
474# is coded as "0")
475sub get_end_child {
476 my $self = shift (@_);
477 my ($section) = @_;
478
479 return $section . ".0" unless $section eq "";
480 return "0";
481}
482
483# returns the next section in book order
484sub get_next_section {
485 my $self = shift (@_);
486 my ($section) = @_;
487
488 return undef unless defined $section;
489
490 my $section_ptr = $self->_lookup_section($section);
491 return undef unless defined $section_ptr;
492
493 # first try to find first child
494 if (defined $section_ptr->{'subsection_order'}->[0]) {
495 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
496 return "$section.$section_ptr->{'subsection_order'}->[0]";
497 }
498
499 do {
500 # try to find sibling
501 my $next_child = $self->get_next_child ($section);
502 return $next_child if (defined $next_child);
503
504 # move up one level
505 $section = $self->get_parent_section ($section);
506 } while $section =~ /\d/;
507
508 return undef;
509}
510
511sub is_leaf_section {
512 my $self = shift (@_);
513 my ($section) = @_;
514
515 my $section_ptr = $self->_lookup_section($section);
516 return 1 unless defined $section_ptr;
517
518 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
519}
520
521# methods for dealing with sections
522
523# returns the name of the inserted section
524sub insert_section {
525 my $self = shift (@_);
526 my ($before_section) = @_;
527
528 # get the child to insert before and its parent section
529 my $parent_section = "";
530 my $before_child = "0";
531 my @before_section = split (/\./, $before_section);
532 if (scalar(@before_section) > 0) {
533 $before_child = pop (@before_section);
534 $parent_section = join (".", @before_section);
535 }
536
537 my $parent_section_ptr = $self->_lookup_section($parent_section);
538 if (!defined $parent_section_ptr) {
539 print STDERR "doc::insert_section couldn't find parent section " .
540 "$parent_section\n";
541 return;
542 }
543
544 # get the next section number
545 my $section_num = $parent_section_ptr->{'next_subsection'}++;
546
547 my $i = 0;
548 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
549 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
550 $i++;
551 }
552
553 # insert the section number into the order list
554 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
555
556 # add this section to the parent section
557 my $section_ptr = {'subsection_order'=>[],
558 'next_subsection'=>1,
559 'subsections'=>{},
560 'metadata'=>[],
561 'text'=>""};
562 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
563
564 # work out the full section number
565 my $section = $parent_section;
566 $section .= "." unless $section eq "";
567 $section .= $section_num;
568
569 return $section;
570}
571
572# creates a pre-named section
573sub create_named_section {
574 my $self = shift (@_);
575 my ($mastersection) = @_;
576
577 my ($num);
578 my $section = $mastersection;
579 my $sectionref = $self;
580
581 while ($section ne "") {
582 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
583 $num =~ s/^0+(\d)/$1/; # remove leading 0s
584 $section = "" unless defined $section;
585
586 if (defined $num) {
587 if (!defined $sectionref->{'subsections'}->{$num}) {
588 push (@{$sectionref->{'subsection_order'}}, $num);
589 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
590 'next_subsection'=>1,
591 'subsections'=>{},
592 'metadata'=>[],
593 'text'=>""};
594 if ($num >= $sectionref->{'next_subsection'}) {
595 $sectionref->{'next_subsection'} = $num + 1;
596 }
597 }
598 $sectionref = $sectionref->{'subsections'}->{$num};
599
600 } else {
601 print STDERR "doc::create_named_section couldn't create section ";
602 print STDERR "$mastersection\n";
603 last;
604 }
605 }
606}
607
608# returns a reference to a list of subsections
609sub list_subsections {
610 my $self = shift (@_);
611 my ($section) = @_;
612
613 my $section_ptr = $self->_lookup_section ($section);
614 if (!defined $section_ptr) {
615 print STDERR "doc::list_subsections couldn't find section $section\n";
616 return [];
617 }
618
619 return [@{$section_ptr->{'subsection_order'}}];
620}
621
622sub delete_section {
623 my $self = shift (@_);
624 my ($section) = @_;
625
626# my $section_ptr = {'subsection_order'=>[],
627# 'next_subsection'=>1,
628# 'subsections'=>{},
629# 'metadata'=>[],
630# 'text'=>""};
631
632 # if this is the top section reset everything
633 if ($section eq "") {
634 $self->{'subsection_order'} = [];
635 $self->{'subsections'} = {};
636 $self->{'metadata'} = [];
637 $self->{'text'} = "";
638 return;
639 }
640
641 # find the parent of the section to delete
642 my $parent_section = "";
643 my $child = "0";
644 my @section = split (/\./, $section);
645 if (scalar(@section) > 0) {
646 $child = pop (@section);
647 $parent_section = join (".", @section);
648 }
649
650 my $parent_section_ptr = $self->_lookup_section($parent_section);
651 if (!defined $parent_section_ptr) {
652 print STDERR "doc::delete_section couldn't find parent section " .
653 "$parent_section\n";
654 return;
655 }
656
657 # remove this section from the subsection_order list
658 my $i = 0;
659 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
660 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
661 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
662 last;
663 }
664 $i++;
665 }
666
667 # remove this section from the subsection hash
668 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
669 undef $parent_section_ptr->{'subsections'}->{$child};
670 }
671}
672
673#--
[4]674# methods for dealing with metadata
675
676# set_metadata_element and get_metadata_element are for metadata
677# which should only have one value. add_meta_data and get_metadata
678# are for metadata which can have more than one value.
679
[1241]680# returns the first metadata value which matches field
[6111]681
682# This version of get metadata element works much like the one above,
683# except it allows for the namespace portion of a metadata element to
684# be ignored, thus if you are searching for dc.Title, the first piece
685# of matching metadata ending with the name Title (once any namespace
686# is removed) would be returned.
687# 28-11-2003 John Thompson
[1241]688sub get_metadata_element {
689 my $self = shift (@_);
[6111]690 my ($section, $field, $ignore_namespace) = @_;
[1241]691 my ($data);
692
[6111]693 $ignore_namespace = 0 unless defined $ignore_namespace;
694
[1241]695 my $section_ptr = $self->_lookup_section($section);
696 if (!defined $section_ptr) {
[8716]697 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
[1241]698 return;
699 }
700
[6111]701 # Remove the any namespace if we are being told to ignore them
702 if($ignore_namespace) {
703 $field =~ s/^\w*\.//;
704 }
705
[1241]706 foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]707
708 my $data_name = $data->[0];
[14966]709
[6111]710 # Remove the any namespace if we are being told to ignore them
711 if($ignore_namespace) {
712 $data_name =~ s/^\w*\.//;
713 }
714
715 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
[1241]716 }
717
718 return undef; # was not found
719}
720
721# returns a list of the form [value1, value2, ...]
722sub get_metadata {
723 my $self = shift (@_);
[6111]724 my ($section, $field, $ignore_namespace) = @_;
[1241]725 my ($data);
726
[6111]727 $ignore_namespace = 0 unless defined $ignore_namespace;
728
[1241]729 my $section_ptr = $self->_lookup_section($section);
730 if (!defined $section_ptr) {
[8716]731 print STDERR "doc::get_metadata couldn't find section ",
732 $section, "\n";
[1241]733 return;
734 }
735
[6111]736 # Remove the any namespace if we are being told to ignore them
737 if($ignore_namespace) {
738 $field =~ s/^\w*\.//;
739 }
740
[1241]741 my @metadata = ();
742 foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]743
744 my $data_name = $data->[0];
745 # Remove the any namespace if we are being told to ignore them
746 if($ignore_namespace) {
747 $data_name =~ s/^\w*\.//;
748 }
749
750 push (@metadata, $data->[1]) if ($data_name eq $field);
[1241]751 }
[9241]752
[1241]753 return \@metadata;
754}
755
756# returns a list of the form [[field,value],[field,value],...]
757sub get_all_metadata {
758 my $self = shift (@_);
759 my ($section) = @_;
760
761 my $section_ptr = $self->_lookup_section($section);
762 if (!defined $section_ptr) {
[8716]763 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
[1241]764 return;
765 }
766
767 return $section_ptr->{'metadata'};
768}
769
770# $value is optional
771sub delete_metadata {
772 my $self = shift (@_);
773 my ($section, $field, $value) = @_;
774
775 my $section_ptr = $self->_lookup_section($section);
776 if (!defined $section_ptr) {
[8716]777 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
[1241]778 return;
779 }
780
781 my $i = 0;
782 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
783 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
784 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
785 splice (@{$section_ptr->{'metadata'}}, $i, 1);
786 } else {
787 $i++;
788 }
789 }
790}
791
792sub delete_all_metadata {
793 my $self = shift (@_);
794 my ($section) = @_;
795
796 my $section_ptr = $self->_lookup_section($section);
797 if (!defined $section_ptr) {
[8716]798 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
[1241]799 return;
800 }
801
802 $section_ptr->{'metadata'} = [];
803}
804
[4]805sub set_metadata_element {
806 my $self = shift (@_);
807 my ($section, $field, $value) = @_;
808
[97]809 $self->set_utf8_metadata_element ($section, $field,
[1870]810 &unicode::ascii2utf8(\$value));
[73]811}
812
813# set_utf8_metadata_element assumes the text has already been
814# converted to the UTF-8 encoding.
815sub set_utf8_metadata_element {
816 my $self = shift (@_);
817 my ($section, $field, $value) = @_;
818
[4]819 $self->delete_metadata ($section, $field);
[73]820 $self->add_utf8_metadata ($section, $field, $value);
[4]821}
822
823
[73]824# add_metadata assumes the text is in (extended) ascii form. For
[8220]825# text which has already been converted to the UTF-8 format use
[73]826# add_utf8_metadata.
[4]827sub add_metadata {
828 my $self = shift (@_);
829 my ($section, $field, $value) = @_;
830
[97]831 $self->add_utf8_metadata ($section, $field,
[1870]832 &unicode::ascii2utf8(\$value));
[73]833}
834
835sub add_utf8_metadata {
836 my $self = shift (@_);
837 my ($section, $field, $value) = @_;
838
[4]839 my $section_ptr = $self->_lookup_section($section);
840 if (!defined $section_ptr) {
[8716]841 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
[4]842 return;
843 }
[1732]844 if (!defined $value) {
845 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
846 return;
847 }
848 if (!defined $field) {
849 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
850 return;
851 }
[4]852
[8894]853 #print STDERR "###$field=$value\n";
[7798]854 # double check that the value is utf-8
[16578]855 if (!&unicode::check_is_utf8($value)) {
856 print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8. ";
857 &unicode::ensure_utf8(\$value);
858 print STDERR "Tried converting to utf8: $value\n";
[7798]859 }
860
[4]861 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
862}
863
864
865# methods for dealing with text
866
[1241]867# returns the text for a section
868sub get_text {
869 my $self = shift (@_);
870 my ($section) = @_;
871
872 my $section_ptr = $self->_lookup_section($section);
873 if (!defined $section_ptr) {
874 print STDERR "doc::get_text couldn't find section " .
875 "$section\n";
876 return "";
877 }
878
879 return $section_ptr->{'text'};
880}
881
882# returns the (utf-8 encoded) length of the text for a section
883sub get_text_length {
884 my $self = shift (@_);
885 my ($section) = @_;
886
887 my $section_ptr = $self->_lookup_section($section);
888 if (!defined $section_ptr) {
889 print STDERR "doc::get_text_length couldn't find section " .
890 "$section\n";
891 return 0;
892 }
893
894 return length ($section_ptr->{'text'});
895}
896
897sub delete_text {
898 my $self = shift (@_);
899 my ($section) = @_;
900
901 my $section_ptr = $self->_lookup_section($section);
902 if (!defined $section_ptr) {
903 print STDERR "doc::delete_text couldn't find section " .
904 "$section\n";
905 return;
906 }
907
908 $section_ptr->{'text'} = "";
909}
910
[73]911# add_text assumes the text is in (extended) ascii form. For
912# text which has been already converted to the UTF-8 format
913# use add_utf8_text.
[4]914sub add_text {
915 my $self = shift (@_);
916 my ($section, $text) = @_;
917
[73]918 # convert the text to UTF-8 encoded unicode characters
919 # and add the text
[1870]920 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
[73]921}
922
923
924# add_utf8_text assumes the text to be added has already
925# been converted to the UTF-8 encoding. For ascii text use
926# add_text
927sub add_utf8_text {
928 my $self = shift (@_);
929 my ($section, $text) = @_;
930
[4]931 my $section_ptr = $self->_lookup_section($section);
932 if (!defined $section_ptr) {
[73]933 print STDERR "doc::add_utf8_text couldn't find section " .
[4]934 "$section\n";
935 return;
936 }
937
938 $section_ptr->{'text'} .= $text;
939}
940
941
[1241]942# methods for dealing with associated files
943
944# a file is associated with a document, NOT a section.
945# if section is defined it is noted in the data structure
946# only so that files associated from a particular section
947# may be removed later (using delete_section_assoc_files)
948sub associate_file {
949 my $self = shift (@_);
950 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
951 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
952
953 # remove all associated files with the same name
954 $self->delete_assoc_file ($assoc_filename);
[8504]955
[1241]956 push (@{$self->{'associated_files'}},
957 [$real_filename, $assoc_filename, $mime_type, $section]);
958}
959
960# returns a list of associated files in the form
961# [[real_filename, assoc_filename, mimetype], ...]
962sub get_assoc_files {
963 my $self = shift (@_);
964
965 return $self->{'associated_files'};
966}
967
968sub delete_section_assoc_files {
969 my $self = shift (@_);
970 my ($section) = @_;
971
972 my $i=0;
973 while ($i < scalar (@{$self->{'associated_files'}})) {
974 if (defined $self->{'associated_files'}->[$i]->[3] &&
975 $self->{'associated_files'}->[$i]->[3] eq $section) {
976 splice (@{$self->{'associated_files'}}, $i, 1);
977 } else {
978 $i++;
979 }
980 }
981}
982
983sub delete_assoc_file {
984 my $self = shift (@_);
985 my ($assoc_filename) = @_;
986
987 my $i=0;
988 while ($i < scalar (@{$self->{'associated_files'}})) {
989 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
990 splice (@{$self->{'associated_files'}}, $i, 1);
991 } else {
992 $i++;
993 }
994 }
995}
996
997sub reset_nextsection_ptr {
998 my $self = shift (@_);
999 my ($section) = @_;
1000
1001 my $section_ptr = $self->_lookup_section($section);
1002 $section_ptr->{'next_subsection'} = 1;
1003}
1004
[4]10051;
Note: See TracBrowser for help on using the repository browser.