root/gsdl/trunk/perllib/doc.pm @ 16578

Revision 16578, 26.7 KB (checked in by ak19, 12 years ago)

1. Base64 encoded gsdlsourcefilename to preserve original filename. 2. Both subroutines set_source_filename and set_converted_filename now call set_utf8_metadata_element. 3. Subroutine add_utf8_metadata checks to see that the meta is utf8 (or else tries to make it utf8) before adding it in.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr   te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41use MIME::Base64;
42##use hashdoc;
43use docprint;
44
45# the document type may be indexed_doc, nonindexed_doc, or
46# classification
47
48my $OIDcount = 0;
49
50sub new {
51    my $class = shift (@_);
52    my ($source_filename, $doc_type) = @_;
53
54
55    my $self = bless {'associated_files'=>[],
56              'subsection_order'=>[],
57              'next_subsection'=>1,
58              'subsections'=>{},
59              'metadata'=>[],
60              'text'=>"",
61              'OIDtype'=>"hash"}, $class;
62
63    # used to set lastmodified here, but this can screw up the HASH ids, so
64    # the docsave processor now calls set_lastmodified
65
66    $self->{'source_path'} = $source_filename;
67   
68    if (defined $source_filename) {
69    $source_filename = &util::filename_within_collection($source_filename);
70    $self->set_source_filename ($source_filename);
71    }
72
73    $self->set_doc_type ($doc_type) if defined $doc_type;
74
75    return $self;
76}
77# set lastmodified for OAI purposes, added by GRB, moved by kjdon
78sub set_lastmodified {
79    my $self = shift (@_);
80
81    my $source_path = $self->{'source_path'};
82   
83    if (defined $source_path && (-e $source_path)) {
84
85        my $file_stat = stat($source_path);
86    my $mtime = $file_stat->mtime;
87    $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
88    }
89}
90
91# clone the $self object
92sub duplicate {
93    my $self = shift (@_);
94
95    my $newobj = {};
96   
97    foreach my $k (keys %$self) {
98    $newobj->{$k} = &clone ($self->{$k});
99    }
100
101    bless $newobj, ref($self);
102    return $newobj;
103}
104
105sub clone {
106    my ($from) = @_;
107    my $type = ref ($from);
108
109    if ($type eq "HASH") {
110    my $to = {};
111    foreach my $key (keys %$from) {
112        $to->{$key} = &clone ($from->{$key});
113    }
114    return $to;
115    } elsif ($type eq "ARRAY") {
116    my $to = [];
117    foreach my $v (@$from) {
118        push (@$to, &clone ($v));
119    }
120    return $to;
121    } else {
122    return $from;
123    }
124}
125
126sub set_OIDtype {
127    my $self = shift (@_);
128    my ($type, $metadata) = @_;
129
130    if (defined $type && $type =~ /^(hash|incremental|dirname|assigned)$/) {
131    $self->{'OIDtype'} = $type;
132    } else {
133    $self->{'OIDtype'} = "hash";
134    }
135    if ($type =~ /^assigned$/) {
136    if (defined $metadata) {
137        $self->{'OIDmetadata'} = $metadata;
138    } else {
139        $self->{'OIDmetadata'} = "dc.Identifier";
140    }
141    }
142}
143
144sub set_source_filename {
145    my $self = shift (@_);
146    my ($source_filename) = @_;
147
148    # Since the gsdlsourcefilename element goes into the doc.xml it has
149    # to be utf8. However, it should also *represent* the source filename
150    # (in the import directory) which may not be utf8 at all.
151    # For instance, if this meta element (gsdlsourcefilename) will be
152    # used by other applications that parse doc.xml in order to locate
153    # gsdlsourcefilename. Therefore, the solution is to encode the real
154    # filename into Base64 encoding which is a binary-to-text encoding,
155    # meaning that the resulting string is ASCII (utf8). See also
156    # MIME#Content-Transfer-Encoding in Wiki.
157   
158    print STDERR "******Base64 encoding the non-utf8 gsdl_source_filename $source_filename ";
159   
160    # remove "import/" from path, base 64 encode just the source_filename
161    # (remove trailing spaces and newlines) and prefix "import/" again.
162    my $import_dir = "import";
163    my $srcfilename = $source_filename;
164    $srcfilename =~ s/^$import_dir(?:\\|\/)+(.*)$/$1/;
165   
166    print STDERR "-> $srcfilename -> ";
167    $srcfilename = &MIME::Base64::encode_base64($srcfilename);
168    $srcfilename =~ s/\s*//sg; # for some reason it adds spaces not just at end but also in middle
169       
170    $source_filename = &util::filename_cat($import_dir, $srcfilename);
171    print STDERR "$source_filename\n";
172   
173    $self->set_utf8_metadata_element ($self->get_top_section(),
174                 "gsdlsourcefilename",
175                 $source_filename);
176}
177
178sub set_converted_filename {
179    my $self = shift (@_);
180    my ($converted_filename) = @_;
181
182    # we know the converted filename is utf8
183    $self->set_utf8_metadata_element ($self->get_top_section(),
184                 "gsdlconvertedfilename",
185                 $converted_filename);
186}
187
188
189# returns the source_filename as it was provided
190sub get_source_filename {
191    my $self = shift (@_);
192
193    return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
194}
195
196# returns converted filename if available else returns source filename
197sub get_filename_for_hashing {
198    my $self = shift (@_);
199
200    my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
201
202    if (!defined $filename) {
203    my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
204    # if NULPlug processed file, then don't give a filename
205    if (defined $plugin_name && $plugin_name eq "NULPlug") {
206        $filename = undef;
207    } else {
208        $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
209    }
210    }
211    return $filename;
212}
213
214sub set_doc_type {
215    my $self = shift (@_);
216    my ($doc_type) = @_;
217
218    $self->set_metadata_element ($self->get_top_section(),
219                 "gsdldoctype",
220                 $doc_type);
221}
222
223# returns the gsdldoctype as it was provided
224# the default of "indexed_doc" is used if no document
225# type was provided
226sub get_doc_type {
227    my $self = shift (@_);
228
229    my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
230    return $doc_type if (defined $doc_type);
231    return "indexed_doc";
232}
233
234
235# look up the reference to the a particular section
236sub _lookup_section {
237    my $self = shift (@_);
238    my ($section) = @_;
239
240    my ($num);
241    my $sectionref = $self;
242
243    while (defined $section && $section ne "") {
244   
245    ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
246   
247    $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
248   
249    $section = "" unless defined $section;
250   
251
252    if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
253        $sectionref = $sectionref->{'subsections'}->{$num};
254    } else {
255        return undef;
256    }
257    }
258   
259    return $sectionref;
260}
261
262# calculate OID by hashing the contents of the document
263sub _calc_OID {
264    my $self = shift (@_);
265    my ($filename) = @_;
266
267    my $osexe = &util::get_os_exe();
268
269    my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
270                       $ENV{'GSDLOS'},"hashfile$osexe");
271
272    my $result = "NULL";
273   
274    if (-e "$hashfile_exe") {
275#   $result = `\"$hashfile_exe\" \"$filename\"`;
276    $result = `hashfile$osexe \"$filename\"`;
277    ($result) = $result =~ /:\s*([0-9a-f]+)/i;
278
279    } else {
280    print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
281    }
282    return "HASH$result";
283}
284
285# methods dealing with OID, not groups of them.
286
287# if $OID is not provided one is calculated
288sub set_OID {
289    my $self = shift (@_);
290    my ($OID) = @_;
291   
292    my $use_hash_oid = 0;
293    # if an OID wasn't provided claculate one
294    if (!defined $OID) {
295    $OID = "NULL";
296    if ($self->{'OIDtype'} eq "hash") {
297        $use_hash_oid = 1;
298    } elsif ($self->{'OIDtype'} eq "incremental") {
299        $OID = "D" . $OIDcount;
300        $OIDcount ++;
301     
302    } elsif ($self->{'OIDtype'} eq "dirname") {
303        $OID = 'J';
304        my $filename = $self->get_source_filename();
305        if (defined($filename)) { # && -e $filename) {
306        $OID = &File::Basename::dirname($filename);
307        if (defined $OID) {
308            $OID = 'J'.&File::Basename::basename($OID);
309            $OID =~ s/\.//; #remove any periods
310        } else {
311            print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
312            $use_hash_oid = 1;
313        }
314        } else {
315        print STDERR "Failed to find filename, generating hash id\n";
316        $use_hash_oid = 1;
317        }   
318       
319    } elsif ($self->{'OIDtype'} eq "assigned") {
320        my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
321        if (defined $identifier && $identifier ne "") {
322        $OID = "D" . $identifier;
323        $OID =~ s/\.//; #remove any periods
324        } else {
325        # need a hash id
326        print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
327        $use_hash_oid = 1;
328        }
329               
330    } else {
331        $use_hash_oid = 1;
332    }
333
334    if ($use_hash_oid) {
335       
336        # "hash" OID - feed file to hashfile.exe
337        #my $filename = $self->get_source_filename();
338        # we want to use the converted file for hashing if available
339        # cos its quicker
340        my $filename = $self->get_filename_for_hashing();
341        # -z: don't want to hash on the file if it is zero size
342        if (defined($filename) && -e $filename && !-z $filename) {
343        $OID = $self->_calc_OID ($filename);
344        } else {
345        $filename = &util::get_tmp_filename();
346        if (!open (OUTFILE, ">$filename")) {
347            print STDERR "doc::set_OID could not write to $filename\n";
348        } else {
349            my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
350            print OUTFILE $doc_text;
351            close (OUTFILE);
352        }
353        $OID = $self->_calc_OID ($filename);
354        &util::rm ($filename);
355        }
356    }
357    }
358    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
359}
360
361# this uses hashdoc (embedded c thingy) which is faster but still
362# needs a little work to be suffiently stable
363sub ___set_OID {
364    my $self = shift (@_);
365    my ($OID) = @_;
366
367    # if an OID wasn't provided then calculate hash value based on document
368    if (!defined $OID)
369    {
370    my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
371    my $hash_len = length($hash_text);
372
373        $OID = &hashdoc::buffer($hash_text,$hash_len);
374    }
375
376    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
377}
378
379# returns the OID for this document
380sub get_OID {
381    my $self = shift (@_);
382    my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
383    return $OID if (defined $OID);
384    return "NULL";
385}
386
387sub delete_OID {
388    my $self = shift (@_);
389   
390    $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
391}
392
393
394# methods for manipulating section names
395
396# returns the name of the top-most section (the top
397# level of the document
398sub get_top_section {
399    my $self = shift (@_);
400   
401    return "";
402}
403
404# returns a section
405sub get_parent_section {
406    my $self = shift (@_);
407    my ($section) = @_;
408
409    $section =~ s/(^|\.)\d+$//;
410
411    return $section;
412}
413
414# returns the first child section (or the end child
415# if there isn't any)
416sub get_begin_child {
417    my $self = shift (@_);
418    my ($section) = @_;
419
420    my $section_ptr = $self->_lookup_section($section);
421    return "" unless defined $section_ptr;
422
423    if (defined $section_ptr->{'subsection_order'}->[0]) {
424    return "$section.$section_ptr->{'subsection_order'}->[0]";
425    }
426
427    return $self->get_end_child ($section);
428}
429
430# returns the next child of a parent section
431sub get_next_child {
432    my $self = shift (@_);
433    my ($section) = @_;
434   
435    my $parent_section = $self->get_parent_section($section);
436    my $parent_section_ptr = $self->_lookup_section($parent_section);
437    return undef unless defined $parent_section_ptr;
438
439    my ($section_num) = $section =~ /(\d+)$/;
440    return undef unless defined $section_num;
441
442    my $i = 0;
443    my $section_order = $parent_section_ptr->{'subsection_order'};
444    while ($i < scalar(@$section_order)) {
445    last if $section_order->[$i] eq $section_num;
446    $i++;
447    }
448
449    $i++; # the next child
450    if ($i < scalar(@$section_order)) {
451    return $section_order->[$i] if $parent_section eq "";
452    return "$parent_section.$section_order->[$i]";
453    }
454
455    # no more sections in this level
456    return undef;
457}
458
459# returns a reference to a list of children
460sub get_children {
461    my $self = shift (@_);
462    my ($section) = @_;
463
464    my $section_ptr = $self->_lookup_section($section);
465    return [] unless defined $section_ptr;
466
467    my @children = @{$section_ptr->{'subsection_order'}};
468
469    map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
470    return \@children;
471}
472
473# returns the child section one past the last one (which
474# is coded as "0")
475sub get_end_child {
476    my $self = shift (@_);
477    my ($section) = @_;
478
479    return $section . ".0" unless $section eq "";
480    return "0";
481}
482
483# returns the next section in book order
484sub get_next_section {
485    my $self = shift (@_);
486    my ($section) = @_;
487
488    return undef unless defined $section;
489
490    my $section_ptr = $self->_lookup_section($section);
491    return undef unless defined $section_ptr;
492
493    # first try to find first child
494    if (defined $section_ptr->{'subsection_order'}->[0]) {
495    return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
496    return "$section.$section_ptr->{'subsection_order'}->[0]";
497    }
498
499    do {
500    # try to find sibling
501    my $next_child = $self->get_next_child ($section);
502    return $next_child if (defined $next_child);
503
504    # move up one level
505    $section = $self->get_parent_section ($section);
506    } while $section =~ /\d/;
507
508    return undef;
509}
510
511sub is_leaf_section {
512    my $self = shift (@_);
513    my ($section) = @_;
514
515    my $section_ptr = $self->_lookup_section($section);
516    return 1 unless defined $section_ptr;
517
518    return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
519}
520
521# methods for dealing with sections
522
523# returns the name of the inserted section
524sub insert_section {
525    my $self = shift (@_);
526    my ($before_section) = @_;
527
528    # get the child to insert before and its parent section
529    my $parent_section = "";
530    my $before_child = "0";
531    my @before_section = split (/\./, $before_section);
532    if (scalar(@before_section) > 0) {
533    $before_child = pop (@before_section);
534    $parent_section = join (".", @before_section);
535    }
536
537    my $parent_section_ptr = $self->_lookup_section($parent_section);
538    if (!defined $parent_section_ptr) {
539    print STDERR "doc::insert_section couldn't find parent section " .
540        "$parent_section\n";
541    return;
542    }
543
544    # get the next section number
545    my $section_num = $parent_section_ptr->{'next_subsection'}++;
546
547    my $i = 0;
548    while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
549       $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
550    $i++;
551    }
552   
553    # insert the section number into the order list
554    splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
555
556    # add this section to the parent section
557    my $section_ptr = {'subsection_order'=>[],
558               'next_subsection'=>1,
559               'subsections'=>{},
560               'metadata'=>[],
561               'text'=>""};
562    $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
563
564    # work out the full section number
565    my $section = $parent_section;
566    $section .= "." unless $section eq "";
567    $section .= $section_num;
568   
569    return $section;
570}
571
572# creates a pre-named section
573sub create_named_section {
574    my $self = shift (@_);
575    my ($mastersection) = @_;
576
577    my ($num);
578    my $section = $mastersection;
579    my $sectionref = $self;
580
581    while ($section ne "") {
582    ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
583    $num =~ s/^0+(\d)/$1/; # remove leading 0s
584    $section = "" unless defined $section;
585   
586    if (defined $num) {
587        if (!defined $sectionref->{'subsections'}->{$num}) {
588        push (@{$sectionref->{'subsection_order'}}, $num);
589        $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
590                            'next_subsection'=>1,
591                            'subsections'=>{},
592                            'metadata'=>[],
593                            'text'=>""};
594        if ($num >= $sectionref->{'next_subsection'}) {
595            $sectionref->{'next_subsection'} = $num + 1;
596        }
597        }
598        $sectionref = $sectionref->{'subsections'}->{$num};
599
600    } else {
601        print STDERR "doc::create_named_section couldn't create section ";
602        print STDERR "$mastersection\n";
603        last;
604    }
605    }
606}
607
608# returns a reference to a list of subsections
609sub list_subsections {
610    my $self = shift (@_);
611    my ($section) = @_;
612
613    my $section_ptr = $self->_lookup_section ($section);
614    if (!defined $section_ptr) {
615    print STDERR "doc::list_subsections couldn't find section $section\n";
616    return [];
617    }
618
619    return [@{$section_ptr->{'subsection_order'}}];
620}
621
622sub delete_section {
623    my $self = shift (@_);
624    my ($section) = @_;
625
626#    my $section_ptr = {'subsection_order'=>[],
627#              'next_subsection'=>1,
628#              'subsections'=>{},
629#              'metadata'=>[],
630#              'text'=>""};
631
632    # if this is the top section reset everything
633    if ($section eq "") {
634    $self->{'subsection_order'} = [];
635    $self->{'subsections'} = {};
636    $self->{'metadata'} = [];
637    $self->{'text'} = "";
638    return;
639    }
640
641    # find the parent of the section to delete
642    my $parent_section = "";
643    my $child = "0";
644    my @section = split (/\./, $section);
645    if (scalar(@section) > 0) {
646    $child = pop (@section);
647    $parent_section = join (".", @section);
648    }
649
650    my $parent_section_ptr = $self->_lookup_section($parent_section);
651    if (!defined $parent_section_ptr) {
652    print STDERR "doc::delete_section couldn't find parent section " .
653        "$parent_section\n";
654    return;
655    }
656
657    # remove this section from the subsection_order list
658    my $i = 0;
659    while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
660    if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
661        splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
662        last;
663    }
664    $i++;
665    }
666
667    # remove this section from the subsection hash
668    if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
669    undef $parent_section_ptr->{'subsections'}->{$child};
670    }
671}
672
673#--
674# methods for dealing with metadata
675
676# set_metadata_element and get_metadata_element are for metadata
677# which should only have one value. add_meta_data and get_metadata
678# are for metadata which can have more than one value.
679
680# returns the first metadata value which matches field
681
682# This version of get metadata element works much like the one above,
683# except it allows for the namespace portion of a metadata element to
684# be ignored, thus if you are searching for dc.Title, the first piece
685# of matching metadata ending with the name Title (once any namespace
686# is removed) would be returned.
687# 28-11-2003 John Thompson
688sub get_metadata_element {
689    my $self = shift (@_);
690    my ($section, $field, $ignore_namespace) = @_;
691    my ($data);
692
693    $ignore_namespace = 0 unless defined $ignore_namespace;
694
695    my $section_ptr = $self->_lookup_section($section);
696    if (!defined $section_ptr) {
697    print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
698    return;
699    }
700
701    # Remove the any namespace if we are being told to ignore them
702    if($ignore_namespace) {
703    $field =~ s/^\w*\.//;
704    }
705
706    foreach $data (@{$section_ptr->{'metadata'}}) {
707
708    my $data_name = $data->[0];
709
710    # Remove the any namespace if we are being told to ignore them
711    if($ignore_namespace) {
712        $data_name =~ s/^\w*\.//;
713    }
714   
715    return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
716    }
717   
718    return undef; # was not found
719}
720
721# returns a list of the form [value1, value2, ...]
722sub get_metadata {
723    my $self = shift (@_);
724    my ($section, $field, $ignore_namespace) = @_;
725    my ($data);
726
727    $ignore_namespace = 0 unless defined $ignore_namespace;
728
729    my $section_ptr = $self->_lookup_section($section);
730    if (!defined $section_ptr) {
731        print STDERR "doc::get_metadata couldn't find section ",
732        $section, "\n";
733        return;
734    }
735
736    # Remove the any namespace if we are being told to ignore them
737    if($ignore_namespace) {
738    $field =~ s/^\w*\.//;
739    }
740
741    my @metadata = ();
742    foreach $data (@{$section_ptr->{'metadata'}}) {
743
744    my $data_name = $data->[0];
745    # Remove the any namespace if we are being told to ignore them
746    if($ignore_namespace) {
747        $data_name =~ s/^\w*\.//;
748    }
749
750        push (@metadata, $data->[1]) if ($data_name eq $field);
751    }
752
753    return \@metadata;
754}
755
756# returns a list of the form [[field,value],[field,value],...]
757sub get_all_metadata {
758    my $self = shift (@_);
759    my ($section) = @_;
760
761    my $section_ptr = $self->_lookup_section($section);
762    if (!defined $section_ptr) {
763    print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
764    return;
765    }
766   
767    return $section_ptr->{'metadata'};
768}
769
770# $value is optional
771sub delete_metadata {
772    my $self = shift (@_);
773    my ($section, $field, $value) = @_;
774
775    my $section_ptr = $self->_lookup_section($section);
776    if (!defined $section_ptr) {
777    print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
778    return;
779    }
780
781    my $i = 0;
782    while ($i < scalar (@{$section_ptr->{'metadata'}})) {
783    if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
784        (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
785        splice (@{$section_ptr->{'metadata'}}, $i, 1);
786    } else {
787        $i++;
788    }
789    }
790}
791
792sub delete_all_metadata {
793    my $self = shift (@_);
794    my ($section) = @_;
795
796    my $section_ptr = $self->_lookup_section($section);
797    if (!defined $section_ptr) {
798    print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
799    return;
800    }
801   
802    $section_ptr->{'metadata'} = [];
803}
804
805sub set_metadata_element {
806    my $self = shift (@_);
807    my ($section, $field, $value) = @_;
808
809    $self->set_utf8_metadata_element ($section, $field,
810                      &unicode::ascii2utf8(\$value));
811}
812
813# set_utf8_metadata_element assumes the text has already been
814# converted to the UTF-8 encoding.
815sub set_utf8_metadata_element {
816    my $self = shift (@_);
817    my ($section, $field, $value) = @_;
818
819    $self->delete_metadata ($section, $field);
820    $self->add_utf8_metadata ($section, $field, $value);
821}
822
823
824# add_metadata assumes the text is in (extended) ascii form. For
825# text which has already been converted to the UTF-8 format use
826# add_utf8_metadata.
827sub add_metadata {
828    my $self = shift (@_);
829    my ($section, $field, $value) = @_;
830
831    $self->add_utf8_metadata ($section, $field,
832                  &unicode::ascii2utf8(\$value));
833}
834
835sub add_utf8_metadata {
836    my $self = shift (@_);
837    my ($section, $field, $value) = @_;
838
839    my $section_ptr = $self->_lookup_section($section);
840    if (!defined $section_ptr) {
841    print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
842    return;
843    }
844    if (!defined $value) {
845    print STDERR "doc::add_utf8_metadata undefined value for $field\n";
846    return;
847    }
848    if (!defined $field) {
849    print STDERR "doc::add_utf8_metadata undefined metadata type \n";
850    return;
851    }
852   
853    #print STDERR "###$field=$value\n";
854    # double check that the value is utf-8
855    if (!&unicode::check_is_utf8($value)) {
856    print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8. ";
857    &unicode::ensure_utf8(\$value);
858    print STDERR "Tried converting to utf8: $value\n";
859    }
860
861    push (@{$section_ptr->{'metadata'}}, [$field, $value]);
862}
863
864
865# methods for dealing with text
866
867# returns the text for a section
868sub get_text {
869    my $self = shift (@_);
870    my ($section) = @_;
871
872    my $section_ptr = $self->_lookup_section($section);
873    if (!defined $section_ptr) {
874    print STDERR "doc::get_text couldn't find section " .
875        "$section\n";
876    return "";
877    }
878
879    return $section_ptr->{'text'};
880}
881
882# returns the (utf-8 encoded) length of the text for a section
883sub get_text_length {
884    my $self = shift (@_);
885    my ($section) = @_;
886
887    my $section_ptr = $self->_lookup_section($section);
888    if (!defined $section_ptr) {
889    print STDERR "doc::get_text_length couldn't find section " .
890        "$section\n";
891    return 0;
892    }
893
894    return length ($section_ptr->{'text'});
895}
896
897sub delete_text {
898    my $self = shift (@_);
899    my ($section) = @_;
900
901    my $section_ptr = $self->_lookup_section($section);
902    if (!defined $section_ptr) {
903    print STDERR "doc::delete_text couldn't find section " .
904        "$section\n";
905    return;
906    }
907
908    $section_ptr->{'text'} = "";
909}
910
911# add_text assumes the text is in (extended) ascii form. For
912# text which has been already converted to the UTF-8 format
913# use add_utf8_text.
914sub add_text {
915    my $self = shift (@_);
916    my ($section, $text) = @_;
917
918    # convert the text to UTF-8 encoded unicode characters
919    # and add the text
920    $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
921}
922
923
924# add_utf8_text assumes the text to be added has already
925# been converted to the UTF-8 encoding. For ascii text use
926# add_text
927sub add_utf8_text {
928    my $self = shift (@_);
929    my ($section, $text) = @_;
930
931    my $section_ptr = $self->_lookup_section($section);
932    if (!defined $section_ptr) {
933    print STDERR "doc::add_utf8_text couldn't find section " .
934        "$section\n";
935    return;
936    }
937
938    $section_ptr->{'text'} .= $text;
939}
940
941
942# methods for dealing with associated files
943
944# a file is associated with a document, NOT a section.
945# if section is defined it is noted in the data structure
946# only so that files associated from a particular section
947# may be removed later (using delete_section_assoc_files)
948sub associate_file {
949    my $self = shift (@_);
950    my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
951    $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
952
953    # remove all associated files with the same name
954    $self->delete_assoc_file ($assoc_filename);
955   
956    push (@{$self->{'associated_files'}},
957      [$real_filename, $assoc_filename, $mime_type, $section]);
958}
959
960# returns a list of associated files in the form
961#   [[real_filename, assoc_filename, mimetype], ...]
962sub get_assoc_files {
963    my $self = shift (@_);
964
965    return $self->{'associated_files'};
966}
967
968sub delete_section_assoc_files {
969    my $self = shift (@_);
970    my ($section) = @_;
971
972    my $i=0;
973    while ($i < scalar (@{$self->{'associated_files'}})) {
974    if (defined $self->{'associated_files'}->[$i]->[3] &&
975        $self->{'associated_files'}->[$i]->[3] eq $section) {
976        splice (@{$self->{'associated_files'}}, $i, 1);
977    } else {
978        $i++;
979    }
980    }
981}
982
983sub delete_assoc_file {
984    my $self = shift (@_);
985    my ($assoc_filename) = @_;
986
987    my $i=0;
988    while ($i < scalar (@{$self->{'associated_files'}})) {
989    if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
990        splice (@{$self->{'associated_files'}}, $i, 1);
991    } else {
992        $i++;
993    }
994    }
995}
996
997sub reset_nextsection_ptr {
998    my $self = shift (@_);
999    my ($section) = @_;
1000   
1001    my $section_ptr = $self->_lookup_section($section);
1002    $section_ptr->{'next_subsection'} = 1;
1003}
1004
10051;
Note: See TracBrowser for help on using the browser.