root/gsdl/trunk/perllib/doc.pm @ 16670

Revision 16670, 26.5 KB (checked in by ak19, 12 years ago)

Instead of base64 encoding the gsdl_source_filename, it now URL encodes it. GLI's DocXMLFile.java urlDecodes it now (instead of applying base64 decoding).

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr   te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47my $OIDcount = 0;
48
49sub new {
50    my $class = shift (@_);
51    my ($source_filename, $doc_type) = @_;
52
53
54    my $self = bless {'associated_files'=>[],
55              'subsection_order'=>[],
56              'next_subsection'=>1,
57              'subsections'=>{},
58              'metadata'=>[],
59              'text'=>"",
60              'OIDtype'=>"hash"}, $class;
61
62    # used to set lastmodified here, but this can screw up the HASH ids, so
63    # the docsave processor now calls set_lastmodified
64
65    $self->{'source_path'} = $source_filename;
66   
67    if (defined $source_filename) {
68    $source_filename = &util::filename_within_collection($source_filename);
69    $self->set_source_filename ($source_filename);
70    }
71
72    $self->set_doc_type ($doc_type) if defined $doc_type;
73
74    return $self;
75}
76# set lastmodified for OAI purposes, added by GRB, moved by kjdon
77sub set_lastmodified {
78    my $self = shift (@_);
79
80    my $source_path = $self->{'source_path'};
81   
82    if (defined $source_path && (-e $source_path)) {
83
84        my $file_stat = stat($source_path);
85    my $mtime = $file_stat->mtime;
86    $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
87    }
88}
89
90# clone the $self object
91sub duplicate {
92    my $self = shift (@_);
93
94    my $newobj = {};
95   
96    foreach my $k (keys %$self) {
97    $newobj->{$k} = &clone ($self->{$k});
98    }
99
100    bless $newobj, ref($self);
101    return $newobj;
102}
103
104sub clone {
105    my ($from) = @_;
106    my $type = ref ($from);
107
108    if ($type eq "HASH") {
109    my $to = {};
110    foreach my $key (keys %$from) {
111        $to->{$key} = &clone ($from->{$key});
112    }
113    return $to;
114    } elsif ($type eq "ARRAY") {
115    my $to = [];
116    foreach my $v (@$from) {
117        push (@$to, &clone ($v));
118    }
119    return $to;
120    } else {
121    return $from;
122    }
123}
124
125sub set_OIDtype {
126    my $self = shift (@_);
127    my ($type, $metadata) = @_;
128
129    if (defined $type && $type =~ /^(hash|incremental|dirname|assigned)$/) {
130    $self->{'OIDtype'} = $type;
131    } else {
132    $self->{'OIDtype'} = "hash";
133    }
134    if ($type =~ /^assigned$/) {
135    if (defined $metadata) {
136        $self->{'OIDmetadata'} = $metadata;
137    } else {
138        $self->{'OIDmetadata'} = "dc.Identifier";
139    }
140    }
141}
142
143sub set_source_filename {
144    my $self = shift (@_);
145    my ($source_filename) = @_;
146
147    # Since the gsdlsourcefilename element goes into the doc.xml it has
148    # to be utf8. However, it should also *represent* the source filename
149    # (in the import directory) which may not be utf8 at all.
150    # For instance, if this meta element (gsdlsourcefilename) will be
151    # used by other applications that parse doc.xml in order to locate
152    # gsdlsourcefilename. Therefore, the solution is to URLencode the real
153    # filename as this is a binary-to-text encoding meaning that the
154    # resulting string is ASCII (utf8).
155   
156    print STDERR "******URLencoding the gsdl_source_filename $source_filename ";
157   
158    # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
159    my ($srcfilename,$dirname,$suffix)
160    = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
161    print STDERR "-> $srcfilename -> ";
162    $srcfilename = &unicode::url_encode($srcfilename.$suffix);
163    $source_filename = &util::filename_cat($dirname, $srcfilename);
164    print STDERR "$source_filename\n";
165   
166    $self->set_utf8_metadata_element ($self->get_top_section(),
167                 "gsdlsourcefilename",
168                 $source_filename);
169}
170
171sub set_converted_filename {
172    my $self = shift (@_);
173    my ($converted_filename) = @_;
174
175    # we know the converted filename is utf8
176    $self->set_utf8_metadata_element ($self->get_top_section(),
177                 "gsdlconvertedfilename",
178                 $converted_filename);
179}
180
181
182# returns the source_filename as it was provided
183sub get_source_filename {
184    my $self = shift (@_);
185
186    return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
187}
188
189# returns converted filename if available else returns source filename
190sub get_filename_for_hashing {
191    my $self = shift (@_);
192
193    my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
194
195    if (!defined $filename) {
196    my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
197    # if NULPlug processed file, then don't give a filename
198    if (defined $plugin_name && $plugin_name eq "NULPlug") {
199        $filename = undef;
200    } else { # returns the URL encoded source filename!
201        $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
202    }
203    }
204    return $filename;
205}
206
207sub set_doc_type {
208    my $self = shift (@_);
209    my ($doc_type) = @_;
210
211    $self->set_metadata_element ($self->get_top_section(),
212                 "gsdldoctype",
213                 $doc_type);
214}
215
216# returns the gsdldoctype as it was provided
217# the default of "indexed_doc" is used if no document
218# type was provided
219sub get_doc_type {
220    my $self = shift (@_);
221
222    my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
223    return $doc_type if (defined $doc_type);
224    return "indexed_doc";
225}
226
227
228# look up the reference to the a particular section
229sub _lookup_section {
230    my $self = shift (@_);
231    my ($section) = @_;
232
233    my ($num);
234    my $sectionref = $self;
235
236    while (defined $section && $section ne "") {
237   
238    ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
239   
240    $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
241   
242    $section = "" unless defined $section;
243   
244
245    if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
246        $sectionref = $sectionref->{'subsections'}->{$num};
247    } else {
248        return undef;
249    }
250    }
251   
252    return $sectionref;
253}
254
255# calculate OID by hashing the contents of the document
256sub _calc_OID {
257    my $self = shift (@_);
258    my ($filename) = @_;
259
260    my $osexe = &util::get_os_exe();
261
262    my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
263                       $ENV{'GSDLOS'},"hashfile$osexe");
264
265    my $result = "NULL";
266   
267    if (-e "$hashfile_exe") {
268#   $result = `\"$hashfile_exe\" \"$filename\"`;
269    $result = `hashfile$osexe \"$filename\"`;
270    ($result) = $result =~ /:\s*([0-9a-f]+)/i;
271    } else {
272    print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
273    }
274    return "HASH$result";
275}
276
277# methods dealing with OID, not groups of them.
278
279# if $OID is not provided one is calculated
280sub set_OID {
281    my $self = shift (@_);
282    my ($OID) = @_;
283   
284    my $use_hash_oid = 0;
285    # if an OID wasn't provided claculate one
286    if (!defined $OID) {
287    $OID = "NULL";
288    if ($self->{'OIDtype'} eq "hash") {
289        $use_hash_oid = 1;
290    } elsif ($self->{'OIDtype'} eq "incremental") {
291        $OID = "D" . $OIDcount;
292        $OIDcount ++;
293     
294    } elsif ($self->{'OIDtype'} eq "dirname") {
295        $OID = 'J';
296        my $filename = $self->get_source_filename();
297        if (defined($filename)) { # && -e $filename) {
298        $OID = &File::Basename::dirname($filename);
299        if (defined $OID) {
300            $OID = 'J'.&File::Basename::basename($OID);
301            $OID =~ s/\.//; #remove any periods
302        } else {
303            print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
304            $use_hash_oid = 1;
305        }
306        } else {
307        print STDERR "Failed to find filename, generating hash id\n";
308        $use_hash_oid = 1;
309        }   
310       
311    } elsif ($self->{'OIDtype'} eq "assigned") {
312        my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
313        if (defined $identifier && $identifier ne "") {
314        $OID = "D" . $identifier;
315        $OID =~ s/\.//; #remove any periods
316        } else {
317        # need a hash id
318        print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
319        $use_hash_oid = 1;
320        }
321               
322    } else {
323        $use_hash_oid = 1;
324    }
325
326    if ($use_hash_oid) {
327       
328        # "hash" OID - feed file to hashfile.exe
329        #my $filename = $self->get_source_filename();
330        # we want to use the converted file for hashing if available
331        # cos its quicker
332        my $filename = $self->get_filename_for_hashing();
333        # -z: don't want to hash on the file if it is zero size
334        if (defined($filename) && -e $filename && !-z $filename) {
335        $OID = $self->_calc_OID ($filename);
336        } else {
337        $filename = &util::get_tmp_filename();
338        if (!open (OUTFILE, ">$filename")) {
339            print STDERR "doc::set_OID could not write to $filename\n";
340        } else {
341            my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
342            print OUTFILE $doc_text;
343            close (OUTFILE);
344        }
345        $OID = $self->_calc_OID ($filename);
346        &util::rm ($filename);
347        }
348    }
349    }
350    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
351}
352
353# this uses hashdoc (embedded c thingy) which is faster but still
354# needs a little work to be suffiently stable
355sub ___set_OID {
356    my $self = shift (@_);
357    my ($OID) = @_;
358
359    # if an OID wasn't provided then calculate hash value based on document
360    if (!defined $OID)
361    {
362    my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
363    my $hash_len = length($hash_text);
364
365        $OID = &hashdoc::buffer($hash_text,$hash_len);
366    }
367
368    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
369}
370
371# returns the OID for this document
372sub get_OID {
373    my $self = shift (@_);
374    my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
375    return $OID if (defined $OID);
376    return "NULL";
377}
378
379sub delete_OID {
380    my $self = shift (@_);
381   
382    $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
383}
384
385
386# methods for manipulating section names
387
388# returns the name of the top-most section (the top
389# level of the document
390sub get_top_section {
391    my $self = shift (@_);
392   
393    return "";
394}
395
396# returns a section
397sub get_parent_section {
398    my $self = shift (@_);
399    my ($section) = @_;
400
401    $section =~ s/(^|\.)\d+$//;
402
403    return $section;
404}
405
406# returns the first child section (or the end child
407# if there isn't any)
408sub get_begin_child {
409    my $self = shift (@_);
410    my ($section) = @_;
411
412    my $section_ptr = $self->_lookup_section($section);
413    return "" unless defined $section_ptr;
414
415    if (defined $section_ptr->{'subsection_order'}->[0]) {
416    return "$section.$section_ptr->{'subsection_order'}->[0]";
417    }
418
419    return $self->get_end_child ($section);
420}
421
422# returns the next child of a parent section
423sub get_next_child {
424    my $self = shift (@_);
425    my ($section) = @_;
426   
427    my $parent_section = $self->get_parent_section($section);
428    my $parent_section_ptr = $self->_lookup_section($parent_section);
429    return undef unless defined $parent_section_ptr;
430
431    my ($section_num) = $section =~ /(\d+)$/;
432    return undef unless defined $section_num;
433
434    my $i = 0;
435    my $section_order = $parent_section_ptr->{'subsection_order'};
436    while ($i < scalar(@$section_order)) {
437    last if $section_order->[$i] eq $section_num;
438    $i++;
439    }
440
441    $i++; # the next child
442    if ($i < scalar(@$section_order)) {
443    return $section_order->[$i] if $parent_section eq "";
444    return "$parent_section.$section_order->[$i]";
445    }
446
447    # no more sections in this level
448    return undef;
449}
450
451# returns a reference to a list of children
452sub get_children {
453    my $self = shift (@_);
454    my ($section) = @_;
455
456    my $section_ptr = $self->_lookup_section($section);
457    return [] unless defined $section_ptr;
458
459    my @children = @{$section_ptr->{'subsection_order'}};
460
461    map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
462    return \@children;
463}
464
465# returns the child section one past the last one (which
466# is coded as "0")
467sub get_end_child {
468    my $self = shift (@_);
469    my ($section) = @_;
470
471    return $section . ".0" unless $section eq "";
472    return "0";
473}
474
475# returns the next section in book order
476sub get_next_section {
477    my $self = shift (@_);
478    my ($section) = @_;
479
480    return undef unless defined $section;
481
482    my $section_ptr = $self->_lookup_section($section);
483    return undef unless defined $section_ptr;
484
485    # first try to find first child
486    if (defined $section_ptr->{'subsection_order'}->[0]) {
487    return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
488    return "$section.$section_ptr->{'subsection_order'}->[0]";
489    }
490
491    do {
492    # try to find sibling
493    my $next_child = $self->get_next_child ($section);
494    return $next_child if (defined $next_child);
495
496    # move up one level
497    $section = $self->get_parent_section ($section);
498    } while $section =~ /\d/;
499
500    return undef;
501}
502
503sub is_leaf_section {
504    my $self = shift (@_);
505    my ($section) = @_;
506
507    my $section_ptr = $self->_lookup_section($section);
508    return 1 unless defined $section_ptr;
509
510    return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
511}
512
513# methods for dealing with sections
514
515# returns the name of the inserted section
516sub insert_section {
517    my $self = shift (@_);
518    my ($before_section) = @_;
519
520    # get the child to insert before and its parent section
521    my $parent_section = "";
522    my $before_child = "0";
523    my @before_section = split (/\./, $before_section);
524    if (scalar(@before_section) > 0) {
525    $before_child = pop (@before_section);
526    $parent_section = join (".", @before_section);
527    }
528
529    my $parent_section_ptr = $self->_lookup_section($parent_section);
530    if (!defined $parent_section_ptr) {
531    print STDERR "doc::insert_section couldn't find parent section " .
532        "$parent_section\n";
533    return;
534    }
535
536    # get the next section number
537    my $section_num = $parent_section_ptr->{'next_subsection'}++;
538
539    my $i = 0;
540    while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
541       $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
542    $i++;
543    }
544   
545    # insert the section number into the order list
546    splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
547
548    # add this section to the parent section
549    my $section_ptr = {'subsection_order'=>[],
550               'next_subsection'=>1,
551               'subsections'=>{},
552               'metadata'=>[],
553               'text'=>""};
554    $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
555
556    # work out the full section number
557    my $section = $parent_section;
558    $section .= "." unless $section eq "";
559    $section .= $section_num;
560   
561    return $section;
562}
563
564# creates a pre-named section
565sub create_named_section {
566    my $self = shift (@_);
567    my ($mastersection) = @_;
568
569    my ($num);
570    my $section = $mastersection;
571    my $sectionref = $self;
572
573    while ($section ne "") {
574    ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
575    $num =~ s/^0+(\d)/$1/; # remove leading 0s
576    $section = "" unless defined $section;
577   
578    if (defined $num) {
579        if (!defined $sectionref->{'subsections'}->{$num}) {
580        push (@{$sectionref->{'subsection_order'}}, $num);
581        $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
582                            'next_subsection'=>1,
583                            'subsections'=>{},
584                            'metadata'=>[],
585                            'text'=>""};
586        if ($num >= $sectionref->{'next_subsection'}) {
587            $sectionref->{'next_subsection'} = $num + 1;
588        }
589        }
590        $sectionref = $sectionref->{'subsections'}->{$num};
591
592    } else {
593        print STDERR "doc::create_named_section couldn't create section ";
594        print STDERR "$mastersection\n";
595        last;
596    }
597    }
598}
599
600# returns a reference to a list of subsections
601sub list_subsections {
602    my $self = shift (@_);
603    my ($section) = @_;
604
605    my $section_ptr = $self->_lookup_section ($section);
606    if (!defined $section_ptr) {
607    print STDERR "doc::list_subsections couldn't find section $section\n";
608    return [];
609    }
610
611    return [@{$section_ptr->{'subsection_order'}}];
612}
613
614sub delete_section {
615    my $self = shift (@_);
616    my ($section) = @_;
617
618#    my $section_ptr = {'subsection_order'=>[],
619#              'next_subsection'=>1,
620#              'subsections'=>{},
621#              'metadata'=>[],
622#              'text'=>""};
623
624    # if this is the top section reset everything
625    if ($section eq "") {
626    $self->{'subsection_order'} = [];
627    $self->{'subsections'} = {};
628    $self->{'metadata'} = [];
629    $self->{'text'} = "";
630    return;
631    }
632
633    # find the parent of the section to delete
634    my $parent_section = "";
635    my $child = "0";
636    my @section = split (/\./, $section);
637    if (scalar(@section) > 0) {
638    $child = pop (@section);
639    $parent_section = join (".", @section);
640    }
641
642    my $parent_section_ptr = $self->_lookup_section($parent_section);
643    if (!defined $parent_section_ptr) {
644    print STDERR "doc::delete_section couldn't find parent section " .
645        "$parent_section\n";
646    return;
647    }
648
649    # remove this section from the subsection_order list
650    my $i = 0;
651    while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
652    if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
653        splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
654        last;
655    }
656    $i++;
657    }
658
659    # remove this section from the subsection hash
660    if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
661    undef $parent_section_ptr->{'subsections'}->{$child};
662    }
663}
664
665#--
666# methods for dealing with metadata
667
668# set_metadata_element and get_metadata_element are for metadata
669# which should only have one value. add_meta_data and get_metadata
670# are for metadata which can have more than one value.
671
672# returns the first metadata value which matches field
673
674# This version of get metadata element works much like the one above,
675# except it allows for the namespace portion of a metadata element to
676# be ignored, thus if you are searching for dc.Title, the first piece
677# of matching metadata ending with the name Title (once any namespace
678# is removed) would be returned.
679# 28-11-2003 John Thompson
680sub get_metadata_element {
681    my $self = shift (@_);
682    my ($section, $field, $ignore_namespace) = @_;
683    my ($data);
684
685    $ignore_namespace = 0 unless defined $ignore_namespace;
686
687    my $section_ptr = $self->_lookup_section($section);
688    if (!defined $section_ptr) {
689    print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
690    return;
691    }
692
693    # Remove the any namespace if we are being told to ignore them
694    if($ignore_namespace) {
695    $field =~ s/^\w*\.//;
696    }
697
698    foreach $data (@{$section_ptr->{'metadata'}}) {
699
700    my $data_name = $data->[0];
701
702    # Remove the any namespace if we are being told to ignore them
703    if($ignore_namespace) {
704        $data_name =~ s/^\w*\.//;
705    }
706   
707    return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
708    }
709   
710    return undef; # was not found
711}
712
713# returns a list of the form [value1, value2, ...]
714sub get_metadata {
715    my $self = shift (@_);
716    my ($section, $field, $ignore_namespace) = @_;
717    my ($data);
718
719    $ignore_namespace = 0 unless defined $ignore_namespace;
720
721    my $section_ptr = $self->_lookup_section($section);
722    if (!defined $section_ptr) {
723        print STDERR "doc::get_metadata couldn't find section ",
724        $section, "\n";
725        return;
726    }
727
728    # Remove the any namespace if we are being told to ignore them
729    if($ignore_namespace) {
730    $field =~ s/^\w*\.//;
731    }
732
733    my @metadata = ();
734    foreach $data (@{$section_ptr->{'metadata'}}) {
735
736    my $data_name = $data->[0];
737    # Remove the any namespace if we are being told to ignore them
738    if($ignore_namespace) {
739        $data_name =~ s/^\w*\.//;
740    }
741
742        push (@metadata, $data->[1]) if ($data_name eq $field);
743    }
744
745    return \@metadata;
746}
747
748# returns a list of the form [[field,value],[field,value],...]
749sub get_all_metadata {
750    my $self = shift (@_);
751    my ($section) = @_;
752
753    my $section_ptr = $self->_lookup_section($section);
754    if (!defined $section_ptr) {
755    print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
756    return;
757    }
758   
759    return $section_ptr->{'metadata'};
760}
761
762# $value is optional
763sub delete_metadata {
764    my $self = shift (@_);
765    my ($section, $field, $value) = @_;
766
767    my $section_ptr = $self->_lookup_section($section);
768    if (!defined $section_ptr) {
769    print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
770    return;
771    }
772
773    my $i = 0;
774    while ($i < scalar (@{$section_ptr->{'metadata'}})) {
775    if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
776        (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
777        splice (@{$section_ptr->{'metadata'}}, $i, 1);
778    } else {
779        $i++;
780    }
781    }
782}
783
784sub delete_all_metadata {
785    my $self = shift (@_);
786    my ($section) = @_;
787
788    my $section_ptr = $self->_lookup_section($section);
789    if (!defined $section_ptr) {
790    print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
791    return;
792    }
793   
794    $section_ptr->{'metadata'} = [];
795}
796
797sub set_metadata_element {
798    my $self = shift (@_);
799    my ($section, $field, $value) = @_;
800
801    $self->set_utf8_metadata_element ($section, $field,
802                      &unicode::ascii2utf8(\$value));
803}
804
805# set_utf8_metadata_element assumes the text has already been
806# converted to the UTF-8 encoding.
807sub set_utf8_metadata_element {
808    my $self = shift (@_);
809    my ($section, $field, $value) = @_;
810
811    $self->delete_metadata ($section, $field);
812    $self->add_utf8_metadata ($section, $field, $value);
813}
814
815
816# add_metadata assumes the text is in (extended) ascii form. For
817# text which has already been converted to the UTF-8 format use
818# add_utf8_metadata.
819sub add_metadata {
820    my $self = shift (@_);
821    my ($section, $field, $value) = @_;
822
823    $self->add_utf8_metadata ($section, $field,
824                  &unicode::ascii2utf8(\$value));
825}
826
827sub add_utf8_metadata {
828    my $self = shift (@_);
829    my ($section, $field, $value) = @_;
830
831    my $section_ptr = $self->_lookup_section($section);
832    if (!defined $section_ptr) {
833    print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
834    return;
835    }
836    if (!defined $value) {
837    print STDERR "doc::add_utf8_metadata undefined value for $field\n";
838    return;
839    }
840    if (!defined $field) {
841    print STDERR "doc::add_utf8_metadata undefined metadata type \n";
842    return;
843    }
844   
845    #print STDERR "###$field=$value\n";
846    # double check that the value is utf-8
847    if (!&unicode::check_is_utf8($value)) {
848    print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
849    &unicode::ensure_utf8(\$value);
850    print STDERR " Tried converting to utf8: $value\n";
851    }
852
853    push (@{$section_ptr->{'metadata'}}, [$field, $value]);
854}
855
856
857# methods for dealing with text
858
859# returns the text for a section
860sub get_text {
861    my $self = shift (@_);
862    my ($section) = @_;
863
864    my $section_ptr = $self->_lookup_section($section);
865    if (!defined $section_ptr) {
866    print STDERR "doc::get_text couldn't find section " .
867        "$section\n";
868    return "";
869    }
870
871    return $section_ptr->{'text'};
872}
873
874# returns the (utf-8 encoded) length of the text for a section
875sub get_text_length {
876    my $self = shift (@_);
877    my ($section) = @_;
878
879    my $section_ptr = $self->_lookup_section($section);
880    if (!defined $section_ptr) {
881    print STDERR "doc::get_text_length couldn't find section " .
882        "$section\n";
883    return 0;
884    }
885
886    return length ($section_ptr->{'text'});
887}
888
889sub delete_text {
890    my $self = shift (@_);
891    my ($section) = @_;
892
893    my $section_ptr = $self->_lookup_section($section);
894    if (!defined $section_ptr) {
895    print STDERR "doc::delete_text couldn't find section " .
896        "$section\n";
897    return;
898    }
899
900    $section_ptr->{'text'} = "";
901}
902
903# add_text assumes the text is in (extended) ascii form. For
904# text which has been already converted to the UTF-8 format
905# use add_utf8_text.
906sub add_text {
907    my $self = shift (@_);
908    my ($section, $text) = @_;
909
910    # convert the text to UTF-8 encoded unicode characters
911    # and add the text
912    $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
913}
914
915
916# add_utf8_text assumes the text to be added has already
917# been converted to the UTF-8 encoding. For ascii text use
918# add_text
919sub add_utf8_text {
920    my $self = shift (@_);
921    my ($section, $text) = @_;
922
923    my $section_ptr = $self->_lookup_section($section);
924    if (!defined $section_ptr) {
925    print STDERR "doc::add_utf8_text couldn't find section " .
926        "$section\n";
927    return;
928    }
929
930    $section_ptr->{'text'} .= $text;
931}
932
933
934# methods for dealing with associated files
935
936# a file is associated with a document, NOT a section.
937# if section is defined it is noted in the data structure
938# only so that files associated from a particular section
939# may be removed later (using delete_section_assoc_files)
940sub associate_file {
941    my $self = shift (@_);
942    my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
943    $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
944
945    # remove all associated files with the same name
946    $self->delete_assoc_file ($assoc_filename);
947   
948    push (@{$self->{'associated_files'}},
949      [$real_filename, $assoc_filename, $mime_type, $section]);
950}
951
952# returns a list of associated files in the form
953#   [[real_filename, assoc_filename, mimetype], ...]
954sub get_assoc_files {
955    my $self = shift (@_);
956
957    return $self->{'associated_files'};
958}
959
960sub delete_section_assoc_files {
961    my $self = shift (@_);
962    my ($section) = @_;
963
964    my $i=0;
965    while ($i < scalar (@{$self->{'associated_files'}})) {
966    if (defined $self->{'associated_files'}->[$i]->[3] &&
967        $self->{'associated_files'}->[$i]->[3] eq $section) {
968        splice (@{$self->{'associated_files'}}, $i, 1);
969    } else {
970        $i++;
971    }
972    }
973}
974
975sub delete_assoc_file {
976    my $self = shift (@_);
977    my ($assoc_filename) = @_;
978
979    my $i=0;
980    while ($i < scalar (@{$self->{'associated_files'}})) {
981    if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
982        splice (@{$self->{'associated_files'}}, $i, 1);
983    } else {
984        $i++;
985    }
986    }
987}
988
989sub reset_nextsection_ptr {
990    my $self = shift (@_);
991    my ($section) = @_;
992   
993    my $section_ptr = $self->_lookup_section($section);
994    $section_ptr->{'next_subsection'} = 1;
995}
996
9971;
Note: See TracBrowser for help on using the browser.