root/gsdl/trunk/perllib/doc.pm @ 20417

Revision 20417, 28.4 KB (checked in by kjdon, 10 years ago)

changed a typo in comment

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr   te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47our $OIDcount = 0;
48
49# rename_method can be 'url', 'none', 'base64'
50sub new {
51    my $class = shift (@_);
52    my ($source_filename, $doc_type, $rename_method) = @_;
53
54
55    my $self = bless {'associated_files'=>[],
56              'subsection_order'=>[],
57              'next_subsection'=>1,
58              'subsections'=>{},
59              'metadata'=>[],
60              'text'=>"",
61              'OIDtype'=>"hash"}, $class;
62
63    # used to set lastmodified here, but this can screw up the HASH ids, so
64    # the docsave processor now calls set_lastmodified
65
66    $self->{'source_path'} = $source_filename;
67   
68    if (defined $source_filename) {
69    $source_filename = &util::filename_within_collection($source_filename);
70    print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
71    $self->set_source_filename ($source_filename, $rename_method);
72    }
73
74    $self->set_doc_type ($doc_type) if defined $doc_type;
75
76    return $self;
77}
78# set lastmodified for OAI purposes, added by GRB, moved by kjdon
79sub set_lastmodified {
80    my $self = shift (@_);
81
82    my $source_path = $self->{'source_path'};
83   
84    if (defined $source_path && (-e $source_path)) {
85
86        my $file_stat = stat($source_path);
87    my $mtime = $file_stat->mtime;
88    $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
89    }
90}
91
92# clone the $self object
93sub duplicate {
94    my $self = shift (@_);
95
96    my $newobj = {};
97   
98    foreach my $k (keys %$self) {
99    $newobj->{$k} = &clone ($self->{$k});
100    }
101
102    bless $newobj, ref($self);
103    return $newobj;
104}
105
106sub clone {
107    my ($from) = @_;
108    my $type = ref ($from);
109
110    if ($type eq "HASH") {
111    my $to = {};
112    foreach my $key (keys %$from) {
113        $to->{$key} = &clone ($from->{$key});
114    }
115    return $to;
116    } elsif ($type eq "ARRAY") {
117    my $to = [];
118    foreach my $v (@$from) {
119        push (@$to, &clone ($v));
120    }
121    return $to;
122    } else {
123    return $from;
124    }
125}
126
127sub set_OIDtype {
128    my $self = shift (@_);
129    my ($type, $metadata) = @_;
130
131    if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|dirname|assigned)$/) {
132    $self->{'OIDtype'} = $type;
133    } else {
134    $self->{'OIDtype'} = "hash";
135    }
136
137    if ($type =~ /^assigned$/) {
138    if (defined $metadata) {
139        $self->{'OIDmetadata'} = $metadata;
140    } else {
141        $self->{'OIDmetadata'} = "dc.Identifier";
142    }
143    }
144}
145
146# rename_method can be 'url', 'none', 'base64'
147sub set_source_filename {
148    my $self = shift (@_);
149    my ($source_filename, $rename_method) = @_;
150
151    # Since the gsdlsourcefilename element goes into the doc.xml it has
152    # to be utf8. However, it should also *represent* the source filename
153    # (in the import directory) which may not be utf8 at all.
154    # For instance, if this meta element (gsdlsourcefilename) will be used
155    # by other applications that parse doc.xml in order to locate
156    # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
157    # encode the real filename as this is a binary-to-text encoding meaning
158    # that the resulting string is ASCII (utf8). Decoding will give the original.
159   
160#    print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
161
162    # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
163    my ($srcfilename,$dirname,$suffix)
164    = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
165#    print STDERR "-> $srcfilename -> ";
166    $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
167    $source_filename = &util::filename_cat($dirname, $srcfilename);
168#    print STDERR "$source_filename\n";
169   
170    $self->set_utf8_metadata_element ($self->get_top_section(),
171                 "gsdlsourcefilename",
172                 $source_filename);
173}
174
175sub set_converted_filename {
176    my $self = shift (@_);
177    my ($converted_filename) = @_;
178
179    # we know the converted filename is utf8
180    $self->set_utf8_metadata_element ($self->get_top_section(),
181                 "gsdlconvertedfilename",
182                 $converted_filename);
183}
184
185# returns the source_filename as it was provided
186sub get_unmodified_source_filename {
187    my $self = shift (@_);
188
189    return $self->{'source_path'};
190}
191
192# returns the source_filename with whatever rename_method was given
193sub get_source_filename {
194    my $self = shift (@_);
195
196    return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
197}
198
199
200
201# returns converted filename if available else returns source filename
202sub get_filename_for_hashing {
203    my $self = shift (@_);
204
205    my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
206
207    if (!defined $filename) {
208    my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
209    # if NULPlug processed file, then don't give a filename
210    if (defined $plugin_name && $plugin_name eq "NULPlug") {
211        $filename = undef;
212    } else { # returns the URL encoded source filename!
213        $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
214    }
215    }
216    return $filename;
217}
218
219sub set_doc_type {
220    my $self = shift (@_);
221    my ($doc_type) = @_;
222
223    $self->set_metadata_element ($self->get_top_section(),
224                 "gsdldoctype",
225                 $doc_type);
226}
227
228# returns the gsdldoctype as it was provided
229# the default of "indexed_doc" is used if no document
230# type was provided
231sub get_doc_type {
232    my $self = shift (@_);
233
234    my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
235    return $doc_type if (defined $doc_type);
236    return "indexed_doc";
237}
238
239
240# look up the reference to the a particular section
241sub _lookup_section {
242    my $self = shift (@_);
243    my ($section) = @_;
244
245    my ($num);
246    my $sectionref = $self;
247
248    while (defined $section && $section ne "") {
249   
250    ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
251   
252    $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
253   
254    $section = "" unless defined $section;
255   
256
257    if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
258        $sectionref = $sectionref->{'subsections'}->{$num};
259    } else {
260        return undef;
261    }
262    }
263   
264    return $sectionref;
265}
266
267# calculate OID by hashing the contents of the document
268sub _calc_OID {
269    my $self = shift (@_);
270    my ($filename) = @_;
271
272    my $osexe = &util::get_os_exe();
273
274    my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
275                       $ENV{'GSDLOS'},"hashfile$osexe");
276
277    my $result = "NULL";
278   
279    if (-e "$hashfile_exe") {
280#   $result = `\"$hashfile_exe\" \"$filename\"`;
281    $result = `hashfile$osexe \"$filename\"`;
282    ($result) = $result =~ /:\s*([0-9a-f]+)/i;
283    } else {
284    print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
285    }
286    return "HASH$result";
287}
288
289# methods dealing with OID, not groups of them.
290
291# if $OID is not provided one is calculated
292sub set_OID {
293    my $self = shift (@_);
294    my ($OID) = @_;
295   
296    my $use_hash_oid = 0;
297    # if an OID wasn't provided calculate one
298    if (!defined $OID) {
299    $OID = "NULL";
300    if ($self->{'OIDtype'} =~ /^hash/) {
301        $use_hash_oid = 1;
302    } elsif ($self->{'OIDtype'} eq "incremental") {
303        $OID = "D" . $OIDcount;
304        $OIDcount ++;
305     
306    } elsif ($self->{'OIDtype'} eq "dirname") {
307        $OID = 'J';
308        my $filename = $self->get_source_filename();
309        if (defined($filename)) { # && -e $filename) {
310        $OID = &File::Basename::dirname($filename);
311        if (defined $OID) {
312            $OID = 'J'.&File::Basename::basename($OID);
313            $OID = &util::tidy_up_oid($OID);
314        } else {
315            print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
316            $use_hash_oid = 1;
317        }
318        } else {
319        print STDERR "Failed to find filename, generating hash id\n";
320        $use_hash_oid = 1;
321        }   
322       
323    } elsif ($self->{'OIDtype'} eq "assigned") {
324        my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
325        if (defined $identifier && $identifier ne "") {
326        $OID = $identifier;
327        $OID = &util::tidy_up_oid($OID);
328        } else {
329        # need a hash id
330        print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
331        $use_hash_oid = 1;
332        }
333               
334    } else {
335        $use_hash_oid = 1;
336    }
337
338    if ($use_hash_oid) {
339        my $hash_on_file = 1;
340        if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
341        $hash_on_file = 0;
342        }
343        if ($hash_on_file) {
344        # "hash" OID - feed file to hashfile.exe
345        my $filename = $self->get_filename_for_hashing();
346        # -z: don't want to hash on the file if it is zero size
347        if (defined($filename) && -e $filename && !-z $filename) {
348            $OID = $self->_calc_OID ($filename);
349        } else {
350            $hash_on_file = 0;
351        }
352        }
353        if (!$hash_on_file) {
354        my $filename = &util::get_tmp_filename();
355        if (!open (OUTFILE, ">$filename")) {
356            print STDERR "doc::set_OID could not write to $filename\n";
357        } else {
358            my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
359            print OUTFILE $doc_text;
360            close (OUTFILE);
361        }
362        $OID = $self->_calc_OID ($filename);
363        &util::rm ($filename);
364        }
365    }
366    }
367    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
368}
369
370# this uses hashdoc (embedded c thingy) which is faster but still
371# needs a little work to be suffiently stable
372sub ___set_OID {
373    my $self = shift (@_);
374    my ($OID) = @_;
375
376    # if an OID wasn't provided then calculate hash value based on document
377    if (!defined $OID)
378    {
379    my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
380    my $hash_len = length($hash_text);
381
382        $OID = &hashdoc::buffer($hash_text,$hash_len);
383    }
384
385    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
386}
387
388# returns the OID for this document
389sub get_OID {
390    my $self = shift (@_);
391    my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
392    return $OID if (defined $OID);
393    return "NULL";
394}
395
396sub delete_OID {
397    my $self = shift (@_);
398   
399    $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
400}
401
402
403# methods for manipulating section names
404
405# returns the name of the top-most section (the top
406# level of the document
407sub get_top_section {
408    my $self = shift (@_);
409   
410    return "";
411}
412
413# returns a section
414sub get_parent_section {
415    my $self = shift (@_);
416    my ($section) = @_;
417
418    $section =~ s/(^|\.)\d+$//;
419
420    return $section;
421}
422
423# returns the first child section (or the end child
424# if there isn't any)
425sub get_begin_child {
426    my $self = shift (@_);
427    my ($section) = @_;
428
429    my $section_ptr = $self->_lookup_section($section);
430    return "" unless defined $section_ptr;
431
432    if (defined $section_ptr->{'subsection_order'}->[0]) {
433    return "$section.$section_ptr->{'subsection_order'}->[0]";
434    }
435
436    return $self->get_end_child ($section);
437}
438
439# returns the next child of a parent section
440sub get_next_child {
441    my $self = shift (@_);
442    my ($section) = @_;
443   
444    my $parent_section = $self->get_parent_section($section);
445    my $parent_section_ptr = $self->_lookup_section($parent_section);
446    return undef unless defined $parent_section_ptr;
447
448    my ($section_num) = $section =~ /(\d+)$/;
449    return undef unless defined $section_num;
450
451    my $i = 0;
452    my $section_order = $parent_section_ptr->{'subsection_order'};
453    while ($i < scalar(@$section_order)) {
454    last if $section_order->[$i] eq $section_num;
455    $i++;
456    }
457
458    $i++; # the next child
459    if ($i < scalar(@$section_order)) {
460    return $section_order->[$i] if $parent_section eq "";
461    return "$parent_section.$section_order->[$i]";
462    }
463
464    # no more sections in this level
465    return undef;
466}
467
468# returns a reference to a list of children
469sub get_children {
470    my $self = shift (@_);
471    my ($section) = @_;
472
473    my $section_ptr = $self->_lookup_section($section);
474    return [] unless defined $section_ptr;
475
476    my @children = @{$section_ptr->{'subsection_order'}};
477
478    map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
479    return \@children;
480}
481
482# returns the child section one past the last one (which
483# is coded as "0")
484sub get_end_child {
485    my $self = shift (@_);
486    my ($section) = @_;
487
488    return $section . ".0" unless $section eq "";
489    return "0";
490}
491
492# returns the next section in book order
493sub get_next_section {
494    my $self = shift (@_);
495    my ($section) = @_;
496
497    return undef unless defined $section;
498
499    my $section_ptr = $self->_lookup_section($section);
500    return undef unless defined $section_ptr;
501
502    # first try to find first child
503    if (defined $section_ptr->{'subsection_order'}->[0]) {
504    return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
505    return "$section.$section_ptr->{'subsection_order'}->[0]";
506    }
507
508    do {
509    # try to find sibling
510    my $next_child = $self->get_next_child ($section);
511    return $next_child if (defined $next_child);
512
513    # move up one level
514    $section = $self->get_parent_section ($section);
515    } while $section =~ /\d/;
516
517    return undef;
518}
519
520sub is_leaf_section {
521    my $self = shift (@_);
522    my ($section) = @_;
523
524    my $section_ptr = $self->_lookup_section($section);
525    return 1 unless defined $section_ptr;
526
527    return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
528}
529
530# methods for dealing with sections
531
532# returns the name of the inserted section
533sub insert_section {
534    my $self = shift (@_);
535    my ($before_section) = @_;
536
537    # get the child to insert before and its parent section
538    my $parent_section = "";
539    my $before_child = "0";
540    my @before_section = split (/\./, $before_section);
541    if (scalar(@before_section) > 0) {
542    $before_child = pop (@before_section);
543    $parent_section = join (".", @before_section);
544    }
545
546    my $parent_section_ptr = $self->_lookup_section($parent_section);
547    if (!defined $parent_section_ptr) {
548    print STDERR "doc::insert_section couldn't find parent section " .
549        "$parent_section\n";
550    return;
551    }
552
553    # get the next section number
554    my $section_num = $parent_section_ptr->{'next_subsection'}++;
555
556    my $i = 0;
557    while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
558       $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
559    $i++;
560    }
561   
562    # insert the section number into the order list
563    splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
564
565    # add this section to the parent section
566    my $section_ptr = {'subsection_order'=>[],
567               'next_subsection'=>1,
568               'subsections'=>{},
569               'metadata'=>[],
570               'text'=>""};
571    $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
572
573    # work out the full section number
574    my $section = $parent_section;
575    $section .= "." unless $section eq "";
576    $section .= $section_num;
577   
578    return $section;
579}
580
581# creates a pre-named section
582sub create_named_section {
583    my $self = shift (@_);
584    my ($mastersection) = @_;
585
586    my ($num);
587    my $section = $mastersection;
588    my $sectionref = $self;
589
590    while ($section ne "") {
591    ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
592    $num =~ s/^0+(\d)/$1/; # remove leading 0s
593    $section = "" unless defined $section;
594   
595    if (defined $num) {
596        if (!defined $sectionref->{'subsections'}->{$num}) {
597        push (@{$sectionref->{'subsection_order'}}, $num);
598        $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
599                            'next_subsection'=>1,
600                            'subsections'=>{},
601                            'metadata'=>[],
602                            'text'=>""};
603        if ($num >= $sectionref->{'next_subsection'}) {
604            $sectionref->{'next_subsection'} = $num + 1;
605        }
606        }
607        $sectionref = $sectionref->{'subsections'}->{$num};
608
609    } else {
610        print STDERR "doc::create_named_section couldn't create section ";
611        print STDERR "$mastersection\n";
612        last;
613    }
614    }
615}
616
617# returns a reference to a list of subsections
618sub list_subsections {
619    my $self = shift (@_);
620    my ($section) = @_;
621
622    my $section_ptr = $self->_lookup_section ($section);
623    if (!defined $section_ptr) {
624    print STDERR "doc::list_subsections couldn't find section $section\n";
625    return [];
626    }
627
628    return [@{$section_ptr->{'subsection_order'}}];
629}
630
631sub delete_section {
632    my $self = shift (@_);
633    my ($section) = @_;
634
635#    my $section_ptr = {'subsection_order'=>[],
636#              'next_subsection'=>1,
637#              'subsections'=>{},
638#              'metadata'=>[],
639#              'text'=>""};
640
641    # if this is the top section reset everything
642    if ($section eq "") {
643    $self->{'subsection_order'} = [];
644    $self->{'subsections'} = {};
645    $self->{'metadata'} = [];
646    $self->{'text'} = "";
647    return;
648    }
649
650    # find the parent of the section to delete
651    my $parent_section = "";
652    my $child = "0";
653    my @section = split (/\./, $section);
654    if (scalar(@section) > 0) {
655    $child = pop (@section);
656    $parent_section = join (".", @section);
657    }
658
659    my $parent_section_ptr = $self->_lookup_section($parent_section);
660    if (!defined $parent_section_ptr) {
661    print STDERR "doc::delete_section couldn't find parent section " .
662        "$parent_section\n";
663    return;
664    }
665
666    # remove this section from the subsection_order list
667    my $i = 0;
668    while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
669    if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
670        splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
671        last;
672    }
673    $i++;
674    }
675
676    # remove this section from the subsection hash
677    if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
678    undef $parent_section_ptr->{'subsections'}->{$child};
679    }
680}
681
682#--
683# methods for dealing with metadata
684
685# set_metadata_element and get_metadata_element are for metadata
686# which should only have one value. add_meta_data and get_metadata
687# are for metadata which can have more than one value.
688
689# returns the first metadata value which matches field
690
691# This version of get metadata element works much like the one above,
692# except it allows for the namespace portion of a metadata element to
693# be ignored, thus if you are searching for dc.Title, the first piece
694# of matching metadata ending with the name Title (once any namespace
695# is removed) would be returned.
696# 28-11-2003 John Thompson
697sub get_metadata_element {
698    my $self = shift (@_);
699    my ($section, $field, $ignore_namespace) = @_;
700    my ($data);
701
702    $ignore_namespace = 0 unless defined $ignore_namespace;
703
704    my $section_ptr = $self->_lookup_section($section);
705    if (!defined $section_ptr) {
706    print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
707    return;
708    }
709
710    # Remove the namespace if we are being told to ignore them
711    if($ignore_namespace) {
712    $field =~ s/^\w*\.//;
713    }
714
715    foreach $data (@{$section_ptr->{'metadata'}}) {
716
717    my $data_name = $data->[0];
718
719    # Remove the any namespace if we are being told to ignore them
720    if($ignore_namespace) {
721        $data_name =~ s/^\w*\.//;
722    }
723   
724    return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
725    }
726   
727    return undef; # was not found
728}
729
730# returns a list of the form [value1, value2, ...]
731sub get_metadata {
732    my $self = shift (@_);
733    my ($section, $field, $ignore_namespace) = @_;
734    my ($data);
735
736    $ignore_namespace = 0 unless defined $ignore_namespace;
737
738    my $section_ptr = $self->_lookup_section($section);
739    if (!defined $section_ptr) {
740        print STDERR "doc::get_metadata couldn't find section ",
741        $section, "\n";
742        return;
743    }
744
745    # Remove the any namespace if we are being told to ignore them
746    if($ignore_namespace) {
747    $field =~ s/^\w*\.//;
748    }
749
750    my @metadata = ();
751    foreach $data (@{$section_ptr->{'metadata'}}) {
752
753    my $data_name = $data->[0];
754    # Remove the any namespace if we are being told to ignore them
755    if($ignore_namespace) {
756        $data_name =~ s/^\w*\.//;
757    }
758
759        push (@metadata, $data->[1]) if ($data_name eq $field);
760    }
761
762    return \@metadata;
763}
764
765# returns a list of the form [[field,value],[field,value],...]
766sub get_all_metadata {
767    my $self = shift (@_);
768    my ($section) = @_;
769
770    my $section_ptr = $self->_lookup_section($section);
771    if (!defined $section_ptr) {
772    print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
773    return;
774    }
775   
776    return $section_ptr->{'metadata'};
777}
778
779# $value is optional
780sub delete_metadata {
781    my $self = shift (@_);
782    my ($section, $field, $value) = @_;
783
784    my $section_ptr = $self->_lookup_section($section);
785    if (!defined $section_ptr) {
786    print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
787    return;
788    }
789
790    my $i = 0;
791    while ($i < scalar (@{$section_ptr->{'metadata'}})) {
792    if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
793        (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
794        splice (@{$section_ptr->{'metadata'}}, $i, 1);
795    } else {
796        $i++;
797    }
798    }
799}
800
801sub delete_all_metadata {
802    my $self = shift (@_);
803    my ($section) = @_;
804
805    my $section_ptr = $self->_lookup_section($section);
806    if (!defined $section_ptr) {
807    print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
808    return;
809    }
810   
811    $section_ptr->{'metadata'} = [];
812}
813
814sub set_metadata_element {
815    my $self = shift (@_);
816    my ($section, $field, $value) = @_;
817
818    $self->set_utf8_metadata_element ($section, $field,
819                      &unicode::ascii2utf8(\$value));
820}
821
822# set_utf8_metadata_element assumes the text has already been
823# converted to the UTF-8 encoding.
824sub set_utf8_metadata_element {
825    my $self = shift (@_);
826    my ($section, $field, $value) = @_;
827
828    $self->delete_metadata ($section, $field);
829    $self->add_utf8_metadata ($section, $field, $value);
830}
831
832
833# add_metadata assumes the text is in (extended) ascii form. For
834# text which has already been converted to the UTF-8 format use
835# add_utf8_metadata.
836sub add_metadata {
837    my $self = shift (@_);
838    my ($section, $field, $value) = @_;
839
840    $self->add_utf8_metadata ($section, $field,
841                  &unicode::ascii2utf8(\$value));
842}
843
844sub add_utf8_metadata {
845    my $self = shift (@_);
846    my ($section, $field, $value) = @_;
847
848    my $section_ptr = $self->_lookup_section($section);
849    if (!defined $section_ptr) {
850    print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
851    return;
852    }
853    if (!defined $value) {
854    print STDERR "doc::add_utf8_metadata undefined value for $field\n";
855    return;
856    }
857    if (!defined $field) {
858    print STDERR "doc::add_utf8_metadata undefined metadata type \n";
859    return;
860    }
861   
862    #print STDERR "###$field=$value\n";
863    # double check that the value is utf-8
864    if (!&unicode::check_is_utf8($value)) {
865    print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
866    &unicode::ensure_utf8(\$value);
867    print STDERR " Tried converting to utf8: $value\n";
868    }
869
870    push (@{$section_ptr->{'metadata'}}, [$field, $value]);
871}
872
873
874# methods for dealing with text
875
876# returns the text for a section
877sub get_text {
878    my $self = shift (@_);
879    my ($section) = @_;
880
881    my $section_ptr = $self->_lookup_section($section);
882    if (!defined $section_ptr) {
883    print STDERR "doc::get_text couldn't find section " .
884        "$section\n";
885    return "";
886    }
887
888    return $section_ptr->{'text'};
889}
890
891# returns the (utf-8 encoded) length of the text for a section
892sub get_text_length {
893    my $self = shift (@_);
894    my ($section) = @_;
895
896    my $section_ptr = $self->_lookup_section($section);
897    if (!defined $section_ptr) {
898    print STDERR "doc::get_text_length couldn't find section " .
899        "$section\n";
900    return 0;
901    }
902
903    return length ($section_ptr->{'text'});
904}
905
906sub delete_text {
907    my $self = shift (@_);
908    my ($section) = @_;
909
910    my $section_ptr = $self->_lookup_section($section);
911    if (!defined $section_ptr) {
912    print STDERR "doc::delete_text couldn't find section " .
913        "$section\n";
914    return;
915    }
916
917    $section_ptr->{'text'} = "";
918}
919
920# add_text assumes the text is in (extended) ascii form. For
921# text which has been already converted to the UTF-8 format
922# use add_utf8_text.
923sub add_text {
924    my $self = shift (@_);
925    my ($section, $text) = @_;
926
927    # convert the text to UTF-8 encoded unicode characters
928    # and add the text
929    $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
930}
931
932
933# add_utf8_text assumes the text to be added has already
934# been converted to the UTF-8 encoding. For ascii text use
935# add_text
936sub add_utf8_text {
937    my $self = shift (@_);
938    my ($section, $text) = @_;
939
940    my $section_ptr = $self->_lookup_section($section);
941    if (!defined $section_ptr) {
942    print STDERR "doc::add_utf8_text couldn't find section " .
943        "$section\n";
944    return;
945    }
946
947    $section_ptr->{'text'} .= $text;
948}
949
950# returns the Source meta, which is the utf8 filename generated.
951# Added a separate method here for convenience
952sub get_source {
953    my $self = shift (@_);
954    return $self->get_metadata_element ($self->get_top_section(), "Source");
955}
956
957# returns the SourceFile meta, which is the url reference to the URL-encoded
958# version of Source (the utf8 filename). Added a separate method here for convenience
959sub get_sourcefile {
960    my $self = shift (@_);
961    return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
962}
963
964# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
965# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
966sub get_assocfile_from_sourcefile {
967    my $self = shift (@_);
968   
969    # get the SourceFile meta, which is a *URL* to a file on the filesystem
970    my $top_section = $self->get_top_section();
971    my $source_file = $self->get_metadata_element($top_section, "SourceFile");
972
973    # get the actual filename as it exists on the filesystem which this url refers to
974    $source_file = &unicode::url_to_filename($source_file);
975    my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
976    return $assocfilename;
977}
978
979# methods for dealing with associated files
980
981# a file is associated with a document, NOT a section.
982# if section is defined it is noted in the data structure
983# only so that files associated from a particular section
984# may be removed later (using delete_section_assoc_files)
985sub associate_file {
986    my $self = shift (@_);
987    my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
988    $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
989
990    # remove all associated files with the same name
991    $self->delete_assoc_file ($assoc_filename);
992   
993    push (@{$self->{'associated_files'}},
994      [$real_filename, $assoc_filename, $mime_type, $section]);
995}
996
997# returns a list of associated files in the form
998#   [[real_filename, assoc_filename, mimetype], ...]
999sub get_assoc_files {
1000    my $self = shift (@_);
1001
1002    return $self->{'associated_files'};
1003}
1004
1005
1006sub metadata_file {
1007    my $self = shift (@_);
1008    my ($real_filename, $filename) = @_;
1009   
1010    push (@{$self->{'metadata_files'}},
1011      [$real_filename, $filename]);
1012}
1013
1014sub get_meta_files {
1015    my $self = shift (@_);
1016
1017    return $self->{'metadata_files'};
1018}
1019
1020sub delete_section_assoc_files {
1021    my $self = shift (@_);
1022    my ($section) = @_;
1023
1024    my $i=0;
1025    while ($i < scalar (@{$self->{'associated_files'}})) {
1026    if (defined $self->{'associated_files'}->[$i]->[3] &&
1027        $self->{'associated_files'}->[$i]->[3] eq $section) {
1028        splice (@{$self->{'associated_files'}}, $i, 1);
1029    } else {
1030        $i++;
1031    }
1032    }
1033}
1034
1035sub delete_assoc_file {
1036    my $self = shift (@_);
1037    my ($assoc_filename) = @_;
1038
1039    my $i=0;
1040    while ($i < scalar (@{$self->{'associated_files'}})) {
1041    if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1042        splice (@{$self->{'associated_files'}}, $i, 1);
1043    } else {
1044        $i++;
1045    }
1046    }
1047}
1048
1049sub reset_nextsection_ptr {
1050    my $self = shift (@_);
1051    my ($section) = @_;
1052   
1053    my $section_ptr = $self->_lookup_section($section);
1054    $section_ptr->{'next_subsection'} = 1;
1055}
1056
10571;
Note: See TracBrowser for help on using the browser.