root/gsdl/trunk/perllib/doc.pm @ 19494

Revision 19494, 28.5 KB (checked in by davidb, 11 years ago)

Supporting routines that exploit the new 'metafiles' structures, introduction to track which metadata.xml file a piece of metadata came from

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr   te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47our $OIDcount = 0;
48
49sub new {
50    my $class = shift (@_);
51    my ($source_filename, $doc_type, $rename_method) = @_;
52
53
54    my $self = bless {'associated_files'=>[],
55              'subsection_order'=>[],
56              'next_subsection'=>1,
57              'subsections'=>{},
58              'metadata'=>[],
59              'text'=>"",
60              'OIDtype'=>"hash"}, $class;
61
62    # used to set lastmodified here, but this can screw up the HASH ids, so
63    # the docsave processor now calls set_lastmodified
64
65    $self->{'source_path'} = $source_filename;
66   
67    if (defined $source_filename) {
68    $source_filename = &util::filename_within_collection($source_filename);
69    print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
70    $self->set_source_filename ($source_filename, $rename_method);
71    }
72
73    $self->set_doc_type ($doc_type) if defined $doc_type;
74
75    return $self;
76}
77# set lastmodified for OAI purposes, added by GRB, moved by kjdon
78sub set_lastmodified {
79    my $self = shift (@_);
80
81    my $source_path = $self->{'source_path'};
82   
83    if (defined $source_path && (-e $source_path)) {
84
85        my $file_stat = stat($source_path);
86    my $mtime = $file_stat->mtime;
87    $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
88    }
89}
90
91# clone the $self object
92sub duplicate {
93    my $self = shift (@_);
94
95    my $newobj = {};
96   
97    foreach my $k (keys %$self) {
98    $newobj->{$k} = &clone ($self->{$k});
99    }
100
101    bless $newobj, ref($self);
102    return $newobj;
103}
104
105sub clone {
106    my ($from) = @_;
107    my $type = ref ($from);
108
109    if ($type eq "HASH") {
110    my $to = {};
111    foreach my $key (keys %$from) {
112        $to->{$key} = &clone ($from->{$key});
113    }
114    return $to;
115    } elsif ($type eq "ARRAY") {
116    my $to = [];
117    foreach my $v (@$from) {
118        push (@$to, &clone ($v));
119    }
120    return $to;
121    } else {
122    return $from;
123    }
124}
125
126sub set_OIDtype {
127    my $self = shift (@_);
128    my ($type, $metadata) = @_;
129
130    if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|dirname|assigned)$/) {
131    $self->{'OIDtype'} = $type;
132    } else {
133    $self->{'OIDtype'} = "hash";
134    }
135
136    if ($type =~ /^assigned$/) {
137    if (defined $metadata) {
138        $self->{'OIDmetadata'} = $metadata;
139    } else {
140        $self->{'OIDmetadata'} = "dc.Identifier";
141    }
142    }
143}
144
145sub set_source_filename {
146    my $self = shift (@_);
147    my ($source_filename, $rename_method) = @_;
148
149    # Since the gsdlsourcefilename element goes into the doc.xml it has
150    # to be utf8. However, it should also *represent* the source filename
151    # (in the import directory) which may not be utf8 at all.
152    # For instance, if this meta element (gsdlsourcefilename) will be used
153    # by other applications that parse doc.xml in order to locate
154    # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
155    # encode the real filename as this is a binary-to-text encoding meaning
156    # that the resulting string is ASCII (utf8). Decoding will give the original.
157   
158#    print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
159
160    # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
161    my ($srcfilename,$dirname,$suffix)
162    = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
163#    print STDERR "-> $srcfilename -> ";
164    $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
165    $source_filename = &util::filename_cat($dirname, $srcfilename);
166#    print STDERR "$source_filename\n";
167   
168    $self->set_utf8_metadata_element ($self->get_top_section(),
169                 "gsdlsourcefilename",
170                 $source_filename);
171}
172
173sub set_converted_filename {
174    my $self = shift (@_);
175    my ($converted_filename) = @_;
176
177    # we know the converted filename is utf8
178    $self->set_utf8_metadata_element ($self->get_top_section(),
179                 "gsdlconvertedfilename",
180                 $converted_filename);
181}
182
183# returns the source_filename as it was provided
184sub get_source_filename {
185    my $self = shift (@_);
186
187    return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
188}
189
190# returns converted filename if available else returns source filename
191sub get_filename_for_hashing {
192    my $self = shift (@_);
193
194    my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
195
196    if (!defined $filename) {
197    my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
198    # if NULPlug processed file, then don't give a filename
199    if (defined $plugin_name && $plugin_name eq "NULPlug") {
200        $filename = undef;
201    } else { # returns the URL encoded source filename!
202        $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
203    }
204    }
205    return $filename;
206}
207
208sub set_doc_type {
209    my $self = shift (@_);
210    my ($doc_type) = @_;
211
212    $self->set_metadata_element ($self->get_top_section(),
213                 "gsdldoctype",
214                 $doc_type);
215}
216
217# returns the gsdldoctype as it was provided
218# the default of "indexed_doc" is used if no document
219# type was provided
220sub get_doc_type {
221    my $self = shift (@_);
222
223    my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
224    return $doc_type if (defined $doc_type);
225    return "indexed_doc";
226}
227
228
229# look up the reference to the a particular section
230sub _lookup_section {
231    my $self = shift (@_);
232    my ($section) = @_;
233
234    my ($num);
235    my $sectionref = $self;
236
237    while (defined $section && $section ne "") {
238   
239    ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
240   
241    $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
242   
243    $section = "" unless defined $section;
244   
245
246    if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
247        $sectionref = $sectionref->{'subsections'}->{$num};
248    } else {
249        return undef;
250    }
251    }
252   
253    return $sectionref;
254}
255
256# calculate OID by hashing the contents of the document
257sub _calc_OID {
258    my $self = shift (@_);
259    my ($filename) = @_;
260
261    my $osexe = &util::get_os_exe();
262
263    my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
264                       $ENV{'GSDLOS'},"hashfile$osexe");
265
266    my $result = "NULL";
267   
268    if (-e "$hashfile_exe") {
269#   $result = `\"$hashfile_exe\" \"$filename\"`;
270    $result = `hashfile$osexe \"$filename\"`;
271    ($result) = $result =~ /:\s*([0-9a-f]+)/i;
272    } else {
273    print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
274    }
275    return "HASH$result";
276}
277
278# methods dealing with OID, not groups of them.
279
280# if $OID is not provided one is calculated
281sub set_OID {
282    my $self = shift (@_);
283    my ($OID) = @_;
284   
285    my $use_hash_oid = 0;
286    # if an OID wasn't provided calculate one
287    if (!defined $OID) {
288    $OID = "NULL";
289    if ($self->{'OIDtype'} =~ /^hash/) {
290        $use_hash_oid = 1;
291    } elsif ($self->{'OIDtype'} eq "incremental") {
292        $OID = "D" . $OIDcount;
293        $OIDcount ++;
294     
295    } elsif ($self->{'OIDtype'} eq "dirname") {
296        $OID = 'J';
297        my $filename = $self->get_source_filename();
298        if (defined($filename)) { # && -e $filename) {
299        $OID = &File::Basename::dirname($filename);
300        if (defined $OID) {
301            $OID = 'J'.&File::Basename::basename($OID);
302            $OID =~ s/\.//; #remove any periods
303        } else {
304            print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
305            $use_hash_oid = 1;
306        }
307        } else {
308        print STDERR "Failed to find filename, generating hash id\n";
309        $use_hash_oid = 1;
310        }   
311       
312    } elsif ($self->{'OIDtype'} eq "assigned") {
313        my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
314        if (defined $identifier && $identifier ne "") {
315        $OID = $identifier;
316        if ($OID =~ /\./) {
317            print STDERR "Warning, assigned identifier $identifier contains periods (.), removing them\n";
318            $OID =~ s/\.//g; #remove any periods
319        }
320        if ($OID =~ /^[\d]*$/) {
321            print STDERR "Warning, assigned identifier $identifier contains only digits. Prepending 'D'.\n";
322            $OID = "D" . $OID;
323        }       
324        } else {
325        # need a hash id
326        print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
327        $use_hash_oid = 1;
328        }
329               
330    } else {
331        $use_hash_oid = 1;
332    }
333
334    if ($use_hash_oid) {
335        my $hash_on_file = 1;
336        if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
337        $hash_on_file = 0;
338        }
339        if ($hash_on_file) {
340        # "hash" OID - feed file to hashfile.exe
341        my $filename = $self->get_filename_for_hashing();
342        # -z: don't want to hash on the file if it is zero size
343        if (defined($filename) && -e $filename && !-z $filename) {
344            $OID = $self->_calc_OID ($filename);
345        } else {
346            $hash_on_file = 0;
347        }
348        }
349        if (!$hash_on_file) {
350        my $filename = &util::get_tmp_filename();
351        if (!open (OUTFILE, ">$filename")) {
352            print STDERR "doc::set_OID could not write to $filename\n";
353        } else {
354            my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
355            print OUTFILE $doc_text;
356            close (OUTFILE);
357        }
358        $OID = $self->_calc_OID ($filename);
359        &util::rm ($filename);
360        }
361    }
362    }
363    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
364}
365
366# this uses hashdoc (embedded c thingy) which is faster but still
367# needs a little work to be suffiently stable
368sub ___set_OID {
369    my $self = shift (@_);
370    my ($OID) = @_;
371
372    # if an OID wasn't provided then calculate hash value based on document
373    if (!defined $OID)
374    {
375    my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
376    my $hash_len = length($hash_text);
377
378        $OID = &hashdoc::buffer($hash_text,$hash_len);
379    }
380
381    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
382}
383
384# returns the OID for this document
385sub get_OID {
386    my $self = shift (@_);
387    my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
388    return $OID if (defined $OID);
389    return "NULL";
390}
391
392sub delete_OID {
393    my $self = shift (@_);
394   
395    $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
396}
397
398
399# methods for manipulating section names
400
401# returns the name of the top-most section (the top
402# level of the document
403sub get_top_section {
404    my $self = shift (@_);
405   
406    return "";
407}
408
409# returns a section
410sub get_parent_section {
411    my $self = shift (@_);
412    my ($section) = @_;
413
414    $section =~ s/(^|\.)\d+$//;
415
416    return $section;
417}
418
419# returns the first child section (or the end child
420# if there isn't any)
421sub get_begin_child {
422    my $self = shift (@_);
423    my ($section) = @_;
424
425    my $section_ptr = $self->_lookup_section($section);
426    return "" unless defined $section_ptr;
427
428    if (defined $section_ptr->{'subsection_order'}->[0]) {
429    return "$section.$section_ptr->{'subsection_order'}->[0]";
430    }
431
432    return $self->get_end_child ($section);
433}
434
435# returns the next child of a parent section
436sub get_next_child {
437    my $self = shift (@_);
438    my ($section) = @_;
439   
440    my $parent_section = $self->get_parent_section($section);
441    my $parent_section_ptr = $self->_lookup_section($parent_section);
442    return undef unless defined $parent_section_ptr;
443
444    my ($section_num) = $section =~ /(\d+)$/;
445    return undef unless defined $section_num;
446
447    my $i = 0;
448    my $section_order = $parent_section_ptr->{'subsection_order'};
449    while ($i < scalar(@$section_order)) {
450    last if $section_order->[$i] eq $section_num;
451    $i++;
452    }
453
454    $i++; # the next child
455    if ($i < scalar(@$section_order)) {
456    return $section_order->[$i] if $parent_section eq "";
457    return "$parent_section.$section_order->[$i]";
458    }
459
460    # no more sections in this level
461    return undef;
462}
463
464# returns a reference to a list of children
465sub get_children {
466    my $self = shift (@_);
467    my ($section) = @_;
468
469    my $section_ptr = $self->_lookup_section($section);
470    return [] unless defined $section_ptr;
471
472    my @children = @{$section_ptr->{'subsection_order'}};
473
474    map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
475    return \@children;
476}
477
478# returns the child section one past the last one (which
479# is coded as "0")
480sub get_end_child {
481    my $self = shift (@_);
482    my ($section) = @_;
483
484    return $section . ".0" unless $section eq "";
485    return "0";
486}
487
488# returns the next section in book order
489sub get_next_section {
490    my $self = shift (@_);
491    my ($section) = @_;
492
493    return undef unless defined $section;
494
495    my $section_ptr = $self->_lookup_section($section);
496    return undef unless defined $section_ptr;
497
498    # first try to find first child
499    if (defined $section_ptr->{'subsection_order'}->[0]) {
500    return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
501    return "$section.$section_ptr->{'subsection_order'}->[0]";
502    }
503
504    do {
505    # try to find sibling
506    my $next_child = $self->get_next_child ($section);
507    return $next_child if (defined $next_child);
508
509    # move up one level
510    $section = $self->get_parent_section ($section);
511    } while $section =~ /\d/;
512
513    return undef;
514}
515
516sub is_leaf_section {
517    my $self = shift (@_);
518    my ($section) = @_;
519
520    my $section_ptr = $self->_lookup_section($section);
521    return 1 unless defined $section_ptr;
522
523    return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
524}
525
526# methods for dealing with sections
527
528# returns the name of the inserted section
529sub insert_section {
530    my $self = shift (@_);
531    my ($before_section) = @_;
532
533    # get the child to insert before and its parent section
534    my $parent_section = "";
535    my $before_child = "0";
536    my @before_section = split (/\./, $before_section);
537    if (scalar(@before_section) > 0) {
538    $before_child = pop (@before_section);
539    $parent_section = join (".", @before_section);
540    }
541
542    my $parent_section_ptr = $self->_lookup_section($parent_section);
543    if (!defined $parent_section_ptr) {
544    print STDERR "doc::insert_section couldn't find parent section " .
545        "$parent_section\n";
546    return;
547    }
548
549    # get the next section number
550    my $section_num = $parent_section_ptr->{'next_subsection'}++;
551
552    my $i = 0;
553    while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
554       $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
555    $i++;
556    }
557   
558    # insert the section number into the order list
559    splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
560
561    # add this section to the parent section
562    my $section_ptr = {'subsection_order'=>[],
563               'next_subsection'=>1,
564               'subsections'=>{},
565               'metadata'=>[],
566               'text'=>""};
567    $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
568
569    # work out the full section number
570    my $section = $parent_section;
571    $section .= "." unless $section eq "";
572    $section .= $section_num;
573   
574    return $section;
575}
576
577# creates a pre-named section
578sub create_named_section {
579    my $self = shift (@_);
580    my ($mastersection) = @_;
581
582    my ($num);
583    my $section = $mastersection;
584    my $sectionref = $self;
585
586    while ($section ne "") {
587    ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
588    $num =~ s/^0+(\d)/$1/; # remove leading 0s
589    $section = "" unless defined $section;
590   
591    if (defined $num) {
592        if (!defined $sectionref->{'subsections'}->{$num}) {
593        push (@{$sectionref->{'subsection_order'}}, $num);
594        $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
595                            'next_subsection'=>1,
596                            'subsections'=>{},
597                            'metadata'=>[],
598                            'text'=>""};
599        if ($num >= $sectionref->{'next_subsection'}) {
600            $sectionref->{'next_subsection'} = $num + 1;
601        }
602        }
603        $sectionref = $sectionref->{'subsections'}->{$num};
604
605    } else {
606        print STDERR "doc::create_named_section couldn't create section ";
607        print STDERR "$mastersection\n";
608        last;
609    }
610    }
611}
612
613# returns a reference to a list of subsections
614sub list_subsections {
615    my $self = shift (@_);
616    my ($section) = @_;
617
618    my $section_ptr = $self->_lookup_section ($section);
619    if (!defined $section_ptr) {
620    print STDERR "doc::list_subsections couldn't find section $section\n";
621    return [];
622    }
623
624    return [@{$section_ptr->{'subsection_order'}}];
625}
626
627sub delete_section {
628    my $self = shift (@_);
629    my ($section) = @_;
630
631#    my $section_ptr = {'subsection_order'=>[],
632#              'next_subsection'=>1,
633#              'subsections'=>{},
634#              'metadata'=>[],
635#              'text'=>""};
636
637    # if this is the top section reset everything
638    if ($section eq "") {
639    $self->{'subsection_order'} = [];
640    $self->{'subsections'} = {};
641    $self->{'metadata'} = [];
642    $self->{'text'} = "";
643    return;
644    }
645
646    # find the parent of the section to delete
647    my $parent_section = "";
648    my $child = "0";
649    my @section = split (/\./, $section);
650    if (scalar(@section) > 0) {
651    $child = pop (@section);
652    $parent_section = join (".", @section);
653    }
654
655    my $parent_section_ptr = $self->_lookup_section($parent_section);
656    if (!defined $parent_section_ptr) {
657    print STDERR "doc::delete_section couldn't find parent section " .
658        "$parent_section\n";
659    return;
660    }
661
662    # remove this section from the subsection_order list
663    my $i = 0;
664    while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
665    if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
666        splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
667        last;
668    }
669    $i++;
670    }
671
672    # remove this section from the subsection hash
673    if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
674    undef $parent_section_ptr->{'subsections'}->{$child};
675    }
676}
677
678#--
679# methods for dealing with metadata
680
681# set_metadata_element and get_metadata_element are for metadata
682# which should only have one value. add_meta_data and get_metadata
683# are for metadata which can have more than one value.
684
685# returns the first metadata value which matches field
686
687# This version of get metadata element works much like the one above,
688# except it allows for the namespace portion of a metadata element to
689# be ignored, thus if you are searching for dc.Title, the first piece
690# of matching metadata ending with the name Title (once any namespace
691# is removed) would be returned.
692# 28-11-2003 John Thompson
693sub get_metadata_element {
694    my $self = shift (@_);
695    my ($section, $field, $ignore_namespace) = @_;
696    my ($data);
697
698    $ignore_namespace = 0 unless defined $ignore_namespace;
699
700    my $section_ptr = $self->_lookup_section($section);
701    if (!defined $section_ptr) {
702    print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
703    return;
704    }
705
706    # Remove the any namespace if we are being told to ignore them
707    if($ignore_namespace) {
708    $field =~ s/^\w*\.//;
709    }
710
711    foreach $data (@{$section_ptr->{'metadata'}}) {
712
713    my $data_name = $data->[0];
714
715    # Remove the any namespace if we are being told to ignore them
716    if($ignore_namespace) {
717        $data_name =~ s/^\w*\.//;
718    }
719   
720    return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
721    }
722   
723    return undef; # was not found
724}
725
726# returns a list of the form [value1, value2, ...]
727sub get_metadata {
728    my $self = shift (@_);
729    my ($section, $field, $ignore_namespace) = @_;
730    my ($data);
731
732    $ignore_namespace = 0 unless defined $ignore_namespace;
733
734    my $section_ptr = $self->_lookup_section($section);
735    if (!defined $section_ptr) {
736        print STDERR "doc::get_metadata couldn't find section ",
737        $section, "\n";
738        return;
739    }
740
741    # Remove the any namespace if we are being told to ignore them
742    if($ignore_namespace) {
743    $field =~ s/^\w*\.//;
744    }
745
746    my @metadata = ();
747    foreach $data (@{$section_ptr->{'metadata'}}) {
748
749    my $data_name = $data->[0];
750    # Remove the any namespace if we are being told to ignore them
751    if($ignore_namespace) {
752        $data_name =~ s/^\w*\.//;
753    }
754
755        push (@metadata, $data->[1]) if ($data_name eq $field);
756    }
757
758    return \@metadata;
759}
760
761# returns a list of the form [[field,value],[field,value],...]
762sub get_all_metadata {
763    my $self = shift (@_);
764    my ($section) = @_;
765
766    my $section_ptr = $self->_lookup_section($section);
767    if (!defined $section_ptr) {
768    print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
769    return;
770    }
771   
772    return $section_ptr->{'metadata'};
773}
774
775# $value is optional
776sub delete_metadata {
777    my $self = shift (@_);
778    my ($section, $field, $value) = @_;
779
780    my $section_ptr = $self->_lookup_section($section);
781    if (!defined $section_ptr) {
782    print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
783    return;
784    }
785
786    my $i = 0;
787    while ($i < scalar (@{$section_ptr->{'metadata'}})) {
788    if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
789        (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
790        splice (@{$section_ptr->{'metadata'}}, $i, 1);
791    } else {
792        $i++;
793    }
794    }
795}
796
797sub delete_all_metadata {
798    my $self = shift (@_);
799    my ($section) = @_;
800
801    my $section_ptr = $self->_lookup_section($section);
802    if (!defined $section_ptr) {
803    print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
804    return;
805    }
806   
807    $section_ptr->{'metadata'} = [];
808}
809
810sub set_metadata_element {
811    my $self = shift (@_);
812    my ($section, $field, $value) = @_;
813
814    $self->set_utf8_metadata_element ($section, $field,
815                      &unicode::ascii2utf8(\$value));
816}
817
818# set_utf8_metadata_element assumes the text has already been
819# converted to the UTF-8 encoding.
820sub set_utf8_metadata_element {
821    my $self = shift (@_);
822    my ($section, $field, $value) = @_;
823
824    $self->delete_metadata ($section, $field);
825    $self->add_utf8_metadata ($section, $field, $value);
826}
827
828
829# add_metadata assumes the text is in (extended) ascii form. For
830# text which has already been converted to the UTF-8 format use
831# add_utf8_metadata.
832sub add_metadata {
833    my $self = shift (@_);
834    my ($section, $field, $value) = @_;
835
836    $self->add_utf8_metadata ($section, $field,
837                  &unicode::ascii2utf8(\$value));
838}
839
840sub add_utf8_metadata {
841    my $self = shift (@_);
842    my ($section, $field, $value) = @_;
843
844    my $section_ptr = $self->_lookup_section($section);
845    if (!defined $section_ptr) {
846    print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
847    return;
848    }
849    if (!defined $value) {
850    print STDERR "doc::add_utf8_metadata undefined value for $field\n";
851    return;
852    }
853    if (!defined $field) {
854    print STDERR "doc::add_utf8_metadata undefined metadata type \n";
855    return;
856    }
857   
858    #print STDERR "###$field=$value\n";
859    # double check that the value is utf-8
860    if (!&unicode::check_is_utf8($value)) {
861    print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
862    &unicode::ensure_utf8(\$value);
863    print STDERR " Tried converting to utf8: $value\n";
864    }
865
866    push (@{$section_ptr->{'metadata'}}, [$field, $value]);
867}
868
869
870# methods for dealing with text
871
872# returns the text for a section
873sub get_text {
874    my $self = shift (@_);
875    my ($section) = @_;
876
877    my $section_ptr = $self->_lookup_section($section);
878    if (!defined $section_ptr) {
879    print STDERR "doc::get_text couldn't find section " .
880        "$section\n";
881    return "";
882    }
883
884    return $section_ptr->{'text'};
885}
886
887# returns the (utf-8 encoded) length of the text for a section
888sub get_text_length {
889    my $self = shift (@_);
890    my ($section) = @_;
891
892    my $section_ptr = $self->_lookup_section($section);
893    if (!defined $section_ptr) {
894    print STDERR "doc::get_text_length couldn't find section " .
895        "$section\n";
896    return 0;
897    }
898
899    return length ($section_ptr->{'text'});
900}
901
902sub delete_text {
903    my $self = shift (@_);
904    my ($section) = @_;
905
906    my $section_ptr = $self->_lookup_section($section);
907    if (!defined $section_ptr) {
908    print STDERR "doc::delete_text couldn't find section " .
909        "$section\n";
910    return;
911    }
912
913    $section_ptr->{'text'} = "";
914}
915
916# add_text assumes the text is in (extended) ascii form. For
917# text which has been already converted to the UTF-8 format
918# use add_utf8_text.
919sub add_text {
920    my $self = shift (@_);
921    my ($section, $text) = @_;
922
923    # convert the text to UTF-8 encoded unicode characters
924    # and add the text
925    $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
926}
927
928
929# add_utf8_text assumes the text to be added has already
930# been converted to the UTF-8 encoding. For ascii text use
931# add_text
932sub add_utf8_text {
933    my $self = shift (@_);
934    my ($section, $text) = @_;
935
936    my $section_ptr = $self->_lookup_section($section);
937    if (!defined $section_ptr) {
938    print STDERR "doc::add_utf8_text couldn't find section " .
939        "$section\n";
940    return;
941    }
942
943    $section_ptr->{'text'} .= $text;
944}
945
946# returns the Source meta, which is the utf8 filename generated.
947# Added a separate method here for convenience
948sub get_source {
949    my $self = shift (@_);
950    return $self->get_metadata_element ($self->get_top_section(), "Source");
951}
952
953# returns the SourceFile meta, which is the url reference to the URL-encoded
954# version of Source (the utf8 filename). Added a separate method here for convenience
955sub get_sourcefile {
956    my $self = shift (@_);
957    return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
958}
959
960# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
961# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
962sub get_assocfile_from_sourcefile {
963    my $self = shift (@_);
964   
965    # get the SourceFile meta, which is a *URL* to a file on the filesystem
966    my $top_section = $self->get_top_section();
967    my $source_file = $self->get_metadata_element($top_section, "SourceFile");
968
969    # get the actual filename as it exists on the filesystem which this url refers to
970    $source_file = &unicode::url_to_filename($source_file);
971    my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
972    return $assocfilename;
973}
974
975# methods for dealing with associated files
976
977# a file is associated with a document, NOT a section.
978# if section is defined it is noted in the data structure
979# only so that files associated from a particular section
980# may be removed later (using delete_section_assoc_files)
981sub associate_file {
982    my $self = shift (@_);
983    my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
984    $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
985
986    # remove all associated files with the same name
987    $self->delete_assoc_file ($assoc_filename);
988   
989    push (@{$self->{'associated_files'}},
990      [$real_filename, $assoc_filename, $mime_type, $section]);
991}
992
993# returns a list of associated files in the form
994#   [[real_filename, assoc_filename, mimetype], ...]
995sub get_assoc_files {
996    my $self = shift (@_);
997
998    return $self->{'associated_files'};
999}
1000
1001
1002sub metadata_file {
1003    my $self = shift (@_);
1004    my ($real_filename, $filename) = @_;
1005   
1006    push (@{$self->{'metadata_files'}},
1007      [$real_filename, $filename]);
1008}
1009
1010sub get_meta_files {
1011    my $self = shift (@_);
1012
1013    return $self->{'metadata_files'};
1014}
1015
1016sub delete_section_assoc_files {
1017    my $self = shift (@_);
1018    my ($section) = @_;
1019
1020    my $i=0;
1021    while ($i < scalar (@{$self->{'associated_files'}})) {
1022    if (defined $self->{'associated_files'}->[$i]->[3] &&
1023        $self->{'associated_files'}->[$i]->[3] eq $section) {
1024        splice (@{$self->{'associated_files'}}, $i, 1);
1025    } else {
1026        $i++;
1027    }
1028    }
1029}
1030
1031sub delete_assoc_file {
1032    my $self = shift (@_);
1033    my ($assoc_filename) = @_;
1034
1035    my $i=0;
1036    while ($i < scalar (@{$self->{'associated_files'}})) {
1037    if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1038        splice (@{$self->{'associated_files'}}, $i, 1);
1039    } else {
1040        $i++;
1041    }
1042    }
1043}
1044
1045sub reset_nextsection_ptr {
1046    my $self = shift (@_);
1047    my ($section) = @_;
1048   
1049    my $section_ptr = $self->_lookup_section($section);
1050    $section_ptr->{'next_subsection'} = 1;
1051}
1052
10531;
Note: See TracBrowser for help on using the browser.