root/gsdl/trunk/perllib/doc.pm @ 20775

Revision 20775, 29.0 KB (checked in by kjdon, 10 years ago)

we need to store a list of original extra associated files, for incremental building. this is different from the resulting associated files for doc.xml. It will be the list of all files in import that comprised the document. added assocaite_source_file and get_source_assoc_files methods to set and get these files

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr   te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47our $OIDcount = 0;
48
49# rename_method can be 'url', 'none', 'base64'
50sub new {
51    my $class = shift (@_);
52    my ($source_filename, $doc_type, $rename_method) = @_;
53
54
55    my $self = bless {'associated_files'=>[],
56              'subsection_order'=>[],
57              'next_subsection'=>1,
58              'subsections'=>{},
59              'metadata'=>[],
60              'text'=>"",
61              'OIDtype'=>"hash"}, $class;
62
63    # used to set lastmodified here, but this can screw up the HASH ids, so
64    # the docsave processor now calls set_lastmodified
65
66    $self->{'source_path'} = $source_filename;
67   
68    if (defined $source_filename) {
69    $source_filename = &util::filename_within_collection($source_filename);
70    print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
71    $self->set_source_filename ($source_filename, $rename_method);
72    }
73
74    $self->set_doc_type ($doc_type) if defined $doc_type;
75
76    return $self;
77}
78# set lastmodified for OAI purposes, added by GRB, moved by kjdon
79sub set_lastmodified {
80    my $self = shift (@_);
81
82    my $source_path = $self->{'source_path'};
83   
84    if (defined $source_path && (-e $source_path)) {
85
86        my $file_stat = stat($source_path);
87    my $mtime = $file_stat->mtime;
88    $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
89    }
90}
91
92# clone the $self object
93sub duplicate {
94    my $self = shift (@_);
95
96    my $newobj = {};
97   
98    foreach my $k (keys %$self) {
99    $newobj->{$k} = &clone ($self->{$k});
100    }
101
102    bless $newobj, ref($self);
103    return $newobj;
104}
105
106sub clone {
107    my ($from) = @_;
108    my $type = ref ($from);
109
110    if ($type eq "HASH") {
111    my $to = {};
112    foreach my $key (keys %$from) {
113        $to->{$key} = &clone ($from->{$key});
114    }
115    return $to;
116    } elsif ($type eq "ARRAY") {
117    my $to = [];
118    foreach my $v (@$from) {
119        push (@$to, &clone ($v));
120    }
121    return $to;
122    } else {
123    return $from;
124    }
125}
126
127sub set_OIDtype {
128    my $self = shift (@_);
129    my ($type, $metadata) = @_;
130
131    if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|dirname|assigned)$/) {
132    $self->{'OIDtype'} = $type;
133    } else {
134    $self->{'OIDtype'} = "hash";
135    }
136
137    if ($type =~ /^assigned$/) {
138    if (defined $metadata) {
139        $self->{'OIDmetadata'} = $metadata;
140    } else {
141        $self->{'OIDmetadata'} = "dc.Identifier";
142    }
143    }
144}
145
146# rename_method can be 'url', 'none', 'base64'
147sub set_source_filename {
148    my $self = shift (@_);
149    my ($source_filename, $rename_method) = @_;
150
151    # Since the gsdlsourcefilename element goes into the doc.xml it has
152    # to be utf8. However, it should also *represent* the source filename
153    # (in the import directory) which may not be utf8 at all.
154    # For instance, if this meta element (gsdlsourcefilename) will be used
155    # by other applications that parse doc.xml in order to locate
156    # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
157    # encode the real filename as this is a binary-to-text encoding meaning
158    # that the resulting string is ASCII (utf8). Decoding will give the original.
159   
160#    print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
161
162    # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
163    my ($srcfilename,$dirname,$suffix)
164    = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
165#    print STDERR "-> $srcfilename -> ";
166    $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
167    $source_filename = &util::filename_cat($dirname, $srcfilename);
168#    print STDERR "$source_filename\n";
169   
170    $self->set_utf8_metadata_element ($self->get_top_section(),
171                 "gsdlsourcefilename",
172                 $source_filename);
173}
174
175sub set_converted_filename {
176    my $self = shift (@_);
177    my ($converted_filename) = @_;
178
179    # we know the converted filename is utf8
180    $self->set_utf8_metadata_element ($self->get_top_section(),
181                 "gsdlconvertedfilename",
182                 $converted_filename);
183}
184
185# returns the source_filename as it was provided
186sub get_unmodified_source_filename {
187    my $self = shift (@_);
188
189    return $self->{'source_path'};
190}
191
192# returns the source_filename with whatever rename_method was given
193sub get_source_filename {
194    my $self = shift (@_);
195
196    return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
197}
198
199
200
201# returns converted filename if available else returns source filename
202sub get_filename_for_hashing {
203    my $self = shift (@_);
204
205    my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
206
207    if (!defined $filename) {
208    my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
209    # if NULPlug processed file, then don't give a filename
210    if (defined $plugin_name && $plugin_name eq "NULPlug") {
211        $filename = undef;
212    } else { # returns the URL encoded source filename!
213        $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
214    }
215    }
216    return $filename;
217}
218
219sub set_doc_type {
220    my $self = shift (@_);
221    my ($doc_type) = @_;
222
223    $self->set_metadata_element ($self->get_top_section(),
224                 "gsdldoctype",
225                 $doc_type);
226}
227
228# returns the gsdldoctype as it was provided
229# the default of "indexed_doc" is used if no document
230# type was provided
231sub get_doc_type {
232    my $self = shift (@_);
233
234    my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
235    return $doc_type if (defined $doc_type);
236    return "indexed_doc";
237}
238
239
240# look up the reference to the a particular section
241sub _lookup_section {
242    my $self = shift (@_);
243    my ($section) = @_;
244
245    my ($num);
246    my $sectionref = $self;
247
248    while (defined $section && $section ne "") {
249   
250    ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
251   
252    $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
253   
254    $section = "" unless defined $section;
255   
256
257    if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
258        $sectionref = $sectionref->{'subsections'}->{$num};
259    } else {
260        return undef;
261    }
262    }
263   
264    return $sectionref;
265}
266
267# calculate OID by hashing the contents of the document
268sub _calc_OID {
269    my $self = shift (@_);
270    my ($filename) = @_;
271
272    my $osexe = &util::get_os_exe();
273
274    my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
275                       $ENV{'GSDLOS'},"hashfile$osexe");
276
277    my $result = "NULL";
278   
279    if (-e "$hashfile_exe") {
280#   $result = `\"$hashfile_exe\" \"$filename\"`;
281    $result = `hashfile$osexe \"$filename\"`;
282    ($result) = $result =~ /:\s*([0-9a-f]+)/i;
283    } else {
284    print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
285    }
286    return "HASH$result";
287}
288
289# methods dealing with OID, not groups of them.
290
291# if $OID is not provided one is calculated
292sub set_OID {
293    my $self = shift (@_);
294    my ($OID) = @_;
295   
296    my $use_hash_oid = 0;
297    # if an OID wasn't provided calculate one
298    if (!defined $OID) {
299    $OID = "NULL";
300    if ($self->{'OIDtype'} =~ /^hash/) {
301        $use_hash_oid = 1;
302    } elsif ($self->{'OIDtype'} eq "incremental") {
303        $OID = "D" . $OIDcount;
304        $OIDcount ++;
305     
306    } elsif ($self->{'OIDtype'} eq "dirname") {
307        $OID = 'J';
308        my $filename = $self->get_source_filename();
309        if (defined($filename)) { # && -e $filename) {
310        $OID = &File::Basename::dirname($filename);
311        if (defined $OID) {
312            $OID = 'J'.&File::Basename::basename($OID);
313            $OID = &util::tidy_up_oid($OID);
314        } else {
315            print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
316            $use_hash_oid = 1;
317        }
318        } else {
319        print STDERR "Failed to find filename, generating hash id\n";
320        $use_hash_oid = 1;
321        }   
322       
323    } elsif ($self->{'OIDtype'} eq "assigned") {
324        my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
325        if (defined $identifier && $identifier ne "") {
326        $OID = $identifier;
327        $OID = &util::tidy_up_oid($OID);
328        } else {
329        # need a hash id
330        print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
331        $use_hash_oid = 1;
332        }
333               
334    } else {
335        $use_hash_oid = 1;
336    }
337
338    if ($use_hash_oid) {
339        my $hash_on_file = 1;
340        if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
341        $hash_on_file = 0;
342        }
343        if ($hash_on_file) {
344        # "hash" OID - feed file to hashfile.exe
345        my $filename = $self->get_filename_for_hashing();
346        # -z: don't want to hash on the file if it is zero size
347        if (defined($filename) && -e $filename && !-z $filename) {
348            $OID = $self->_calc_OID ($filename);
349        } else {
350            $hash_on_file = 0;
351        }
352        }
353        if (!$hash_on_file) {
354        my $filename = &util::get_tmp_filename();
355        if (!open (OUTFILE, ">$filename")) {
356            print STDERR "doc::set_OID could not write to $filename\n";
357        } else {
358            my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
359            print OUTFILE $doc_text;
360            close (OUTFILE);
361        }
362        $OID = $self->_calc_OID ($filename);
363        &util::rm ($filename);
364        }
365    }
366    }
367    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
368}
369
370# this uses hashdoc (embedded c thingy) which is faster but still
371# needs a little work to be suffiently stable
372sub ___set_OID {
373    my $self = shift (@_);
374    my ($OID) = @_;
375
376    # if an OID wasn't provided then calculate hash value based on document
377    if (!defined $OID)
378    {
379    my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
380    my $hash_len = length($hash_text);
381
382        $OID = &hashdoc::buffer($hash_text,$hash_len);
383    }
384
385    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
386}
387
388# returns the OID for this document
389sub get_OID {
390    my $self = shift (@_);
391    my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
392    return $OID if (defined $OID);
393    return "NULL";
394}
395
396sub delete_OID {
397    my $self = shift (@_);
398   
399    $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
400}
401
402
403# methods for manipulating section names
404
405# returns the name of the top-most section (the top
406# level of the document
407sub get_top_section {
408    my $self = shift (@_);
409   
410    return "";
411}
412
413# returns a section
414sub get_parent_section {
415    my $self = shift (@_);
416    my ($section) = @_;
417
418    $section =~ s/(^|\.)\d+$//;
419
420    return $section;
421}
422
423# returns the first child section (or the end child
424# if there isn't any)
425sub get_begin_child {
426    my $self = shift (@_);
427    my ($section) = @_;
428
429    my $section_ptr = $self->_lookup_section($section);
430    return "" unless defined $section_ptr;
431
432    if (defined $section_ptr->{'subsection_order'}->[0]) {
433    return "$section.$section_ptr->{'subsection_order'}->[0]";
434    }
435
436    return $self->get_end_child ($section);
437}
438
439# returns the next child of a parent section
440sub get_next_child {
441    my $self = shift (@_);
442    my ($section) = @_;
443   
444    my $parent_section = $self->get_parent_section($section);
445    my $parent_section_ptr = $self->_lookup_section($parent_section);
446    return undef unless defined $parent_section_ptr;
447
448    my ($section_num) = $section =~ /(\d+)$/;
449    return undef unless defined $section_num;
450
451    my $i = 0;
452    my $section_order = $parent_section_ptr->{'subsection_order'};
453    while ($i < scalar(@$section_order)) {
454    last if $section_order->[$i] eq $section_num;
455    $i++;
456    }
457
458    $i++; # the next child
459    if ($i < scalar(@$section_order)) {
460    return $section_order->[$i] if $parent_section eq "";
461    return "$parent_section.$section_order->[$i]";
462    }
463
464    # no more sections in this level
465    return undef;
466}
467
468# returns a reference to a list of children
469sub get_children {
470    my $self = shift (@_);
471    my ($section) = @_;
472
473    my $section_ptr = $self->_lookup_section($section);
474    return [] unless defined $section_ptr;
475
476    my @children = @{$section_ptr->{'subsection_order'}};
477
478    map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
479    return \@children;
480}
481
482# returns the child section one past the last one (which
483# is coded as "0")
484sub get_end_child {
485    my $self = shift (@_);
486    my ($section) = @_;
487
488    return $section . ".0" unless $section eq "";
489    return "0";
490}
491
492# returns the next section in book order
493sub get_next_section {
494    my $self = shift (@_);
495    my ($section) = @_;
496
497    return undef unless defined $section;
498
499    my $section_ptr = $self->_lookup_section($section);
500    return undef unless defined $section_ptr;
501
502    # first try to find first child
503    if (defined $section_ptr->{'subsection_order'}->[0]) {
504    return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
505    return "$section.$section_ptr->{'subsection_order'}->[0]";
506    }
507
508    do {
509    # try to find sibling
510    my $next_child = $self->get_next_child ($section);
511    return $next_child if (defined $next_child);
512
513    # move up one level
514    $section = $self->get_parent_section ($section);
515    } while $section =~ /\d/;
516
517    return undef;
518}
519
520sub is_leaf_section {
521    my $self = shift (@_);
522    my ($section) = @_;
523
524    my $section_ptr = $self->_lookup_section($section);
525    return 1 unless defined $section_ptr;
526
527    return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
528}
529
530# methods for dealing with sections
531
532# returns the name of the inserted section
533sub insert_section {
534    my $self = shift (@_);
535    my ($before_section) = @_;
536
537    # get the child to insert before and its parent section
538    my $parent_section = "";
539    my $before_child = "0";
540    my @before_section = split (/\./, $before_section);
541    if (scalar(@before_section) > 0) {
542    $before_child = pop (@before_section);
543    $parent_section = join (".", @before_section);
544    }
545
546    my $parent_section_ptr = $self->_lookup_section($parent_section);
547    if (!defined $parent_section_ptr) {
548    print STDERR "doc::insert_section couldn't find parent section " .
549        "$parent_section\n";
550    return;
551    }
552
553    # get the next section number
554    my $section_num = $parent_section_ptr->{'next_subsection'}++;
555
556    my $i = 0;
557    while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
558       $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
559    $i++;
560    }
561   
562    # insert the section number into the order list
563    splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
564
565    # add this section to the parent section
566    my $section_ptr = {'subsection_order'=>[],
567               'next_subsection'=>1,
568               'subsections'=>{},
569               'metadata'=>[],
570               'text'=>""};
571    $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
572
573    # work out the full section number
574    my $section = $parent_section;
575    $section .= "." unless $section eq "";
576    $section .= $section_num;
577   
578    return $section;
579}
580
581# creates a pre-named section
582sub create_named_section {
583    my $self = shift (@_);
584    my ($mastersection) = @_;
585
586    my ($num);
587    my $section = $mastersection;
588    my $sectionref = $self;
589
590    while ($section ne "") {
591    ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
592    $num =~ s/^0+(\d)/$1/; # remove leading 0s
593    $section = "" unless defined $section;
594   
595    if (defined $num) {
596        if (!defined $sectionref->{'subsections'}->{$num}) {
597        push (@{$sectionref->{'subsection_order'}}, $num);
598        $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
599                            'next_subsection'=>1,
600                            'subsections'=>{},
601                            'metadata'=>[],
602                            'text'=>""};
603        if ($num >= $sectionref->{'next_subsection'}) {
604            $sectionref->{'next_subsection'} = $num + 1;
605        }
606        }
607        $sectionref = $sectionref->{'subsections'}->{$num};
608
609    } else {
610        print STDERR "doc::create_named_section couldn't create section ";
611        print STDERR "$mastersection\n";
612        last;
613    }
614    }
615}
616
617# returns a reference to a list of subsections
618sub list_subsections {
619    my $self = shift (@_);
620    my ($section) = @_;
621
622    my $section_ptr = $self->_lookup_section ($section);
623    if (!defined $section_ptr) {
624    print STDERR "doc::list_subsections couldn't find section $section\n";
625    return [];
626    }
627
628    return [@{$section_ptr->{'subsection_order'}}];
629}
630
631sub delete_section {
632    my $self = shift (@_);
633    my ($section) = @_;
634
635#    my $section_ptr = {'subsection_order'=>[],
636#              'next_subsection'=>1,
637#              'subsections'=>{},
638#              'metadata'=>[],
639#              'text'=>""};
640
641    # if this is the top section reset everything
642    if ($section eq "") {
643    $self->{'subsection_order'} = [];
644    $self->{'subsections'} = {};
645    $self->{'metadata'} = [];
646    $self->{'text'} = "";
647    return;
648    }
649
650    # find the parent of the section to delete
651    my $parent_section = "";
652    my $child = "0";
653    my @section = split (/\./, $section);
654    if (scalar(@section) > 0) {
655    $child = pop (@section);
656    $parent_section = join (".", @section);
657    }
658
659    my $parent_section_ptr = $self->_lookup_section($parent_section);
660    if (!defined $parent_section_ptr) {
661    print STDERR "doc::delete_section couldn't find parent section " .
662        "$parent_section\n";
663    return;
664    }
665
666    # remove this section from the subsection_order list
667    my $i = 0;
668    while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
669    if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
670        splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
671        last;
672    }
673    $i++;
674    }
675
676    # remove this section from the subsection hash
677    if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
678    undef $parent_section_ptr->{'subsections'}->{$child};
679    }
680}
681
682#--
683# methods for dealing with metadata
684
685# set_metadata_element and get_metadata_element are for metadata
686# which should only have one value. add_meta_data and get_metadata
687# are for metadata which can have more than one value.
688
689# returns the first metadata value which matches field
690
691# This version of get metadata element works much like the one above,
692# except it allows for the namespace portion of a metadata element to
693# be ignored, thus if you are searching for dc.Title, the first piece
694# of matching metadata ending with the name Title (once any namespace
695# is removed) would be returned.
696# 28-11-2003 John Thompson
697sub get_metadata_element {
698    my $self = shift (@_);
699    my ($section, $field, $ignore_namespace) = @_;
700    my ($data);
701
702    $ignore_namespace = 0 unless defined $ignore_namespace;
703
704    my $section_ptr = $self->_lookup_section($section);
705    if (!defined $section_ptr) {
706    print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
707    return;
708    }
709
710    # Remove the namespace if we are being told to ignore them
711    if($ignore_namespace) {
712    $field =~ s/^\w*\.//;
713    }
714
715    foreach $data (@{$section_ptr->{'metadata'}}) {
716
717    my $data_name = $data->[0];
718
719    # Remove the any namespace if we are being told to ignore them
720    if($ignore_namespace) {
721        $data_name =~ s/^\w*\.//;
722    }
723   
724    return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
725    }
726   
727    return undef; # was not found
728}
729
730# returns a list of the form [value1, value2, ...]
731sub get_metadata {
732    my $self = shift (@_);
733    my ($section, $field, $ignore_namespace) = @_;
734    my ($data);
735
736    $ignore_namespace = 0 unless defined $ignore_namespace;
737
738    my $section_ptr = $self->_lookup_section($section);
739    if (!defined $section_ptr) {
740        print STDERR "doc::get_metadata couldn't find section ",
741        $section, "\n";
742        return;
743    }
744
745    # Remove the any namespace if we are being told to ignore them
746    if($ignore_namespace) {
747    $field =~ s/^\w*\.//;
748    }
749
750    my @metadata = ();
751    foreach $data (@{$section_ptr->{'metadata'}}) {
752
753    my $data_name = $data->[0];
754    # Remove the any namespace if we are being told to ignore them
755    if($ignore_namespace) {
756        $data_name =~ s/^\w*\.//;
757    }
758
759        push (@metadata, $data->[1]) if ($data_name eq $field);
760    }
761
762    return \@metadata;
763}
764
765# returns a list of the form [[field,value],[field,value],...]
766sub get_all_metadata {
767    my $self = shift (@_);
768    my ($section) = @_;
769
770    my $section_ptr = $self->_lookup_section($section);
771    if (!defined $section_ptr) {
772    print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
773    return;
774    }
775   
776    return $section_ptr->{'metadata'};
777}
778
779# $value is optional
780sub delete_metadata {
781    my $self = shift (@_);
782    my ($section, $field, $value) = @_;
783
784    my $section_ptr = $self->_lookup_section($section);
785    if (!defined $section_ptr) {
786    print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
787    return;
788    }
789
790    my $i = 0;
791    while ($i < scalar (@{$section_ptr->{'metadata'}})) {
792    if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
793        (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
794        splice (@{$section_ptr->{'metadata'}}, $i, 1);
795    } else {
796        $i++;
797    }
798    }
799}
800
801sub delete_all_metadata {
802    my $self = shift (@_);
803    my ($section) = @_;
804
805    my $section_ptr = $self->_lookup_section($section);
806    if (!defined $section_ptr) {
807    print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
808    return;
809    }
810   
811    $section_ptr->{'metadata'} = [];
812}
813
814sub set_metadata_element {
815    my $self = shift (@_);
816    my ($section, $field, $value) = @_;
817
818    $self->set_utf8_metadata_element ($section, $field,
819                      &unicode::ascii2utf8(\$value));
820}
821
822# set_utf8_metadata_element assumes the text has already been
823# converted to the UTF-8 encoding.
824sub set_utf8_metadata_element {
825    my $self = shift (@_);
826    my ($section, $field, $value) = @_;
827
828    $self->delete_metadata ($section, $field);
829    $self->add_utf8_metadata ($section, $field, $value);
830}
831
832
833# add_metadata assumes the text is in (extended) ascii form. For
834# text which has already been converted to the UTF-8 format use
835# add_utf8_metadata.
836sub add_metadata {
837    my $self = shift (@_);
838    my ($section, $field, $value) = @_;
839
840    $self->add_utf8_metadata ($section, $field,
841                  &unicode::ascii2utf8(\$value));
842}
843
844sub add_utf8_metadata {
845    my $self = shift (@_);
846    my ($section, $field, $value) = @_;
847
848    my $section_ptr = $self->_lookup_section($section);
849    if (!defined $section_ptr) {
850    print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
851    return;
852    }
853    if (!defined $value) {
854    print STDERR "doc::add_utf8_metadata undefined value for $field\n";
855    return;
856    }
857    if (!defined $field) {
858    print STDERR "doc::add_utf8_metadata undefined metadata type \n";
859    return;
860    }
861   
862    #print STDERR "###$field=$value\n";
863    # double check that the value is utf-8
864    if (!&unicode::check_is_utf8($value)) {
865    print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
866    &unicode::ensure_utf8(\$value);
867    print STDERR " Tried converting to utf8: $value\n";
868    }
869
870    push (@{$section_ptr->{'metadata'}}, [$field, $value]);
871}
872
873
874# methods for dealing with text
875
876# returns the text for a section
877sub get_text {
878    my $self = shift (@_);
879    my ($section) = @_;
880
881    my $section_ptr = $self->_lookup_section($section);
882    if (!defined $section_ptr) {
883    print STDERR "doc::get_text couldn't find section " .
884        "$section\n";
885    return "";
886    }
887
888    return $section_ptr->{'text'};
889}
890
891# returns the (utf-8 encoded) length of the text for a section
892sub get_text_length {
893    my $self = shift (@_);
894    my ($section) = @_;
895
896    my $section_ptr = $self->_lookup_section($section);
897    if (!defined $section_ptr) {
898    print STDERR "doc::get_text_length couldn't find section " .
899        "$section\n";
900    return 0;
901    }
902
903    return length ($section_ptr->{'text'});
904}
905
906sub delete_text {
907    my $self = shift (@_);
908    my ($section) = @_;
909
910    my $section_ptr = $self->_lookup_section($section);
911    if (!defined $section_ptr) {
912    print STDERR "doc::delete_text couldn't find section " .
913        "$section\n";
914    return;
915    }
916
917    $section_ptr->{'text'} = "";
918}
919
920# add_text assumes the text is in (extended) ascii form. For
921# text which has been already converted to the UTF-8 format
922# use add_utf8_text.
923sub add_text {
924    my $self = shift (@_);
925    my ($section, $text) = @_;
926
927    # convert the text to UTF-8 encoded unicode characters
928    # and add the text
929    $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
930}
931
932
933# add_utf8_text assumes the text to be added has already
934# been converted to the UTF-8 encoding. For ascii text use
935# add_text
936sub add_utf8_text {
937    my $self = shift (@_);
938    my ($section, $text) = @_;
939
940    my $section_ptr = $self->_lookup_section($section);
941    if (!defined $section_ptr) {
942    print STDERR "doc::add_utf8_text couldn't find section " .
943        "$section\n";
944    return;
945    }
946
947    $section_ptr->{'text'} .= $text;
948}
949
950# returns the Source meta, which is the utf8 filename generated.
951# Added a separate method here for convenience
952sub get_source {
953    my $self = shift (@_);
954    return $self->get_metadata_element ($self->get_top_section(), "Source");
955}
956
957# returns the SourceFile meta, which is the url reference to the URL-encoded
958# version of Source (the utf8 filename). Added a separate method here for convenience
959sub get_sourcefile {
960    my $self = shift (@_);
961    return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
962}
963
964# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
965# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
966sub get_assocfile_from_sourcefile {
967    my $self = shift (@_);
968   
969    # get the SourceFile meta, which is a *URL* to a file on the filesystem
970    my $top_section = $self->get_top_section();
971    my $source_file = $self->get_metadata_element($top_section, "SourceFile");
972
973    # get the actual filename as it exists on the filesystem which this url refers to
974    $source_file = &unicode::url_to_filename($source_file);
975    my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
976    return $assocfilename;
977}
978
979# methods for dealing with associated files
980
981# a file is associated with a document, NOT a section.
982# if section is defined it is noted in the data structure
983# only so that files associated from a particular section
984# may be removed later (using delete_section_assoc_files)
985sub associate_file {
986    my $self = shift (@_);
987    my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
988    $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
989
990    # remove all associated files with the same name
991    $self->delete_assoc_file ($assoc_filename);
992   
993    push (@{$self->{'associated_files'}},
994      [$real_filename, $assoc_filename, $mime_type, $section]);
995}
996
997# returns a list of associated files in the form
998#   [[real_filename, assoc_filename, mimetype], ...]
999sub get_assoc_files {
1000    my $self = shift (@_);
1001
1002    return $self->{'associated_files'};
1003}
1004
1005# the following two methods used to keep track of original associated files
1006# for incremental building. eg a txt file used by an item file does not end
1007# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1008# file for incremental build
1009sub associate_source_file {
1010    my $self = shift (@_);
1011    my ($full_filename) = @_;
1012
1013    push (@{$self->{'source_assoc_files'}}, $full_filename);
1014
1015}
1016
1017sub get_source_assoc_files {
1018    my $self = shift (@_);
1019
1020    return $self->{'source_assoc_files'};
1021 
1022
1023}
1024sub metadata_file {
1025    my $self = shift (@_);
1026    my ($real_filename, $filename) = @_;
1027   
1028    push (@{$self->{'metadata_files'}},
1029      [$real_filename, $filename]);
1030}
1031
1032# used for writing out archiveinf-doc.gdb, to list all the metadata files
1033sub get_meta_files {
1034    my $self = shift (@_);
1035
1036    return $self->{'metadata_files'};
1037}
1038
1039sub delete_section_assoc_files {
1040    my $self = shift (@_);
1041    my ($section) = @_;
1042
1043    my $i=0;
1044    while ($i < scalar (@{$self->{'associated_files'}})) {
1045    if (defined $self->{'associated_files'}->[$i]->[3] &&
1046        $self->{'associated_files'}->[$i]->[3] eq $section) {
1047        splice (@{$self->{'associated_files'}}, $i, 1);
1048    } else {
1049        $i++;
1050    }
1051    }
1052}
1053
1054sub delete_assoc_file {
1055    my $self = shift (@_);
1056    my ($assoc_filename) = @_;
1057
1058    my $i=0;
1059    while ($i < scalar (@{$self->{'associated_files'}})) {
1060    if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1061        splice (@{$self->{'associated_files'}}, $i, 1);
1062    } else {
1063        $i++;
1064    }
1065    }
1066}
1067
1068sub reset_nextsection_ptr {
1069    my $self = shift (@_);
1070    my ($section) = @_;
1071   
1072    my $section_ptr = $self->_lookup_section($section);
1073    $section_ptr->{'next_subsection'} = 1;
1074}
1075
10761;
Note: See TracBrowser for help on using the browser.