root/main/trunk/greenstone2/perllib/doc.pm @ 27393

Revision 27393, 38.2 KB (checked in by jmt12, 6 years ago)

Replace hardcoded -e with FileUtils::fileExists() call and util::rm() with FileUtils::removeFiles() call

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr   te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use FileUtils;
40use ghtml;
41use File::stat;
42##use hashdoc;
43use docprint;
44
45# the document type may be indexed_doc, nonindexed_doc, or
46# classification
47
48our $OIDcount = 0;
49
50# rename_method can be 'url', 'none', 'base64'
51sub new {
52    my $class = shift (@_);
53    my ($source_filename, $doc_type, $rename_method) = @_;
54
55
56    my $self = bless {'associated_files'=>[],
57              'subsection_order'=>[],
58              'next_subsection'=>1,
59              'subsections'=>{},
60              'metadata'=>[],
61              'text'=>"",
62              'OIDtype'=>"hash"}, $class;
63
64    # used to set lastmodified here, but this can screw up the HASH ids, so
65    # the docsave processor now calls set_lastmodified
66
67    $self->set_source_path($source_filename);
68   
69    if (defined $source_filename) {
70    $source_filename = &util::filename_within_collection($source_filename);
71    print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
72    $self->set_source_filename ($source_filename, $rename_method);
73    }
74
75    $self->set_doc_type ($doc_type) if defined $doc_type;
76
77    return $self;
78}
79
80
81sub set_source_path
82{
83    my $self = shift @_;
84    my ($source_filename) = @_;
85
86    if (defined $source_filename) {
87    # On Windows the source_filename can be in terse DOS format
88    # e.g. test~1.txt
89
90    $self->{'terse_source_path'} = $source_filename;
91
92        # Use the FileUtil library methods as they are aware of more special
93        # cases such as HDFS [jmt12]
94    if (&FileUtils::fileExists($source_filename))
95        {
96        # See if we can do better for Windows with a filename
97        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
98        require Win32;
99        $self->{'source_path'} = Win32::GetLongPathName($source_filename);
100        }
101        else {
102        # For Unix-based systems, there is no difference between the two
103        $self->{'source_path'} = $source_filename;
104        }
105    }
106    else {
107        print STDERR "Warning: In doc::set_source_path(), file\n";
108        print STDERR "           $source_filename\n";
109        print STDERR "         does not exist\n";
110       
111        # (default) Set it to whatever we were given
112        $self->{'source_path'} = $source_filename;
113    }   
114    }
115    else {
116    # Previous code for setting source_path allowed for
117    # it to be undefined, so continue this practice
118    $self->{'terse_source_path'} = undef;
119    $self->{'source_path'} = undef;
120    }
121}
122
123
124sub get_source_path
125{
126    my $self = shift @_;
127
128    return $self->{'terse_source_path'};
129}
130
131# set lastmodified for OAI purposes, added by GRB, moved by kjdon
132sub set_oailastmodified {
133    my $self = shift (@_);
134
135    my $source_path = $self->{'terse_source_path'};
136   
137    if (defined $source_path && (-e $source_path)) {
138    my $current_time = time;
139
140    my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
141        $wday, $yday, $isdst) = localtime($current_time);
142
143    my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
144
145    $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
146    $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
147    }
148}
149
150# no longer used for OAI purposes, since lastmodified is not what we want as the
151# Datestamp of a document. This doc metadata may be useful for general purposes.
152sub set_lastmodified {
153    my $self = shift (@_);
154
155    my $source_path = $self->{'terse_source_path'};
156   
157    if (defined $source_path && (-e $source_path)) {
158
159        my $file_stat = stat($source_path);
160    my $mtime = $file_stat->mtime;
161    my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
162        $wday, $yday, $isdst) = localtime($mtime);
163
164    my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
165
166    $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
167    $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
168    }
169}
170
171# clone the $self object
172sub duplicate {
173    my $self = shift (@_);
174
175    my $newobj = {};
176   
177    foreach my $k (keys %$self) {
178    $newobj->{$k} = &clone ($self->{$k});
179    }
180
181    bless $newobj, ref($self);
182    return $newobj;
183}
184
185sub clone {
186    my ($from) = @_;
187    my $type = ref ($from);
188
189    if ($type eq "HASH") {
190    my $to = {};
191    foreach my $key (keys %$from) {
192        $to->{$key} = &clone ($from->{$key});
193    }
194    return $to;
195    } elsif ($type eq "ARRAY") {
196    my $to = [];
197    foreach my $v (@$from) {
198        push (@$to, &clone ($v));
199    }
200    return $to;
201    } else {
202    return $from;
203    }
204}
205
206sub set_OIDtype {
207    my $self = shift (@_);
208    my ($type, $metadata) = @_;
209
210    if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
211    $self->{'OIDtype'} = $type;
212    } else {
213    $self->{'OIDtype'} = "hash";
214    }
215
216    if ($type =~ /^assigned$/) {
217    if (defined $metadata) {
218        $self->{'OIDmetadata'} = $metadata;
219    } else {
220        $self->{'OIDmetadata'} = "dc.Identifier";
221    }
222    }
223}
224
225# rename_method can be 'url', 'none', 'base64'
226sub set_source_filename {
227    my $self = shift (@_);
228    my ($source_filename, $rename_method) = @_;
229
230    # Since the gsdlsourcefilename element goes into the doc.xml it has
231    # to be utf8. However, it should also *represent* the source filename
232    # (in the import directory) which may not be utf8 at all.
233    # For instance, if this meta element (gsdlsourcefilename) will be used
234    # by other applications that parse doc.xml in order to locate
235    # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
236    # encode the real filename as this is a binary-to-text encoding meaning
237    # that the resulting string is ASCII (utf8). Decoding will give the original.
238   
239#    print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
240
241    # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
242    $source_filename = $self->encode_filename($source_filename, $rename_method);
243#    my ($srcfilename,$dirname,$suffix)
244#   = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
245#    print STDERR "-> $srcfilename -> ";
246#    $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
247#    $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
248#    print STDERR "$source_filename\n";
249   
250    $self->set_utf8_metadata_element ($self->get_top_section(),
251                 "gsdlsourcefilename",
252                 $source_filename);
253}
254
255sub encode_filename {
256    my $self = shift (@_);
257    my ($source_filename, $rename_method) = @_;
258
259     my ($srcfilename,$dirname,$suffix)
260    = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
261#    print STDERR "-> $srcfilename -> ";
262    $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
263    $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
264
265    return $source_filename;
266}
267
268sub set_converted_filename {
269    my $self = shift (@_);
270    my ($converted_filename) = @_;
271
272    # we know the converted filename is utf8
273    $self->set_utf8_metadata_element ($self->get_top_section(),
274                 "gsdlconvertedfilename",
275                 $converted_filename);
276}
277
278# returns the source_filename as it was provided
279sub get_unmodified_source_filename {
280    my $self = shift (@_);
281
282    return $self->{'terse_source_path'};
283}
284
285# returns the source_filename with whatever rename_method was given
286sub get_source_filename {
287    my $self = shift (@_);
288
289    return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
290}
291
292
293
294# returns converted filename if available else returns source filename
295sub get_filename_for_hashing {
296    my $self = shift (@_);
297
298    my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
299
300    if (!defined $filename) {
301    my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
302    # if NULPlug processed file, then don't give a filename
303    if (defined $plugin_name && $plugin_name eq "NULPlug") {
304        $filename = undef;
305    } else { # returns the URL encoded source filename!
306        $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
307    }
308    }
309
310    if (!&FileUtils::isFilenameAbsolute($filename)) {
311    $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
312    }
313
314    return $filename;
315}
316
317sub set_doc_type {
318    my $self = shift (@_);
319    my ($doc_type) = @_;
320
321    $self->set_metadata_element ($self->get_top_section(),
322                 "gsdldoctype",
323                 $doc_type);
324}
325
326# returns the gsdldoctype as it was provided
327# the default of "indexed_doc" is used if no document
328# type was provided
329sub get_doc_type {
330    my $self = shift (@_);
331
332    my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
333    return $doc_type if (defined $doc_type);
334    return "indexed_doc";
335}
336
337
338# look up the reference to the a particular section
339sub _lookup_section {
340    my $self = shift (@_);
341    my ($section) = @_;
342
343    my ($num);
344    my $sectionref = $self;
345
346    while (defined $section && $section ne "") {
347   
348    ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
349   
350    $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
351   
352    $section = "" unless defined $section;
353   
354
355    if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
356        $sectionref = $sectionref->{'subsections'}->{$num};
357    } else {
358        return undef;
359    }
360    }
361   
362    return $sectionref;
363}
364
365# calculate OID by hashing the contents of the document
366sub _calc_OID {
367    my $self = shift (@_);
368    my ($filename) = @_;
369
370
371    my $osexe = &util::get_os_exe();
372
373    my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
374                       $ENV{'GSDLOS'},"hashfile$osexe");
375
376    my $result = "NULL";
377
378   
379    if (-e "$hashfile_exe") {
380#   $result = `\"$hashfile_exe\" \"$filename\"`;
381#   $result = `hashfile$osexe \"$filename\" 2>&1`;
382    $result = `hashfile$osexe \"$filename\"`;
383
384    ($result) = $result =~ /:\s*([0-9a-f]+)/i;
385    } else {
386    print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
387    }
388    return "HASH$result";
389}
390
391# methods dealing with OID, not groups of them.
392
393# if $OID is not provided one is calculated
394sub set_OID {
395    my $self = shift (@_);
396    my ($OID) = @_;
397   
398    my $use_hash_oid = 0;
399    # if an OID wasn't provided calculate one
400    if (!defined $OID) {
401    $OID = "NULL";
402    if ($self->{'OIDtype'} =~ /^hash/) {
403        $use_hash_oid = 1;
404    } elsif ($self->{'OIDtype'} eq "incremental") {
405        $OID = "D" . $OIDcount;
406        $OIDcount ++;
407    } elsif ($self->{'OIDtype'} eq "filename") {
408        my $filename = $self->get_source_filename();
409        $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
410        $OID = &util::tidy_up_oid($OID);
411    } elsif ($self->{'OIDtype'} eq "full_filename") {
412        my $source_filename = $self->get_source_filename();
413        my $dirsep = &util::get_os_dirsep();
414
415        $source_filename =~ s/^import$dirsep//;
416        $source_filename =~ s/$dirsep/-/g;
417        $source_filename =~ s/\./_/g;
418
419        $OID = $source_filename;
420        $OID = &util::tidy_up_oid($OID);
421    } elsif ($self->{'OIDtype'} eq "dirname") {
422        my $filename = $self->get_source_filename();
423        if (defined($filename)) { # && -e $filename) {
424        # get the immediate parent directory
425        $OID = &File::Basename::dirname($filename);
426        if (defined $OID) {
427            $OID = &File::Basename::basename($OID);
428            $OID = &util::tidy_up_oid($OID);
429        } else {
430            print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
431            $use_hash_oid = 1;
432        }
433        } else {
434        print STDERR "Failed to find a filename, generating hash id\n";
435        $use_hash_oid = 1;
436        }   
437       
438    } elsif ($self->{'OIDtype'} eq "assigned") {
439        my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
440        if (defined $identifier && $identifier ne "") {
441        $OID = $identifier;
442        $OID = &util::tidy_up_oid($OID);
443        } else {
444        # need a hash id
445        print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
446        $use_hash_oid = 1;
447        }
448               
449    } else {
450        $use_hash_oid = 1;
451    }
452
453    if ($use_hash_oid) {
454        my $hash_on_file = 1;
455        my $hash_on_ga_xml = 0;
456
457        if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
458        $hash_on_file = 0;
459        $hash_on_ga_xml = 1;
460        }
461
462        if ($self->{'OIDtype'} eq "hash_on_full_filename") {
463        $hash_on_file = 0;
464        $hash_on_ga_xml = 0;
465
466        my $source_filename = $self->get_source_filename();
467        my $dirsep = &util::get_os_dirsep();
468
469        $source_filename =~ s/^import$dirsep//;
470        $source_filename =~ s/$dirsep/-/g;
471        $source_filename =~ s/\./_/g;
472       
473        # If the filename is very short then (handled naively)
474        # this can cause conjestion in the hash-values
475        # computed, leading documents sharing the same leading
476        # Hex values in the computed has.
477        #
478        # The solution taken here is to replace the name of
479        # the file name a sufficient number of times (up to
480        # the character limit defined in 'rep_limit' and
481        # make that the content that is hashed on
482
483        # *** Think twice before changing the following value
484        # as it will break backward compatability of computed
485        # document HASH values
486
487        my $rep_limit  = 256;
488        my $hash_content = undef;
489
490        if (length($source_filename)<$rep_limit) {
491            my $rep_string = "$source_filename|";
492            my $rs_len = length($rep_string);
493
494            my $clone_times = int(($rep_limit-1)/$rs_len) +1;
495           
496            $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
497        }
498        else {
499            $hash_content = $source_filename;
500        }
501
502        my $filename = &util::get_tmp_filename();
503        if (!open (OUTFILE, ">:utf8", $filename)) {
504            print STDERR "doc::set_OID could not write to $filename\n";
505        } else {
506            print OUTFILE $hash_content;
507            close (OUTFILE);
508        }
509        $OID = $self->_calc_OID ($filename);
510
511        &util::rm ($filename);
512        }
513
514        if ($hash_on_file) {
515        # "hash" OID - feed file to hashfile.exe
516        my $filename = $self->get_filename_for_hashing();
517       
518        # -z: don't want to hash on the file if it is zero size
519        if (defined($filename) && -e $filename && !-z $filename) {
520            $OID = $self->_calc_OID ($filename);
521        } else {
522            $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
523        }
524        }
525
526        if ($hash_on_ga_xml) {
527        # In addition being asked to explicity calculate the has based on the GA file,
528        # can also end up coming into this block is doing 'hash_on_file' but the file
529        # itself is of zero bytes (as could be the case with 'doc.nul' file
530
531        my $filename = &util::get_tmp_filename();
532        if (!open (OUTFILE, ">:utf8", $filename)) {
533            print STDERR "doc::set_OID could not write to $filename\n";
534        } else {
535            my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
536            print OUTFILE $doc_text;
537            close (OUTFILE);
538        }
539        $OID = $self->_calc_OID ($filename);
540        &FileUtils::removeFiles($filename);
541        }
542    }
543    }
544    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
545}
546
547# this uses hashdoc (embedded c thingy) which is faster but still
548# needs a little work to be suffiently stable
549sub ___set_OID {
550    my $self = shift (@_);
551    my ($OID) = @_;
552
553    # if an OID wasn't provided then calculate hash value based on document
554    if (!defined $OID)
555    {
556    my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
557    my $hash_len = length($hash_text);
558
559        $OID = &hashdoc::buffer($hash_text,$hash_len);
560    }
561
562    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
563}
564
565# returns the OID for this document
566sub get_OID {
567    my $self = shift (@_);
568    my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
569    return $OID if (defined $OID);
570    return "NULL";
571}
572
573sub delete_OID {
574    my $self = shift (@_);
575   
576    $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
577}
578
579
580# methods for manipulating section names
581
582# returns the name of the top-most section (the top
583# level of the document
584sub get_top_section {
585    my $self = shift (@_);
586   
587    return "";
588}
589
590# returns a section
591sub get_parent_section {
592    my $self = shift (@_);
593    my ($section) = @_;
594
595    $section =~ s/(^|\.)\d+$//;
596
597    return $section;
598}
599
600# returns the first child section (or the end child
601# if there isn't any)
602sub get_begin_child {
603    my $self = shift (@_);
604    my ($section) = @_;
605
606    my $section_ptr = $self->_lookup_section($section);
607    return "" unless defined $section_ptr;
608
609    if (defined $section_ptr->{'subsection_order'}->[0]) {
610    return "$section.$section_ptr->{'subsection_order'}->[0]";
611    }
612
613    return $self->get_end_child ($section);
614}
615
616# returns the next child of a parent section
617sub get_next_child {
618    my $self = shift (@_);
619    my ($section) = @_;
620   
621    my $parent_section = $self->get_parent_section($section);
622    my $parent_section_ptr = $self->_lookup_section($parent_section);
623    return undef unless defined $parent_section_ptr;
624
625    my ($section_num) = $section =~ /(\d+)$/;
626    return undef unless defined $section_num;
627
628    my $i = 0;
629    my $section_order = $parent_section_ptr->{'subsection_order'};
630    while ($i < scalar(@$section_order)) {
631    last if $section_order->[$i] eq $section_num;
632    $i++;
633    }
634
635    $i++; # the next child
636    if ($i < scalar(@$section_order)) {
637    return $section_order->[$i] if $parent_section eq "";
638    return "$parent_section.$section_order->[$i]";
639    }
640
641    # no more sections in this level
642    return undef;
643}
644
645# returns a reference to a list of children
646sub get_children {
647    my $self = shift (@_);
648    my ($section) = @_;
649
650    my $section_ptr = $self->_lookup_section($section);
651    return [] unless defined $section_ptr;
652
653    my @children = @{$section_ptr->{'subsection_order'}};
654
655    map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
656    return \@children;
657}
658
659# returns the child section one past the last one (which
660# is coded as "0")
661sub get_end_child {
662    my $self = shift (@_);
663    my ($section) = @_;
664
665    return $section . ".0" unless $section eq "";
666    return "0";
667}
668
669# returns the next section in book order
670sub get_next_section {
671    my $self = shift (@_);
672    my ($section) = @_;
673
674    return undef unless defined $section;
675
676    my $section_ptr = $self->_lookup_section($section);
677    return undef unless defined $section_ptr;
678
679    # first try to find first child
680    if (defined $section_ptr->{'subsection_order'}->[0]) {
681    return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
682    return "$section.$section_ptr->{'subsection_order'}->[0]";
683    }
684
685    do {
686    # try to find sibling
687    my $next_child = $self->get_next_child ($section);
688    return $next_child if (defined $next_child);
689
690    # move up one level
691    $section = $self->get_parent_section ($section);
692    } while $section =~ /\d/;
693
694    return undef;
695}
696
697sub is_leaf_section {
698    my $self = shift (@_);
699    my ($section) = @_;
700
701    my $section_ptr = $self->_lookup_section($section);
702    return 1 unless defined $section_ptr;
703
704    return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
705}
706
707# methods for dealing with sections
708
709# returns the name of the inserted section
710sub insert_section {
711    my $self = shift (@_);
712    my ($before_section) = @_;
713
714    # get the child to insert before and its parent section
715    my $parent_section = "";
716    my $before_child = "0";
717    my @before_section = split (/\./, $before_section);
718    if (scalar(@before_section) > 0) {
719    $before_child = pop (@before_section);
720    $parent_section = join (".", @before_section);
721    }
722
723    my $parent_section_ptr = $self->_lookup_section($parent_section);
724    if (!defined $parent_section_ptr) {
725    print STDERR "doc::insert_section couldn't find parent section " .
726        "$parent_section\n";
727    return;
728    }
729
730    # get the next section number
731    my $section_num = $parent_section_ptr->{'next_subsection'}++;
732
733    my $i = 0;
734    while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
735       $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
736    $i++;
737    }
738   
739    # insert the section number into the order list
740    splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
741
742    # add this section to the parent section
743    my $section_ptr = {'subsection_order'=>[],
744               'next_subsection'=>1,
745               'subsections'=>{},
746               'metadata'=>[],
747               'text'=>""};
748    $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
749
750    # work out the full section number
751    my $section = $parent_section;
752    $section .= "." unless $section eq "";
753    $section .= $section_num;
754   
755    return $section;
756}
757
758# creates a pre-named section
759sub create_named_section {
760    my $self = shift (@_);
761    my ($mastersection) = @_;
762
763    my ($num);
764    my $section = $mastersection;
765    my $sectionref = $self;
766
767    while ($section ne "") {
768    ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
769    $num =~ s/^0+(\d)/$1/; # remove leading 0s
770    $section = "" unless defined $section;
771   
772    if (defined $num) {
773        if (!defined $sectionref->{'subsections'}->{$num}) {
774        push (@{$sectionref->{'subsection_order'}}, $num);
775        $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
776                            'next_subsection'=>1,
777                            'subsections'=>{},
778                            'metadata'=>[],
779                            'text'=>""};
780        if ($num >= $sectionref->{'next_subsection'}) {
781            $sectionref->{'next_subsection'} = $num + 1;
782        }
783        }
784        $sectionref = $sectionref->{'subsections'}->{$num};
785
786    } else {
787        print STDERR "doc::create_named_section couldn't create section ";
788        print STDERR "$mastersection\n";
789        last;
790    }
791    }
792}
793
794# returns a reference to a list of subsections
795sub list_subsections {
796    my $self = shift (@_);
797    my ($section) = @_;
798
799    my $section_ptr = $self->_lookup_section ($section);
800    if (!defined $section_ptr) {
801    print STDERR "doc::list_subsections couldn't find section $section\n";
802    return [];
803    }
804
805    return [@{$section_ptr->{'subsection_order'}}];
806}
807
808sub delete_section {
809    my $self = shift (@_);
810    my ($section) = @_;
811
812#    my $section_ptr = {'subsection_order'=>[],
813#              'next_subsection'=>1,
814#              'subsections'=>{},
815#              'metadata'=>[],
816#              'text'=>""};
817
818    # if this is the top section reset everything
819    if ($section eq "") {
820    $self->{'subsection_order'} = [];
821    $self->{'subsections'} = {};
822    $self->{'metadata'} = [];
823    $self->{'text'} = "";
824    return;
825    }
826
827    # find the parent of the section to delete
828    my $parent_section = "";
829    my $child = "0";
830    my @section = split (/\./, $section);
831    if (scalar(@section) > 0) {
832    $child = pop (@section);
833    $parent_section = join (".", @section);
834    }
835
836    my $parent_section_ptr = $self->_lookup_section($parent_section);
837    if (!defined $parent_section_ptr) {
838    print STDERR "doc::delete_section couldn't find parent section " .
839        "$parent_section\n";
840    return;
841    }
842
843    # remove this section from the subsection_order list
844    my $i = 0;
845    while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
846    if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
847        splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
848        last;
849    }
850    $i++;
851    }
852
853    # remove this section from the subsection hash
854    if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
855    undef $parent_section_ptr->{'subsections'}->{$child};
856    }
857}
858
859#--
860# methods for dealing with metadata
861
862# set_metadata_element and get_metadata_element are for metadata
863# which should only have one value. add_meta_data and get_metadata
864# are for metadata which can have more than one value.
865
866# returns the first metadata value which matches field
867
868# This version of get metadata element works much like the one above,
869# except it allows for the namespace portion of a metadata element to
870# be ignored, thus if you are searching for dc.Title, the first piece
871# of matching metadata ending with the name Title (once any namespace
872# is removed) would be returned.
873# 28-11-2003 John Thompson
874sub get_metadata_element {
875    my $self = shift (@_);
876    my ($section, $field, $ignore_namespace) = @_;
877    my ($data);
878
879    $ignore_namespace = 0 unless defined $ignore_namespace;
880
881    my $section_ptr = $self->_lookup_section($section);
882    if (!defined $section_ptr) {
883    print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
884    return;
885    }
886
887    # Remove any namespace if we are being told to ignore them
888    if($ignore_namespace) {
889    $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
890    }
891
892    foreach $data (@{$section_ptr->{'metadata'}}) {
893
894    my $data_name = $data->[0];
895
896    # Remove any namespace if we are being told to ignore them
897    if($ignore_namespace) {
898        $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
899    }
900    # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
901    $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
902   
903    return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
904    }
905   
906    return undef; # was not found
907}
908
909# returns a list of the form [value1, value2, ...]
910sub get_metadata {
911    my $self = shift (@_);
912    my ($section, $field, $ignore_namespace) = @_;
913    my ($data);
914
915    $ignore_namespace = 0 unless defined $ignore_namespace;
916
917    my $section_ptr = $self->_lookup_section($section);
918    if (!defined $section_ptr) {
919        print STDERR "doc::get_metadata couldn't find section ",
920        $section, "\n";
921        return;
922    }
923
924    # Remove any namespace if we are being told to ignore them
925    if($ignore_namespace) {
926    $field =~ s/^.*\.//;
927    }
928
929    my @metadata = ();
930    foreach $data (@{$section_ptr->{'metadata'}}) {
931
932    my $data_name = $data->[0];
933
934    # Remove any namespace if we are being told to ignore them
935    if($ignore_namespace) {
936        $data_name =~ s/^.*\.//;
937    }   
938    # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
939    $data_name =~ s/^ex\.([^.]+)$/$1/;
940
941        push (@metadata, $data->[1]) if ($data_name eq $field);
942    }
943
944    return \@metadata;
945}
946
947sub get_metadata_hashmap {
948    my $self = shift (@_);
949    my ($section, $opt_namespace) = @_;
950   
951    my $section_ptr = $self->_lookup_section($section);
952    if (!defined $section_ptr) {
953            print STDERR "doc::get_metadata couldn't find section ",
954            $section, "\n";
955            return;
956        }
957
958    my $metadata_hashmap = {};
959    foreach my $data (@{$section_ptr->{'metadata'}}) {
960            my $metaname = $data->[0];
961         
962            if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
963                if (!defined $metadata_hashmap->{$metaname}) {
964                    $metadata_hashmap->{$metaname} = [];
965                  }
966                my $metaval_list = $metadata_hashmap->{$metaname};
967                push(@$metaval_list, $data->[1]);
968              }
969          }
970   
971    return $metadata_hashmap;
972}
973
974# returns a list of the form [[field,value],[field,value],...]
975sub get_all_metadata {
976    my $self = shift (@_);
977    my ($section) = @_;
978
979    my $section_ptr = $self->_lookup_section($section);
980    if (!defined $section_ptr) {
981    print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
982    return;
983    }
984   
985    return $section_ptr->{'metadata'};
986}
987
988# $value is optional
989sub delete_metadata {
990    my $self = shift (@_);
991    my ($section, $field, $value) = @_;
992
993    my $section_ptr = $self->_lookup_section($section);
994    if (!defined $section_ptr) {
995    print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
996    return;
997    }
998
999    my $i = 0;
1000    while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1001    if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1002        (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1003        splice (@{$section_ptr->{'metadata'}}, $i, 1);
1004    } else {
1005        $i++;
1006    }
1007    }
1008}
1009
1010sub delete_all_metadata {
1011    my $self = shift (@_);
1012    my ($section) = @_;
1013
1014    my $section_ptr = $self->_lookup_section($section);
1015    if (!defined $section_ptr) {
1016    print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1017    return;
1018    }
1019   
1020    $section_ptr->{'metadata'} = [];
1021}
1022
1023sub set_metadata_element {
1024    my $self = shift (@_);
1025    my ($section, $field, $value) = @_;
1026
1027    $self->set_utf8_metadata_element ($section, $field,
1028                      &unicode::ascii2utf8(\$value));
1029}
1030
1031# set_utf8_metadata_element assumes the text has already been
1032# converted to the UTF-8 encoding.
1033sub set_utf8_metadata_element {
1034    my $self = shift (@_);
1035    my ($section, $field, $value) = @_;
1036   
1037    $self->delete_metadata ($section, $field);
1038    $self->add_utf8_metadata ($section, $field, $value);
1039}
1040
1041
1042# add_metadata assumes the text is in (extended) ascii form. For
1043# text which has already been converted to the UTF-8 format use
1044# add_utf8_metadata.
1045sub add_metadata {
1046    my $self = shift (@_);
1047    my ($section, $field, $value) = @_;
1048   
1049    $self->add_utf8_metadata ($section, $field,
1050                  &unicode::ascii2utf8(\$value));
1051}
1052
1053sub add_utf8_metadata {
1054    my $self = shift (@_);
1055    my ($section, $field, $value) = @_;
1056   
1057    #    my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1058    #    my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1059    #    print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1060
1061    my $section_ptr = $self->_lookup_section($section);
1062    if (!defined $section_ptr) {
1063    print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1064    return;
1065    }
1066    if (!defined $value) {
1067    print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1068    return;
1069    }
1070    if (!defined $field) {
1071    print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1072    return;
1073    }
1074   
1075    #print STDERR "###$field=$value\n";
1076
1077    # For now, supress this check.  Given that text data read in is now
1078    # Unicode aware, then the following block of code can (ironically enough)
1079    # cause our unicode compliant string to be re-encoded (leading to
1080    # a double-encoded UTF-8 string, which we definitely don't want!).
1081   
1082
1083    # double check that the value is utf-8
1084    #    if (!&unicode::check_is_utf8($value)) {
1085    #   print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1086    #   &unicode::ensure_utf8(\$value);
1087    #   print STDERR " Tried converting to utf8: $value\n";
1088    #    }
1089
1090    #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1091    if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1092    {
1093            my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1094        if (defined $mdprefix) {
1095            # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1096            push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1097        }
1098
1099        my $direction;
1100        if($value =~ m/^-/)
1101        {
1102            $direction = ($field eq "Latitude") ? "S" : "W";
1103        }
1104        else
1105        {
1106            $direction = ($field eq "Latitude") ? "N" : "E";
1107        }
1108       
1109        my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1110        if(defined $beforeDec && defined $afterDec)
1111        {
1112            my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1113            push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1114           
1115            for(my $i = 2; $i <= 4; $i++)
1116            {
1117                if(length($afterDec) >= $i)
1118                {
1119                    push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1120                }
1121            }
1122           
1123            #Only add the metadata if it has not already been added
1124            my $metaMap = $self->get_metadata_hashmap($section);
1125        }
1126    }
1127
1128    push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1129}
1130
1131
1132# methods for dealing with text
1133
1134# returns the text for a section
1135sub get_text {
1136    my $self = shift (@_);
1137    my ($section) = @_;
1138
1139    my $section_ptr = $self->_lookup_section($section);
1140    if (!defined $section_ptr) {
1141    print STDERR "doc::get_text couldn't find section " .
1142        "$section\n";
1143    return "";
1144    }
1145
1146    return $section_ptr->{'text'};
1147}
1148
1149# returns the (utf-8 encoded) length of the text for a section
1150sub get_text_length {
1151    my $self = shift (@_);
1152    my ($section) = @_;
1153
1154    my $section_ptr = $self->_lookup_section($section);
1155    if (!defined $section_ptr) {
1156    print STDERR "doc::get_text_length couldn't find section " .
1157        "$section\n";
1158    return 0;
1159    }
1160
1161    return length ($section_ptr->{'text'});
1162}
1163
1164# returns the total length for all the sections
1165sub get_total_text_length {
1166    my $self = shift (@_);
1167
1168    my $section = $self->get_top_section();
1169    my $length = 0;
1170    while (defined $section) {
1171    $length += $self->get_text_length($section);
1172    $section = $self->get_next_section($section);
1173    }
1174    return $length;
1175}
1176
1177sub delete_text {
1178    my $self = shift (@_);
1179    my ($section) = @_;
1180
1181    my $section_ptr = $self->_lookup_section($section);
1182    if (!defined $section_ptr) {
1183    print STDERR "doc::delete_text couldn't find section " .
1184        "$section\n";
1185    return;
1186    }
1187
1188    $section_ptr->{'text'} = "";
1189}
1190
1191# add_text assumes the text is in (extended) ascii form. For
1192# text which has been already converted to the UTF-8 format
1193# use add_utf8_text.
1194sub add_text {
1195    my $self = shift (@_);
1196    my ($section, $text) = @_;
1197
1198    # convert the text to UTF-8 encoded unicode characters
1199    # and add the text
1200    $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1201}
1202
1203
1204# add_utf8_text assumes the text to be added has already
1205# been converted to the UTF-8 encoding. For ascii text use
1206# add_text
1207sub add_utf8_text {
1208    my $self = shift (@_);
1209    my ($section, $text) = @_;
1210
1211    my $section_ptr = $self->_lookup_section($section);
1212    if (!defined $section_ptr) {
1213    print STDERR "doc::add_utf8_text couldn't find section " .
1214        "$section\n";
1215    return;
1216    }
1217
1218    $section_ptr->{'text'} .= $text;
1219}
1220
1221# returns the Source meta, which is the utf8 filename generated.
1222# Added a separate method here for convenience
1223sub get_source {
1224    my $self = shift (@_);
1225    return $self->get_metadata_element ($self->get_top_section(), "Source");
1226}
1227
1228# returns the SourceFile meta, which is the url reference to the URL-encoded
1229# version of Source (the utf8 filename). Added a separate method here for convenience
1230sub get_sourcefile {
1231    my $self = shift (@_);
1232    return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1233}
1234
1235# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1236# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1237sub get_assocfile_from_sourcefile {
1238    my $self = shift (@_);
1239   
1240    # get the SourceFile meta, which is a *URL* to a file on the filesystem
1241    my $top_section = $self->get_top_section();
1242    my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1243
1244    # get the actual filename as it exists on the filesystem which this url refers to
1245    $source_file = &unicode::url_to_filename($source_file);
1246    my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1247    return $assocfilename;
1248}
1249
1250# methods for dealing with associated files
1251
1252# a file is associated with a document, NOT a section.
1253# if section is defined it is noted in the data structure
1254# only so that files associated from a particular section
1255# may be removed later (using delete_section_assoc_files)
1256sub associate_file {
1257    my $self = shift (@_);
1258    my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1259    $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1260
1261    # remove all associated files with the same name
1262    $self->delete_assoc_file ($assoc_filename);
1263
1264    # Too harsh a requirement
1265    # Definitely get HTML docs, for example, with some missing
1266    # support files
1267#    if (!&util::fd_exists($real_filename)) {
1268#   print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1269#   exit -1;
1270#    }
1271
1272#    print STDERR "**** is the following a UTF8 rep of *real* filename?\n   $real_filename\n";
1273#    print STDERR "****##### so, ensure it is before storing?!?!?\n";
1274##    my $utf8_filename = Encode::encode("utf8",$filename);
1275
1276    push (@{$self->{'associated_files'}},
1277      [$real_filename, $assoc_filename, $mime_type, $section]);
1278}
1279
1280# returns a list of associated files in the form
1281#   [[real_filename, assoc_filename, mimetype], ...]
1282sub get_assoc_files {
1283    my $self = shift (@_);
1284
1285    return $self->{'associated_files'};
1286}
1287
1288# the following two methods used to keep track of original associated files
1289# for incremental building. eg a txt file used by an item file does not end
1290# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1291# file for incremental build
1292sub associate_source_file {
1293    my $self = shift (@_);
1294    my ($full_filename) = @_;
1295
1296    push (@{$self->{'source_assoc_files'}}, $full_filename);
1297
1298}
1299
1300sub get_source_assoc_files {
1301    my $self = shift (@_);
1302
1303    return $self->{'source_assoc_files'};
1304 
1305
1306}
1307sub metadata_file {
1308    my $self = shift (@_);
1309    my ($real_filename, $filename) = @_;
1310   
1311    push (@{$self->{'metadata_files'}},
1312      [$real_filename, $filename]);
1313}
1314
1315# used for writing out the archiveinf-doc info database, to list all the metadata files
1316sub get_meta_files {
1317    my $self = shift (@_);
1318
1319    return $self->{'metadata_files'};
1320}
1321
1322sub delete_section_assoc_files {
1323    my $self = shift (@_);
1324    my ($section) = @_;
1325
1326    my $i=0;
1327    while ($i < scalar (@{$self->{'associated_files'}})) {
1328    if (defined $self->{'associated_files'}->[$i]->[3] &&
1329        $self->{'associated_files'}->[$i]->[3] eq $section) {
1330        splice (@{$self->{'associated_files'}}, $i, 1);
1331    } else {
1332        $i++;
1333    }
1334    }
1335}
1336
1337sub delete_assoc_file {
1338    my $self = shift (@_);
1339    my ($assoc_filename) = @_;
1340
1341    my $i=0;
1342    while ($i < scalar (@{$self->{'associated_files'}})) {
1343    if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1344        splice (@{$self->{'associated_files'}}, $i, 1);
1345    } else {
1346        $i++;
1347    }
1348    }
1349}
1350
1351sub reset_nextsection_ptr {
1352    my $self = shift (@_);
1353    my ($section) = @_;
1354   
1355    my $section_ptr = $self->_lookup_section($section);
1356    $section_ptr->{'next_subsection'} = 1;
1357}
1358
13591;
Note: See TracBrowser for help on using the browser.