root/main/trunk/greenstone2/perllib/doc.pm @ 26536

Revision 26536, 37.8 KB (checked in by davidb, 7 years ago)

Introduction of two new OIDtype values (hash_on_full_filename and full_filename) designed to help provide more stable document IDs for collections that are rebuilt over time, including rebuilt after the Greenstone install has been upgraded

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr   te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47our $OIDcount = 0;
48
49# rename_method can be 'url', 'none', 'base64'
50sub new {
51    my $class = shift (@_);
52    my ($source_filename, $doc_type, $rename_method) = @_;
53
54
55    my $self = bless {'associated_files'=>[],
56              'subsection_order'=>[],
57              'next_subsection'=>1,
58              'subsections'=>{},
59              'metadata'=>[],
60              'text'=>"",
61              'OIDtype'=>"hash"}, $class;
62
63    # used to set lastmodified here, but this can screw up the HASH ids, so
64    # the docsave processor now calls set_lastmodified
65
66    $self->set_source_path($source_filename);
67   
68    if (defined $source_filename) {
69    $source_filename = &util::filename_within_collection($source_filename);
70    print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
71    $self->set_source_filename ($source_filename, $rename_method);
72    }
73
74    $self->set_doc_type ($doc_type) if defined $doc_type;
75
76    return $self;
77}
78
79
80sub set_source_path
81{
82    my $self = shift @_;
83    my ($source_filename) = @_;
84
85    if (defined $source_filename) {
86    # On Windows the source_filename can be in terse DOS format
87    # e.g. test~1.txt
88
89    $self->{'terse_source_path'} = $source_filename;
90
91    if (-e $source_filename) {
92        # See if we can do better for Windows with a filename
93        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
94        require Win32;
95        $self->{'source_path'} = Win32::GetLongPathName($source_filename);
96        }
97        else {
98        # For Unix-based systems, there is no difference between the two
99        $self->{'source_path'} = $source_filename;
100        }
101    }
102    else {
103        print STDERR "Warning: In doc::set_source_path(), file\n";
104        print STDERR "           $source_filename\n";
105        print STDERR "         does not exist\n";
106       
107        # (default) Set it to whatever we were given
108        $self->{'source_path'} = $source_filename;
109    }   
110    }
111    else {
112    # Previous code for setting source_path allowed for
113    # it to be undefined, so continue this practice
114    $self->{'terse_source_path'} = undef;
115    $self->{'source_path'} = undef;
116    }
117}
118
119
120sub get_source_path
121{
122    my $self = shift @_;
123
124    return $self->{'terse_source_path'};
125}
126
127# set lastmodified for OAI purposes, added by GRB, moved by kjdon
128sub set_oailastmodified {
129    my $self = shift (@_);
130
131    my $source_path = $self->{'terse_source_path'};
132   
133    if (defined $source_path && (-e $source_path)) {
134    my $current_time = time;
135
136    my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
137        $wday, $yday, $isdst) = localtime($current_time);
138
139    my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
140
141    $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
142    $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
143    }
144}
145
146# no longer used for OAI purposes, since lastmodified is not what we want as the
147# Datestamp of a document. This doc metadata may be useful for general purposes.
148sub set_lastmodified {
149    my $self = shift (@_);
150
151    my $source_path = $self->{'terse_source_path'};
152   
153    if (defined $source_path && (-e $source_path)) {
154
155        my $file_stat = stat($source_path);
156    my $mtime = $file_stat->mtime;
157    my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
158        $wday, $yday, $isdst) = localtime($mtime);
159
160    my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
161
162    $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
163    $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
164    }
165}
166
167# clone the $self object
168sub duplicate {
169    my $self = shift (@_);
170
171    my $newobj = {};
172   
173    foreach my $k (keys %$self) {
174    $newobj->{$k} = &clone ($self->{$k});
175    }
176
177    bless $newobj, ref($self);
178    return $newobj;
179}
180
181sub clone {
182    my ($from) = @_;
183    my $type = ref ($from);
184
185    if ($type eq "HASH") {
186    my $to = {};
187    foreach my $key (keys %$from) {
188        $to->{$key} = &clone ($from->{$key});
189    }
190    return $to;
191    } elsif ($type eq "ARRAY") {
192    my $to = [];
193    foreach my $v (@$from) {
194        push (@$to, &clone ($v));
195    }
196    return $to;
197    } else {
198    return $from;
199    }
200}
201
202sub set_OIDtype {
203    my $self = shift (@_);
204    my ($type, $metadata) = @_;
205
206    if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
207    $self->{'OIDtype'} = $type;
208    } else {
209    $self->{'OIDtype'} = "hash";
210    }
211
212    if ($type =~ /^assigned$/) {
213    if (defined $metadata) {
214        $self->{'OIDmetadata'} = $metadata;
215    } else {
216        $self->{'OIDmetadata'} = "dc.Identifier";
217    }
218    }
219}
220
221# rename_method can be 'url', 'none', 'base64'
222sub set_source_filename {
223    my $self = shift (@_);
224    my ($source_filename, $rename_method) = @_;
225
226    # Since the gsdlsourcefilename element goes into the doc.xml it has
227    # to be utf8. However, it should also *represent* the source filename
228    # (in the import directory) which may not be utf8 at all.
229    # For instance, if this meta element (gsdlsourcefilename) will be used
230    # by other applications that parse doc.xml in order to locate
231    # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
232    # encode the real filename as this is a binary-to-text encoding meaning
233    # that the resulting string is ASCII (utf8). Decoding will give the original.
234   
235#    print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
236
237    # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
238    $source_filename = $self->encode_filename($source_filename, $rename_method);
239#    my ($srcfilename,$dirname,$suffix)
240#   = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
241#    print STDERR "-> $srcfilename -> ";
242#    $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
243#    $source_filename = &util::filename_cat($dirname, $srcfilename);
244#    print STDERR "$source_filename\n";
245   
246    $self->set_utf8_metadata_element ($self->get_top_section(),
247                 "gsdlsourcefilename",
248                 $source_filename);
249}
250
251sub encode_filename {
252    my $self = shift (@_);
253    my ($source_filename, $rename_method) = @_;
254
255     my ($srcfilename,$dirname,$suffix)
256    = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
257#    print STDERR "-> $srcfilename -> ";
258    $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
259    $source_filename = &util::filename_cat($dirname, $srcfilename);
260
261    return $source_filename;
262}
263
264sub set_converted_filename {
265    my $self = shift (@_);
266    my ($converted_filename) = @_;
267
268    # we know the converted filename is utf8
269    $self->set_utf8_metadata_element ($self->get_top_section(),
270                 "gsdlconvertedfilename",
271                 $converted_filename);
272}
273
274# returns the source_filename as it was provided
275sub get_unmodified_source_filename {
276    my $self = shift (@_);
277
278    return $self->{'terse_source_path'};
279}
280
281# returns the source_filename with whatever rename_method was given
282sub get_source_filename {
283    my $self = shift (@_);
284
285    return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
286}
287
288
289
290# returns converted filename if available else returns source filename
291sub get_filename_for_hashing {
292    my $self = shift (@_);
293
294    my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
295
296    if (!defined $filename) {
297    my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
298    # if NULPlug processed file, then don't give a filename
299    if (defined $plugin_name && $plugin_name eq "NULPlug") {
300        $filename = undef;
301    } else { # returns the URL encoded source filename!
302        $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
303    }
304    }
305
306    if (!&util::filename_is_absolute($filename)) {
307    $filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename);
308    }
309
310    return $filename;
311}
312
313sub set_doc_type {
314    my $self = shift (@_);
315    my ($doc_type) = @_;
316
317    $self->set_metadata_element ($self->get_top_section(),
318                 "gsdldoctype",
319                 $doc_type);
320}
321
322# returns the gsdldoctype as it was provided
323# the default of "indexed_doc" is used if no document
324# type was provided
325sub get_doc_type {
326    my $self = shift (@_);
327
328    my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
329    return $doc_type if (defined $doc_type);
330    return "indexed_doc";
331}
332
333
334# look up the reference to the a particular section
335sub _lookup_section {
336    my $self = shift (@_);
337    my ($section) = @_;
338
339    my ($num);
340    my $sectionref = $self;
341
342    while (defined $section && $section ne "") {
343   
344    ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
345   
346    $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
347   
348    $section = "" unless defined $section;
349   
350
351    if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
352        $sectionref = $sectionref->{'subsections'}->{$num};
353    } else {
354        return undef;
355    }
356    }
357   
358    return $sectionref;
359}
360
361# calculate OID by hashing the contents of the document
362sub _calc_OID {
363    my $self = shift (@_);
364    my ($filename) = @_;
365
366
367    my $osexe = &util::get_os_exe();
368
369    my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
370                       $ENV{'GSDLOS'},"hashfile$osexe");
371
372    my $result = "NULL";
373
374   
375    if (-e "$hashfile_exe") {
376#   $result = `\"$hashfile_exe\" \"$filename\"`;
377#   $result = `hashfile$osexe \"$filename\" 2>&1`;
378    $result = `hashfile$osexe \"$filename\"`;
379
380    ($result) = $result =~ /:\s*([0-9a-f]+)/i;
381    } else {
382    print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
383    }
384    return "HASH$result";
385}
386
387# methods dealing with OID, not groups of them.
388
389# if $OID is not provided one is calculated
390sub set_OID {
391    my $self = shift (@_);
392    my ($OID) = @_;
393   
394    my $use_hash_oid = 0;
395    # if an OID wasn't provided calculate one
396    if (!defined $OID) {
397    $OID = "NULL";
398    if ($self->{'OIDtype'} =~ /^hash/) {
399        $use_hash_oid = 1;
400    } elsif ($self->{'OIDtype'} eq "incremental") {
401        $OID = "D" . $OIDcount;
402        $OIDcount ++;
403    } elsif ($self->{'OIDtype'} eq "filename") {
404        my $filename = $self->get_source_filename();
405        $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
406        $OID = &util::tidy_up_oid($OID);
407    } elsif ($self->{'OIDtype'} eq "full_filename") {
408        my $source_filename = $self->get_source_filename();
409        my $dirsep = &util::get_os_dirsep();
410
411        $source_filename =~ s/^import$dirsep//;
412        $source_filename =~ s/$dirsep/-/g;
413        $source_filename =~ s/\./_/g;
414
415        $OID = $source_filename;
416        $OID = &util::tidy_up_oid($OID);
417    } elsif ($self->{'OIDtype'} eq "dirname") {
418        $OID = 'J';
419        my $filename = $self->get_source_filename();
420        if (defined($filename)) { # && -e $filename) {
421        $OID = &File::Basename::dirname($filename);
422        if (defined $OID) {
423            $OID = 'J'.&File::Basename::basename($OID);
424            $OID = &util::tidy_up_oid($OID);
425        } else {
426            print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
427            $use_hash_oid = 1;
428        }
429        } else {
430        print STDERR "Failed to find filename, generating hash id\n";
431        $use_hash_oid = 1;
432        }   
433       
434    } elsif ($self->{'OIDtype'} eq "assigned") {
435        my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
436        if (defined $identifier && $identifier ne "") {
437        $OID = $identifier;
438        $OID = &util::tidy_up_oid($OID);
439        } else {
440        # need a hash id
441        print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
442        $use_hash_oid = 1;
443        }
444               
445    } else {
446        $use_hash_oid = 1;
447    }
448
449    if ($use_hash_oid) {
450        my $hash_on_file = 1;
451        my $hash_on_ga_xml = 0;
452
453        if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
454        $hash_on_file = 0;
455        $hash_on_ga_xml = 1;
456        }
457
458        if ($self->{'OIDtype'} eq "hash_on_full_filename") {
459        $hash_on_file = 0;
460        $hash_on_ga_xml = 0;
461
462        my $source_filename = $self->get_source_filename();
463        my $dirsep = &util::get_os_dirsep();
464
465        $source_filename =~ s/^import$dirsep//;
466        $source_filename =~ s/$dirsep/-/g;
467        $source_filename =~ s/\./_/g;
468       
469        # If the filename is very short then (handled naively)
470        # this can cause conjestion in the hash-values
471        # computed, leading documents sharing the same leading
472        # Hex values in the computed has.
473        #
474        # The solution taken here is to replace the name of
475        # the file name a sufficient number of times (up to
476        # the character limit defined in 'rep_limit' and
477        # make that the content that is hashed on
478
479        # *** Think twice before changing the following value
480        # as it will break backward compatability of computed
481        # document HASH values
482
483        my $rep_limit  = 256;
484        my $hash_content = undef;
485
486        if (length($source_filename)<$rep_limit) {
487            my $rep_string = "$source_filename|";
488            my $rs_len = length($rep_string);
489
490            my $clone_times = int(($rep_limit-1)/$rs_len) +1;
491           
492            $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
493        }
494        else {
495            $hash_content = $source_filename;
496        }
497
498        my $filename = &util::get_tmp_filename();
499        if (!open (OUTFILE, ">:utf8", $filename)) {
500            print STDERR "doc::set_OID could not write to $filename\n";
501        } else {
502            print OUTFILE $hash_content;
503            close (OUTFILE);
504        }
505        $OID = $self->_calc_OID ($filename);
506
507        print STDERR "****!!! the computed hash for: '", $source_filename, "' is: ", $OID,"\n\n";
508
509        &util::rm ($filename);
510        }
511
512        if ($hash_on_file) {
513        # "hash" OID - feed file to hashfile.exe
514        my $filename = $self->get_filename_for_hashing();
515       
516        # -z: don't want to hash on the file if it is zero size
517        if (defined($filename) && -e $filename && !-z $filename) {
518            $OID = $self->_calc_OID ($filename);
519        } else {
520            $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
521        }
522        }
523
524        if ($hash_on_ga_xml) {
525        # In addition being asked to explicity calculate the has based on the GA file,
526        # can also end up coming into this block is doing 'hash_on_file' but the file
527        # itself is of zero bytes (as could be the case with 'doc.nul' file
528
529        my $filename = &util::get_tmp_filename();
530        if (!open (OUTFILE, ">:utf8", $filename)) {
531            print STDERR "doc::set_OID could not write to $filename\n";
532        } else {
533            my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
534            print OUTFILE $doc_text;
535            close (OUTFILE);
536        }
537        $OID = $self->_calc_OID ($filename);
538        &util::rm ($filename);
539        }
540    }
541    }
542    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
543}
544
545# this uses hashdoc (embedded c thingy) which is faster but still
546# needs a little work to be suffiently stable
547sub ___set_OID {
548    my $self = shift (@_);
549    my ($OID) = @_;
550
551    # if an OID wasn't provided then calculate hash value based on document
552    if (!defined $OID)
553    {
554    my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
555    my $hash_len = length($hash_text);
556
557        $OID = &hashdoc::buffer($hash_text,$hash_len);
558    }
559
560    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
561}
562
563# returns the OID for this document
564sub get_OID {
565    my $self = shift (@_);
566    my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
567    return $OID if (defined $OID);
568    return "NULL";
569}
570
571sub delete_OID {
572    my $self = shift (@_);
573   
574    $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
575}
576
577
578# methods for manipulating section names
579
580# returns the name of the top-most section (the top
581# level of the document
582sub get_top_section {
583    my $self = shift (@_);
584   
585    return "";
586}
587
588# returns a section
589sub get_parent_section {
590    my $self = shift (@_);
591    my ($section) = @_;
592
593    $section =~ s/(^|\.)\d+$//;
594
595    return $section;
596}
597
598# returns the first child section (or the end child
599# if there isn't any)
600sub get_begin_child {
601    my $self = shift (@_);
602    my ($section) = @_;
603
604    my $section_ptr = $self->_lookup_section($section);
605    return "" unless defined $section_ptr;
606
607    if (defined $section_ptr->{'subsection_order'}->[0]) {
608    return "$section.$section_ptr->{'subsection_order'}->[0]";
609    }
610
611    return $self->get_end_child ($section);
612}
613
614# returns the next child of a parent section
615sub get_next_child {
616    my $self = shift (@_);
617    my ($section) = @_;
618   
619    my $parent_section = $self->get_parent_section($section);
620    my $parent_section_ptr = $self->_lookup_section($parent_section);
621    return undef unless defined $parent_section_ptr;
622
623    my ($section_num) = $section =~ /(\d+)$/;
624    return undef unless defined $section_num;
625
626    my $i = 0;
627    my $section_order = $parent_section_ptr->{'subsection_order'};
628    while ($i < scalar(@$section_order)) {
629    last if $section_order->[$i] eq $section_num;
630    $i++;
631    }
632
633    $i++; # the next child
634    if ($i < scalar(@$section_order)) {
635    return $section_order->[$i] if $parent_section eq "";
636    return "$parent_section.$section_order->[$i]";
637    }
638
639    # no more sections in this level
640    return undef;
641}
642
643# returns a reference to a list of children
644sub get_children {
645    my $self = shift (@_);
646    my ($section) = @_;
647
648    my $section_ptr = $self->_lookup_section($section);
649    return [] unless defined $section_ptr;
650
651    my @children = @{$section_ptr->{'subsection_order'}};
652
653    map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
654    return \@children;
655}
656
657# returns the child section one past the last one (which
658# is coded as "0")
659sub get_end_child {
660    my $self = shift (@_);
661    my ($section) = @_;
662
663    return $section . ".0" unless $section eq "";
664    return "0";
665}
666
667# returns the next section in book order
668sub get_next_section {
669    my $self = shift (@_);
670    my ($section) = @_;
671
672    return undef unless defined $section;
673
674    my $section_ptr = $self->_lookup_section($section);
675    return undef unless defined $section_ptr;
676
677    # first try to find first child
678    if (defined $section_ptr->{'subsection_order'}->[0]) {
679    return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
680    return "$section.$section_ptr->{'subsection_order'}->[0]";
681    }
682
683    do {
684    # try to find sibling
685    my $next_child = $self->get_next_child ($section);
686    return $next_child if (defined $next_child);
687
688    # move up one level
689    $section = $self->get_parent_section ($section);
690    } while $section =~ /\d/;
691
692    return undef;
693}
694
695sub is_leaf_section {
696    my $self = shift (@_);
697    my ($section) = @_;
698
699    my $section_ptr = $self->_lookup_section($section);
700    return 1 unless defined $section_ptr;
701
702    return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
703}
704
705# methods for dealing with sections
706
707# returns the name of the inserted section
708sub insert_section {
709    my $self = shift (@_);
710    my ($before_section) = @_;
711
712    # get the child to insert before and its parent section
713    my $parent_section = "";
714    my $before_child = "0";
715    my @before_section = split (/\./, $before_section);
716    if (scalar(@before_section) > 0) {
717    $before_child = pop (@before_section);
718    $parent_section = join (".", @before_section);
719    }
720
721    my $parent_section_ptr = $self->_lookup_section($parent_section);
722    if (!defined $parent_section_ptr) {
723    print STDERR "doc::insert_section couldn't find parent section " .
724        "$parent_section\n";
725    return;
726    }
727
728    # get the next section number
729    my $section_num = $parent_section_ptr->{'next_subsection'}++;
730
731    my $i = 0;
732    while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
733       $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
734    $i++;
735    }
736   
737    # insert the section number into the order list
738    splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
739
740    # add this section to the parent section
741    my $section_ptr = {'subsection_order'=>[],
742               'next_subsection'=>1,
743               'subsections'=>{},
744               'metadata'=>[],
745               'text'=>""};
746    $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
747
748    # work out the full section number
749    my $section = $parent_section;
750    $section .= "." unless $section eq "";
751    $section .= $section_num;
752   
753    return $section;
754}
755
756# creates a pre-named section
757sub create_named_section {
758    my $self = shift (@_);
759    my ($mastersection) = @_;
760
761    my ($num);
762    my $section = $mastersection;
763    my $sectionref = $self;
764
765    while ($section ne "") {
766    ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
767    $num =~ s/^0+(\d)/$1/; # remove leading 0s
768    $section = "" unless defined $section;
769   
770    if (defined $num) {
771        if (!defined $sectionref->{'subsections'}->{$num}) {
772        push (@{$sectionref->{'subsection_order'}}, $num);
773        $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
774                            'next_subsection'=>1,
775                            'subsections'=>{},
776                            'metadata'=>[],
777                            'text'=>""};
778        if ($num >= $sectionref->{'next_subsection'}) {
779            $sectionref->{'next_subsection'} = $num + 1;
780        }
781        }
782        $sectionref = $sectionref->{'subsections'}->{$num};
783
784    } else {
785        print STDERR "doc::create_named_section couldn't create section ";
786        print STDERR "$mastersection\n";
787        last;
788    }
789    }
790}
791
792# returns a reference to a list of subsections
793sub list_subsections {
794    my $self = shift (@_);
795    my ($section) = @_;
796
797    my $section_ptr = $self->_lookup_section ($section);
798    if (!defined $section_ptr) {
799    print STDERR "doc::list_subsections couldn't find section $section\n";
800    return [];
801    }
802
803    return [@{$section_ptr->{'subsection_order'}}];
804}
805
806sub delete_section {
807    my $self = shift (@_);
808    my ($section) = @_;
809
810#    my $section_ptr = {'subsection_order'=>[],
811#              'next_subsection'=>1,
812#              'subsections'=>{},
813#              'metadata'=>[],
814#              'text'=>""};
815
816    # if this is the top section reset everything
817    if ($section eq "") {
818    $self->{'subsection_order'} = [];
819    $self->{'subsections'} = {};
820    $self->{'metadata'} = [];
821    $self->{'text'} = "";
822    return;
823    }
824
825    # find the parent of the section to delete
826    my $parent_section = "";
827    my $child = "0";
828    my @section = split (/\./, $section);
829    if (scalar(@section) > 0) {
830    $child = pop (@section);
831    $parent_section = join (".", @section);
832    }
833
834    my $parent_section_ptr = $self->_lookup_section($parent_section);
835    if (!defined $parent_section_ptr) {
836    print STDERR "doc::delete_section couldn't find parent section " .
837        "$parent_section\n";
838    return;
839    }
840
841    # remove this section from the subsection_order list
842    my $i = 0;
843    while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
844    if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
845        splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
846        last;
847    }
848    $i++;
849    }
850
851    # remove this section from the subsection hash
852    if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
853    undef $parent_section_ptr->{'subsections'}->{$child};
854    }
855}
856
857#--
858# methods for dealing with metadata
859
860# set_metadata_element and get_metadata_element are for metadata
861# which should only have one value. add_meta_data and get_metadata
862# are for metadata which can have more than one value.
863
864# returns the first metadata value which matches field
865
866# This version of get metadata element works much like the one above,
867# except it allows for the namespace portion of a metadata element to
868# be ignored, thus if you are searching for dc.Title, the first piece
869# of matching metadata ending with the name Title (once any namespace
870# is removed) would be returned.
871# 28-11-2003 John Thompson
872sub get_metadata_element {
873    my $self = shift (@_);
874    my ($section, $field, $ignore_namespace) = @_;
875    my ($data);
876
877    $ignore_namespace = 0 unless defined $ignore_namespace;
878
879    my $section_ptr = $self->_lookup_section($section);
880    if (!defined $section_ptr) {
881    print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
882    return;
883    }
884
885    # Remove any namespace if we are being told to ignore them
886    if($ignore_namespace) {
887    $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
888    }
889
890    foreach $data (@{$section_ptr->{'metadata'}}) {
891
892    my $data_name = $data->[0];
893
894    # Remove any namespace if we are being told to ignore them
895    if($ignore_namespace) {
896        $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
897    }
898    # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
899    $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
900   
901    return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
902    }
903   
904    return undef; # was not found
905}
906
907# returns a list of the form [value1, value2, ...]
908sub get_metadata {
909    my $self = shift (@_);
910    my ($section, $field, $ignore_namespace) = @_;
911    my ($data);
912
913    $ignore_namespace = 0 unless defined $ignore_namespace;
914
915    my $section_ptr = $self->_lookup_section($section);
916    if (!defined $section_ptr) {
917        print STDERR "doc::get_metadata couldn't find section ",
918        $section, "\n";
919        return;
920    }
921
922    # Remove any namespace if we are being told to ignore them
923    if($ignore_namespace) {
924    $field =~ s/^.*\.//;
925    }
926
927    my @metadata = ();
928    foreach $data (@{$section_ptr->{'metadata'}}) {
929
930    my $data_name = $data->[0];
931
932    # Remove any namespace if we are being told to ignore them
933    if($ignore_namespace) {
934        $data_name =~ s/^.*\.//;
935    }   
936    # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
937    $data_name =~ s/^ex\.([^.]+)$/$1/;
938
939        push (@metadata, $data->[1]) if ($data_name eq $field);
940    }
941
942    return \@metadata;
943}
944
945sub get_metadata_hashmap {
946    my $self = shift (@_);
947    my ($section, $opt_namespace) = @_;
948   
949    my $section_ptr = $self->_lookup_section($section);
950    if (!defined $section_ptr) {
951            print STDERR "doc::get_metadata couldn't find section ",
952            $section, "\n";
953            return;
954        }
955
956    my $metadata_hashmap = {};
957    foreach my $data (@{$section_ptr->{'metadata'}}) {
958            my $metaname = $data->[0];
959         
960            if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
961                if (!defined $metadata_hashmap->{$metaname}) {
962                    $metadata_hashmap->{$metaname} = [];
963                  }
964                my $metaval_list = $metadata_hashmap->{$metaname};
965                push(@$metaval_list, $data->[1]);
966              }
967          }
968   
969    return $metadata_hashmap;
970}
971
972# returns a list of the form [[field,value],[field,value],...]
973sub get_all_metadata {
974    my $self = shift (@_);
975    my ($section) = @_;
976
977    my $section_ptr = $self->_lookup_section($section);
978    if (!defined $section_ptr) {
979    print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
980    return;
981    }
982   
983    return $section_ptr->{'metadata'};
984}
985
986# $value is optional
987sub delete_metadata {
988    my $self = shift (@_);
989    my ($section, $field, $value) = @_;
990
991    my $section_ptr = $self->_lookup_section($section);
992    if (!defined $section_ptr) {
993    print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
994    return;
995    }
996
997    my $i = 0;
998    while ($i < scalar (@{$section_ptr->{'metadata'}})) {
999    if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1000        (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1001        splice (@{$section_ptr->{'metadata'}}, $i, 1);
1002    } else {
1003        $i++;
1004    }
1005    }
1006}
1007
1008sub delete_all_metadata {
1009    my $self = shift (@_);
1010    my ($section) = @_;
1011
1012    my $section_ptr = $self->_lookup_section($section);
1013    if (!defined $section_ptr) {
1014    print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1015    return;
1016    }
1017   
1018    $section_ptr->{'metadata'} = [];
1019}
1020
1021sub set_metadata_element {
1022    my $self = shift (@_);
1023    my ($section, $field, $value) = @_;
1024
1025    $self->set_utf8_metadata_element ($section, $field,
1026                      &unicode::ascii2utf8(\$value));
1027}
1028
1029# set_utf8_metadata_element assumes the text has already been
1030# converted to the UTF-8 encoding.
1031sub set_utf8_metadata_element {
1032    my $self = shift (@_);
1033    my ($section, $field, $value) = @_;
1034   
1035    $self->delete_metadata ($section, $field);
1036    $self->add_utf8_metadata ($section, $field, $value);
1037}
1038
1039
1040# add_metadata assumes the text is in (extended) ascii form. For
1041# text which has already been converted to the UTF-8 format use
1042# add_utf8_metadata.
1043sub add_metadata {
1044    my $self = shift (@_);
1045    my ($section, $field, $value) = @_;
1046   
1047    $self->add_utf8_metadata ($section, $field,
1048                  &unicode::ascii2utf8(\$value));
1049}
1050
1051sub add_utf8_metadata {
1052    my $self = shift (@_);
1053    my ($section, $field, $value) = @_;
1054   
1055    #    my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1056    #    my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1057    #    print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1058
1059    my $section_ptr = $self->_lookup_section($section);
1060    if (!defined $section_ptr) {
1061    print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1062    return;
1063    }
1064    if (!defined $value) {
1065    print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1066    return;
1067    }
1068    if (!defined $field) {
1069    print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1070    return;
1071    }
1072   
1073    #print STDERR "###$field=$value\n";
1074
1075    # For now, supress this check.  Given that text data read in is now
1076    # Unicode aware, then the following block of code can (ironically enough)
1077    # cause our unicode compliant string to be re-encoded (leading to
1078    # a double-encoded UTF-8 string, which we definitely don't want!).
1079   
1080
1081    # double check that the value is utf-8
1082    #    if (!&unicode::check_is_utf8($value)) {
1083    #   print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1084    #   &unicode::ensure_utf8(\$value);
1085    #   print STDERR " Tried converting to utf8: $value\n";
1086    #    }
1087
1088    #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1089    if($field eq "Latitude" || $field eq "Longitude")
1090    {
1091        my $direction;
1092        if($value =~ m/^-/)
1093        {
1094            $direction = ($field eq "Latitude") ? "S" : "W";
1095        }
1096        else
1097        {
1098            $direction = ($field eq "Latitude") ? "N" : "E";
1099        }
1100       
1101        my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1102        if(defined $beforeDec && defined $afterDec)
1103        {
1104            my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1105            push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1106           
1107            for(my $i = 2; $i <= 4; $i++)
1108            {
1109                if(length($afterDec) >= $i)
1110                {
1111                    push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1112                }
1113            }
1114           
1115            #Only add the metadata if it has not already been added
1116            my $metaMap = $self->get_metadata_hashmap($section);
1117        }
1118    }
1119
1120    push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1121}
1122
1123
1124# methods for dealing with text
1125
1126# returns the text for a section
1127sub get_text {
1128    my $self = shift (@_);
1129    my ($section) = @_;
1130
1131    my $section_ptr = $self->_lookup_section($section);
1132    if (!defined $section_ptr) {
1133    print STDERR "doc::get_text couldn't find section " .
1134        "$section\n";
1135    return "";
1136    }
1137
1138    return $section_ptr->{'text'};
1139}
1140
1141# returns the (utf-8 encoded) length of the text for a section
1142sub get_text_length {
1143    my $self = shift (@_);
1144    my ($section) = @_;
1145
1146    my $section_ptr = $self->_lookup_section($section);
1147    if (!defined $section_ptr) {
1148    print STDERR "doc::get_text_length couldn't find section " .
1149        "$section\n";
1150    return 0;
1151    }
1152
1153    return length ($section_ptr->{'text'});
1154}
1155
1156# returns the total length for all the sections
1157sub get_total_text_length {
1158    my $self = shift (@_);
1159
1160    my $section = $self->get_top_section();
1161    my $length = 0;
1162    while (defined $section) {
1163    $length += $self->get_text_length($section);
1164    $section = $self->get_next_section($section);
1165    }
1166    return $length;
1167}
1168
1169sub delete_text {
1170    my $self = shift (@_);
1171    my ($section) = @_;
1172
1173    my $section_ptr = $self->_lookup_section($section);
1174    if (!defined $section_ptr) {
1175    print STDERR "doc::delete_text couldn't find section " .
1176        "$section\n";
1177    return;
1178    }
1179
1180    $section_ptr->{'text'} = "";
1181}
1182
1183# add_text assumes the text is in (extended) ascii form. For
1184# text which has been already converted to the UTF-8 format
1185# use add_utf8_text.
1186sub add_text {
1187    my $self = shift (@_);
1188    my ($section, $text) = @_;
1189
1190    # convert the text to UTF-8 encoded unicode characters
1191    # and add the text
1192    $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1193}
1194
1195
1196# add_utf8_text assumes the text to be added has already
1197# been converted to the UTF-8 encoding. For ascii text use
1198# add_text
1199sub add_utf8_text {
1200    my $self = shift (@_);
1201    my ($section, $text) = @_;
1202
1203    my $section_ptr = $self->_lookup_section($section);
1204    if (!defined $section_ptr) {
1205    print STDERR "doc::add_utf8_text couldn't find section " .
1206        "$section\n";
1207    return;
1208    }
1209
1210    $section_ptr->{'text'} .= $text;
1211}
1212
1213# returns the Source meta, which is the utf8 filename generated.
1214# Added a separate method here for convenience
1215sub get_source {
1216    my $self = shift (@_);
1217    return $self->get_metadata_element ($self->get_top_section(), "Source");
1218}
1219
1220# returns the SourceFile meta, which is the url reference to the URL-encoded
1221# version of Source (the utf8 filename). Added a separate method here for convenience
1222sub get_sourcefile {
1223    my $self = shift (@_);
1224    return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1225}
1226
1227# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1228# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1229sub get_assocfile_from_sourcefile {
1230    my $self = shift (@_);
1231   
1232    # get the SourceFile meta, which is a *URL* to a file on the filesystem
1233    my $top_section = $self->get_top_section();
1234    my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1235
1236    # get the actual filename as it exists on the filesystem which this url refers to
1237    $source_file = &unicode::url_to_filename($source_file);
1238    my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1239    return $assocfilename;
1240}
1241
1242# methods for dealing with associated files
1243
1244# a file is associated with a document, NOT a section.
1245# if section is defined it is noted in the data structure
1246# only so that files associated from a particular section
1247# may be removed later (using delete_section_assoc_files)
1248sub associate_file {
1249    my $self = shift (@_);
1250    my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1251    $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1252
1253    # remove all associated files with the same name
1254    $self->delete_assoc_file ($assoc_filename);
1255
1256    # Too harsh a requirement
1257    # Definitely get HTML docs, for example, with some missing
1258    # support files
1259#    if (!&util::fd_exists($real_filename)) {
1260#   print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1261#   exit -1;
1262#    }
1263
1264#    print STDERR "**** is the following a UTF8 rep of *real* filename?\n   $real_filename\n";
1265#    print STDERR "****##### so, ensure it is before storing?!?!?\n";
1266##    my $utf8_filename = Encode::encode("utf8",$filename);
1267
1268    push (@{$self->{'associated_files'}},
1269      [$real_filename, $assoc_filename, $mime_type, $section]);
1270}
1271
1272# returns a list of associated files in the form
1273#   [[real_filename, assoc_filename, mimetype], ...]
1274sub get_assoc_files {
1275    my $self = shift (@_);
1276
1277    return $self->{'associated_files'};
1278}
1279
1280# the following two methods used to keep track of original associated files
1281# for incremental building. eg a txt file used by an item file does not end
1282# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1283# file for incremental build
1284sub associate_source_file {
1285    my $self = shift (@_);
1286    my ($full_filename) = @_;
1287
1288    push (@{$self->{'source_assoc_files'}}, $full_filename);
1289
1290}
1291
1292sub get_source_assoc_files {
1293    my $self = shift (@_);
1294
1295    return $self->{'source_assoc_files'};
1296 
1297
1298}
1299sub metadata_file {
1300    my $self = shift (@_);
1301    my ($real_filename, $filename) = @_;
1302   
1303    push (@{$self->{'metadata_files'}},
1304      [$real_filename, $filename]);
1305}
1306
1307# used for writing out the archiveinf-doc info database, to list all the metadata files
1308sub get_meta_files {
1309    my $self = shift (@_);
1310
1311    return $self->{'metadata_files'};
1312}
1313
1314sub delete_section_assoc_files {
1315    my $self = shift (@_);
1316    my ($section) = @_;
1317
1318    my $i=0;
1319    while ($i < scalar (@{$self->{'associated_files'}})) {
1320    if (defined $self->{'associated_files'}->[$i]->[3] &&
1321        $self->{'associated_files'}->[$i]->[3] eq $section) {
1322        splice (@{$self->{'associated_files'}}, $i, 1);
1323    } else {
1324        $i++;
1325    }
1326    }
1327}
1328
1329sub delete_assoc_file {
1330    my $self = shift (@_);
1331    my ($assoc_filename) = @_;
1332
1333    my $i=0;
1334    while ($i < scalar (@{$self->{'associated_files'}})) {
1335    if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1336        splice (@{$self->{'associated_files'}}, $i, 1);
1337    } else {
1338        $i++;
1339    }
1340    }
1341}
1342
1343sub reset_nextsection_ptr {
1344    my $self = shift (@_);
1345    my ($section) = @_;
1346   
1347    my $section_ptr = $self->_lookup_section($section);
1348    $section_ptr->{'next_subsection'} = 1;
1349}
1350
13511;
Note: See TracBrowser for help on using the browser.