root/main/trunk/greenstone2/perllib/doc.pm @ 26221

Revision 26221, 35.5 KB (checked in by kjdon, 7 years ago)

new OIDtype, filename, will use the file name without any folders or file extension. Must be unique filenames in the collection. BasePlugin? add_OID method returns if an id has already been set

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr   te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use ghtml;
40use File::stat;
41##use hashdoc;
42use docprint;
43
44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
47our $OIDcount = 0;
48
49# rename_method can be 'url', 'none', 'base64'
50sub new {
51    my $class = shift (@_);
52    my ($source_filename, $doc_type, $rename_method) = @_;
53
54
55    my $self = bless {'associated_files'=>[],
56              'subsection_order'=>[],
57              'next_subsection'=>1,
58              'subsections'=>{},
59              'metadata'=>[],
60              'text'=>"",
61              'OIDtype'=>"hash"}, $class;
62
63    # used to set lastmodified here, but this can screw up the HASH ids, so
64    # the docsave processor now calls set_lastmodified
65
66    $self->set_source_path($source_filename);
67   
68    if (defined $source_filename) {
69    $source_filename = &util::filename_within_collection($source_filename);
70    print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
71    $self->set_source_filename ($source_filename, $rename_method);
72    }
73
74    $self->set_doc_type ($doc_type) if defined $doc_type;
75
76    return $self;
77}
78
79
80sub set_source_path
81{
82    my $self = shift @_;
83    my ($source_filename) = @_;
84
85    if (defined $source_filename) {
86    # On Windows the source_filename can be in terse DOS format
87    # e.g. test~1.txt
88
89    $self->{'terse_source_path'} = $source_filename;
90
91    if (-e $source_filename) {
92        # See if we can do better for Windows with a filename
93        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
94        require Win32;
95        $self->{'source_path'} = Win32::GetLongPathName($source_filename);
96        }
97        else {
98        # For Unix-based systems, there is no difference between the two
99        $self->{'source_path'} = $source_filename;
100        }
101    }
102    else {
103        print STDERR "Warning: In doc::set_source_path(), file\n";
104        print STDERR "           $source_filename\n";
105        print STDERR "         does not exist\n";
106       
107        # (default) Set it to whatever we were given
108        $self->{'source_path'} = $source_filename;
109    }   
110    }
111    else {
112    # Previous code for setting source_path allowed for
113    # it to be undefined, so continue this practice
114    $self->{'terse_source_path'} = undef;
115    $self->{'source_path'} = undef;
116    }
117}
118
119
120sub get_source_path
121{
122    my $self = shift @_;
123
124    return $self->{'terse_source_path'};
125}
126
127# set lastmodified for OAI purposes, added by GRB, moved by kjdon
128sub set_oailastmodified {
129    my $self = shift (@_);
130
131    my $source_path = $self->{'terse_source_path'};
132   
133    if (defined $source_path && (-e $source_path)) {
134    my $current_time = time;
135
136    my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
137        $wday, $yday, $isdst) = localtime($current_time);
138
139    my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
140
141    $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
142    $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
143    }
144}
145
146# no longer used for OAI purposes, since lastmodified is not what we want as the
147# Datestamp of a document. This doc metadata may be useful for general purposes.
148sub set_lastmodified {
149    my $self = shift (@_);
150
151    my $source_path = $self->{'terse_source_path'};
152   
153    if (defined $source_path && (-e $source_path)) {
154
155        my $file_stat = stat($source_path);
156    my $mtime = $file_stat->mtime;
157    my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
158        $wday, $yday, $isdst) = localtime($mtime);
159
160    my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
161
162    $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
163    $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
164    }
165}
166
167# clone the $self object
168sub duplicate {
169    my $self = shift (@_);
170
171    my $newobj = {};
172   
173    foreach my $k (keys %$self) {
174    $newobj->{$k} = &clone ($self->{$k});
175    }
176
177    bless $newobj, ref($self);
178    return $newobj;
179}
180
181sub clone {
182    my ($from) = @_;
183    my $type = ref ($from);
184
185    if ($type eq "HASH") {
186    my $to = {};
187    foreach my $key (keys %$from) {
188        $to->{$key} = &clone ($from->{$key});
189    }
190    return $to;
191    } elsif ($type eq "ARRAY") {
192    my $to = [];
193    foreach my $v (@$from) {
194        push (@$to, &clone ($v));
195    }
196    return $to;
197    } else {
198    return $from;
199    }
200}
201
202sub set_OIDtype {
203    my $self = shift (@_);
204    my ($type, $metadata) = @_;
205
206    if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|filename|dirname|assigned)$/) {
207    $self->{'OIDtype'} = $type;
208    } else {
209    $self->{'OIDtype'} = "hash";
210    }
211
212    if ($type =~ /^assigned$/) {
213    if (defined $metadata) {
214        $self->{'OIDmetadata'} = $metadata;
215    } else {
216        $self->{'OIDmetadata'} = "dc.Identifier";
217    }
218    }
219}
220
221# rename_method can be 'url', 'none', 'base64'
222sub set_source_filename {
223    my $self = shift (@_);
224    my ($source_filename, $rename_method) = @_;
225
226    # Since the gsdlsourcefilename element goes into the doc.xml it has
227    # to be utf8. However, it should also *represent* the source filename
228    # (in the import directory) which may not be utf8 at all.
229    # For instance, if this meta element (gsdlsourcefilename) will be used
230    # by other applications that parse doc.xml in order to locate
231    # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
232    # encode the real filename as this is a binary-to-text encoding meaning
233    # that the resulting string is ASCII (utf8). Decoding will give the original.
234   
235#    print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
236
237    # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
238    $source_filename = $self->encode_filename($source_filename, $rename_method);
239#    my ($srcfilename,$dirname,$suffix)
240#   = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
241#    print STDERR "-> $srcfilename -> ";
242#    $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
243#    $source_filename = &util::filename_cat($dirname, $srcfilename);
244#    print STDERR "$source_filename\n";
245   
246    $self->set_utf8_metadata_element ($self->get_top_section(),
247                 "gsdlsourcefilename",
248                 $source_filename);
249}
250
251sub encode_filename {
252    my $self = shift (@_);
253    my ($source_filename, $rename_method) = @_;
254
255     my ($srcfilename,$dirname,$suffix)
256    = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
257#    print STDERR "-> $srcfilename -> ";
258    $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
259    $source_filename = &util::filename_cat($dirname, $srcfilename);
260
261    return $source_filename;
262}
263
264sub set_converted_filename {
265    my $self = shift (@_);
266    my ($converted_filename) = @_;
267
268    # we know the converted filename is utf8
269    $self->set_utf8_metadata_element ($self->get_top_section(),
270                 "gsdlconvertedfilename",
271                 $converted_filename);
272}
273
274# returns the source_filename as it was provided
275sub get_unmodified_source_filename {
276    my $self = shift (@_);
277
278    return $self->{'terse_source_path'};
279}
280
281# returns the source_filename with whatever rename_method was given
282sub get_source_filename {
283    my $self = shift (@_);
284
285    return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
286}
287
288
289
290# returns converted filename if available else returns source filename
291sub get_filename_for_hashing {
292    my $self = shift (@_);
293
294    my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
295
296    if (!defined $filename) {
297    my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
298    # if NULPlug processed file, then don't give a filename
299    if (defined $plugin_name && $plugin_name eq "NULPlug") {
300        $filename = undef;
301    } else { # returns the URL encoded source filename!
302        $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
303    }
304    }
305
306    if (!&util::filename_is_absolute($filename)) {
307    $filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename);
308    }
309
310    return $filename;
311}
312
313sub set_doc_type {
314    my $self = shift (@_);
315    my ($doc_type) = @_;
316
317    $self->set_metadata_element ($self->get_top_section(),
318                 "gsdldoctype",
319                 $doc_type);
320}
321
322# returns the gsdldoctype as it was provided
323# the default of "indexed_doc" is used if no document
324# type was provided
325sub get_doc_type {
326    my $self = shift (@_);
327
328    my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
329    return $doc_type if (defined $doc_type);
330    return "indexed_doc";
331}
332
333
334# look up the reference to the a particular section
335sub _lookup_section {
336    my $self = shift (@_);
337    my ($section) = @_;
338
339    my ($num);
340    my $sectionref = $self;
341
342    while (defined $section && $section ne "") {
343   
344    ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
345   
346    $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
347   
348    $section = "" unless defined $section;
349   
350
351    if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
352        $sectionref = $sectionref->{'subsections'}->{$num};
353    } else {
354        return undef;
355    }
356    }
357   
358    return $sectionref;
359}
360
361# calculate OID by hashing the contents of the document
362sub _calc_OID {
363    my $self = shift (@_);
364    my ($filename) = @_;
365
366
367    my $osexe = &util::get_os_exe();
368
369    my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
370                       $ENV{'GSDLOS'},"hashfile$osexe");
371
372    my $result = "NULL";
373
374   
375    if (-e "$hashfile_exe") {
376#   $result = `\"$hashfile_exe\" \"$filename\"`;
377#   $result = `hashfile$osexe \"$filename\" 2>&1`;
378    $result = `hashfile$osexe \"$filename\"`;
379
380    ($result) = $result =~ /:\s*([0-9a-f]+)/i;
381    } else {
382    print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
383    }
384    return "HASH$result";
385}
386
387# methods dealing with OID, not groups of them.
388
389# if $OID is not provided one is calculated
390sub set_OID {
391    my $self = shift (@_);
392    my ($OID) = @_;
393   
394    my $use_hash_oid = 0;
395    # if an OID wasn't provided calculate one
396    if (!defined $OID) {
397    $OID = "NULL";
398    if ($self->{'OIDtype'} =~ /^hash/) {
399        $use_hash_oid = 1;
400    } elsif ($self->{'OIDtype'} eq "incremental") {
401        $OID = "D" . $OIDcount;
402        $OIDcount ++;
403    } elsif ($self->{'OIDtype'} eq "filename") {
404        my $filename = $self->get_source_filename();
405        $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
406        $OID = &util::tidy_up_oid($OID);
407    } elsif ($self->{'OIDtype'} eq "dirname") {
408        $OID = 'J';
409        my $filename = $self->get_source_filename();
410        if (defined($filename)) { # && -e $filename) {
411        $OID = &File::Basename::dirname($filename);
412        if (defined $OID) {
413            $OID = 'J'.&File::Basename::basename($OID);
414            $OID = &util::tidy_up_oid($OID);
415        } else {
416            print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
417            $use_hash_oid = 1;
418        }
419        } else {
420        print STDERR "Failed to find filename, generating hash id\n";
421        $use_hash_oid = 1;
422        }   
423       
424    } elsif ($self->{'OIDtype'} eq "assigned") {
425        my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
426        if (defined $identifier && $identifier ne "") {
427        $OID = $identifier;
428        $OID = &util::tidy_up_oid($OID);
429        } else {
430        # need a hash id
431        print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
432        $use_hash_oid = 1;
433        }
434               
435    } else {
436        $use_hash_oid = 1;
437    }
438
439    if ($use_hash_oid) {
440        my $hash_on_file = 1;
441        if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
442        $hash_on_file = 0;
443        }
444        if ($hash_on_file) {
445        # "hash" OID - feed file to hashfile.exe
446        my $filename = $self->get_filename_for_hashing();
447       
448        # -z: don't want to hash on the file if it is zero size
449        if (defined($filename) && -e $filename && !-z $filename) {
450            $OID = $self->_calc_OID ($filename);
451        } else {
452            $hash_on_file = 0;
453        }
454        }
455        if (!$hash_on_file) {
456        my $filename = &util::get_tmp_filename();
457        if (!open (OUTFILE, ">:utf8", $filename)) {
458            print STDERR "doc::set_OID could not write to $filename\n";
459        } else {
460            my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
461            print OUTFILE $doc_text;
462            close (OUTFILE);
463        }
464        $OID = $self->_calc_OID ($filename);
465        &util::rm ($filename);
466        }
467    }
468    }
469    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
470}
471
472# this uses hashdoc (embedded c thingy) which is faster but still
473# needs a little work to be suffiently stable
474sub ___set_OID {
475    my $self = shift (@_);
476    my ($OID) = @_;
477
478    # if an OID wasn't provided then calculate hash value based on document
479    if (!defined $OID)
480    {
481    my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
482    my $hash_len = length($hash_text);
483
484        $OID = &hashdoc::buffer($hash_text,$hash_len);
485    }
486
487    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
488}
489
490# returns the OID for this document
491sub get_OID {
492    my $self = shift (@_);
493    my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
494    return $OID if (defined $OID);
495    return "NULL";
496}
497
498sub delete_OID {
499    my $self = shift (@_);
500   
501    $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
502}
503
504
505# methods for manipulating section names
506
507# returns the name of the top-most section (the top
508# level of the document
509sub get_top_section {
510    my $self = shift (@_);
511   
512    return "";
513}
514
515# returns a section
516sub get_parent_section {
517    my $self = shift (@_);
518    my ($section) = @_;
519
520    $section =~ s/(^|\.)\d+$//;
521
522    return $section;
523}
524
525# returns the first child section (or the end child
526# if there isn't any)
527sub get_begin_child {
528    my $self = shift (@_);
529    my ($section) = @_;
530
531    my $section_ptr = $self->_lookup_section($section);
532    return "" unless defined $section_ptr;
533
534    if (defined $section_ptr->{'subsection_order'}->[0]) {
535    return "$section.$section_ptr->{'subsection_order'}->[0]";
536    }
537
538    return $self->get_end_child ($section);
539}
540
541# returns the next child of a parent section
542sub get_next_child {
543    my $self = shift (@_);
544    my ($section) = @_;
545   
546    my $parent_section = $self->get_parent_section($section);
547    my $parent_section_ptr = $self->_lookup_section($parent_section);
548    return undef unless defined $parent_section_ptr;
549
550    my ($section_num) = $section =~ /(\d+)$/;
551    return undef unless defined $section_num;
552
553    my $i = 0;
554    my $section_order = $parent_section_ptr->{'subsection_order'};
555    while ($i < scalar(@$section_order)) {
556    last if $section_order->[$i] eq $section_num;
557    $i++;
558    }
559
560    $i++; # the next child
561    if ($i < scalar(@$section_order)) {
562    return $section_order->[$i] if $parent_section eq "";
563    return "$parent_section.$section_order->[$i]";
564    }
565
566    # no more sections in this level
567    return undef;
568}
569
570# returns a reference to a list of children
571sub get_children {
572    my $self = shift (@_);
573    my ($section) = @_;
574
575    my $section_ptr = $self->_lookup_section($section);
576    return [] unless defined $section_ptr;
577
578    my @children = @{$section_ptr->{'subsection_order'}};
579
580    map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
581    return \@children;
582}
583
584# returns the child section one past the last one (which
585# is coded as "0")
586sub get_end_child {
587    my $self = shift (@_);
588    my ($section) = @_;
589
590    return $section . ".0" unless $section eq "";
591    return "0";
592}
593
594# returns the next section in book order
595sub get_next_section {
596    my $self = shift (@_);
597    my ($section) = @_;
598
599    return undef unless defined $section;
600
601    my $section_ptr = $self->_lookup_section($section);
602    return undef unless defined $section_ptr;
603
604    # first try to find first child
605    if (defined $section_ptr->{'subsection_order'}->[0]) {
606    return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
607    return "$section.$section_ptr->{'subsection_order'}->[0]";
608    }
609
610    do {
611    # try to find sibling
612    my $next_child = $self->get_next_child ($section);
613    return $next_child if (defined $next_child);
614
615    # move up one level
616    $section = $self->get_parent_section ($section);
617    } while $section =~ /\d/;
618
619    return undef;
620}
621
622sub is_leaf_section {
623    my $self = shift (@_);
624    my ($section) = @_;
625
626    my $section_ptr = $self->_lookup_section($section);
627    return 1 unless defined $section_ptr;
628
629    return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
630}
631
632# methods for dealing with sections
633
634# returns the name of the inserted section
635sub insert_section {
636    my $self = shift (@_);
637    my ($before_section) = @_;
638
639    # get the child to insert before and its parent section
640    my $parent_section = "";
641    my $before_child = "0";
642    my @before_section = split (/\./, $before_section);
643    if (scalar(@before_section) > 0) {
644    $before_child = pop (@before_section);
645    $parent_section = join (".", @before_section);
646    }
647
648    my $parent_section_ptr = $self->_lookup_section($parent_section);
649    if (!defined $parent_section_ptr) {
650    print STDERR "doc::insert_section couldn't find parent section " .
651        "$parent_section\n";
652    return;
653    }
654
655    # get the next section number
656    my $section_num = $parent_section_ptr->{'next_subsection'}++;
657
658    my $i = 0;
659    while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
660       $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
661    $i++;
662    }
663   
664    # insert the section number into the order list
665    splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
666
667    # add this section to the parent section
668    my $section_ptr = {'subsection_order'=>[],
669               'next_subsection'=>1,
670               'subsections'=>{},
671               'metadata'=>[],
672               'text'=>""};
673    $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
674
675    # work out the full section number
676    my $section = $parent_section;
677    $section .= "." unless $section eq "";
678    $section .= $section_num;
679   
680    return $section;
681}
682
683# creates a pre-named section
684sub create_named_section {
685    my $self = shift (@_);
686    my ($mastersection) = @_;
687
688    my ($num);
689    my $section = $mastersection;
690    my $sectionref = $self;
691
692    while ($section ne "") {
693    ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
694    $num =~ s/^0+(\d)/$1/; # remove leading 0s
695    $section = "" unless defined $section;
696   
697    if (defined $num) {
698        if (!defined $sectionref->{'subsections'}->{$num}) {
699        push (@{$sectionref->{'subsection_order'}}, $num);
700        $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
701                            'next_subsection'=>1,
702                            'subsections'=>{},
703                            'metadata'=>[],
704                            'text'=>""};
705        if ($num >= $sectionref->{'next_subsection'}) {
706            $sectionref->{'next_subsection'} = $num + 1;
707        }
708        }
709        $sectionref = $sectionref->{'subsections'}->{$num};
710
711    } else {
712        print STDERR "doc::create_named_section couldn't create section ";
713        print STDERR "$mastersection\n";
714        last;
715    }
716    }
717}
718
719# returns a reference to a list of subsections
720sub list_subsections {
721    my $self = shift (@_);
722    my ($section) = @_;
723
724    my $section_ptr = $self->_lookup_section ($section);
725    if (!defined $section_ptr) {
726    print STDERR "doc::list_subsections couldn't find section $section\n";
727    return [];
728    }
729
730    return [@{$section_ptr->{'subsection_order'}}];
731}
732
733sub delete_section {
734    my $self = shift (@_);
735    my ($section) = @_;
736
737#    my $section_ptr = {'subsection_order'=>[],
738#              'next_subsection'=>1,
739#              'subsections'=>{},
740#              'metadata'=>[],
741#              'text'=>""};
742
743    # if this is the top section reset everything
744    if ($section eq "") {
745    $self->{'subsection_order'} = [];
746    $self->{'subsections'} = {};
747    $self->{'metadata'} = [];
748    $self->{'text'} = "";
749    return;
750    }
751
752    # find the parent of the section to delete
753    my $parent_section = "";
754    my $child = "0";
755    my @section = split (/\./, $section);
756    if (scalar(@section) > 0) {
757    $child = pop (@section);
758    $parent_section = join (".", @section);
759    }
760
761    my $parent_section_ptr = $self->_lookup_section($parent_section);
762    if (!defined $parent_section_ptr) {
763    print STDERR "doc::delete_section couldn't find parent section " .
764        "$parent_section\n";
765    return;
766    }
767
768    # remove this section from the subsection_order list
769    my $i = 0;
770    while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
771    if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
772        splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
773        last;
774    }
775    $i++;
776    }
777
778    # remove this section from the subsection hash
779    if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
780    undef $parent_section_ptr->{'subsections'}->{$child};
781    }
782}
783
784#--
785# methods for dealing with metadata
786
787# set_metadata_element and get_metadata_element are for metadata
788# which should only have one value. add_meta_data and get_metadata
789# are for metadata which can have more than one value.
790
791# returns the first metadata value which matches field
792
793# This version of get metadata element works much like the one above,
794# except it allows for the namespace portion of a metadata element to
795# be ignored, thus if you are searching for dc.Title, the first piece
796# of matching metadata ending with the name Title (once any namespace
797# is removed) would be returned.
798# 28-11-2003 John Thompson
799sub get_metadata_element {
800    my $self = shift (@_);
801    my ($section, $field, $ignore_namespace) = @_;
802    my ($data);
803
804    $ignore_namespace = 0 unless defined $ignore_namespace;
805
806    my $section_ptr = $self->_lookup_section($section);
807    if (!defined $section_ptr) {
808    print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
809    return;
810    }
811
812    # Remove any namespace if we are being told to ignore them
813    if($ignore_namespace) {
814    $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
815    }
816
817    foreach $data (@{$section_ptr->{'metadata'}}) {
818
819    my $data_name = $data->[0];
820
821    # Remove any namespace if we are being told to ignore them
822    if($ignore_namespace) {
823        $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
824    }
825    # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
826    $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
827   
828    return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
829    }
830   
831    return undef; # was not found
832}
833
834# returns a list of the form [value1, value2, ...]
835sub get_metadata {
836    my $self = shift (@_);
837    my ($section, $field, $ignore_namespace) = @_;
838    my ($data);
839
840    $ignore_namespace = 0 unless defined $ignore_namespace;
841
842    my $section_ptr = $self->_lookup_section($section);
843    if (!defined $section_ptr) {
844        print STDERR "doc::get_metadata couldn't find section ",
845        $section, "\n";
846        return;
847    }
848
849    # Remove any namespace if we are being told to ignore them
850    if($ignore_namespace) {
851    $field =~ s/^.*\.//;
852    }
853
854    my @metadata = ();
855    foreach $data (@{$section_ptr->{'metadata'}}) {
856
857    my $data_name = $data->[0];
858
859    # Remove any namespace if we are being told to ignore them
860    if($ignore_namespace) {
861        $data_name =~ s/^.*\.//;
862    }   
863    # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
864    $data_name =~ s/^ex\.([^.]+)$/$1/;
865
866        push (@metadata, $data->[1]) if ($data_name eq $field);
867    }
868
869    return \@metadata;
870}
871
872sub get_metadata_hashmap {
873    my $self = shift (@_);
874    my ($section, $opt_namespace) = @_;
875   
876    my $section_ptr = $self->_lookup_section($section);
877    if (!defined $section_ptr) {
878            print STDERR "doc::get_metadata couldn't find section ",
879            $section, "\n";
880            return;
881        }
882
883    my $metadata_hashmap = {};
884    foreach my $data (@{$section_ptr->{'metadata'}}) {
885            my $metaname = $data->[0];
886         
887            if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
888                if (!defined $metadata_hashmap->{$metaname}) {
889                    $metadata_hashmap->{$metaname} = [];
890                  }
891                my $metaval_list = $metadata_hashmap->{$metaname};
892                push(@$metaval_list, $data->[1]);
893              }
894          }
895   
896    return $metadata_hashmap;
897}
898
899# returns a list of the form [[field,value],[field,value],...]
900sub get_all_metadata {
901    my $self = shift (@_);
902    my ($section) = @_;
903
904    my $section_ptr = $self->_lookup_section($section);
905    if (!defined $section_ptr) {
906    print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
907    return;
908    }
909   
910    return $section_ptr->{'metadata'};
911}
912
913# $value is optional
914sub delete_metadata {
915    my $self = shift (@_);
916    my ($section, $field, $value) = @_;
917
918    my $section_ptr = $self->_lookup_section($section);
919    if (!defined $section_ptr) {
920    print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
921    return;
922    }
923
924    my $i = 0;
925    while ($i < scalar (@{$section_ptr->{'metadata'}})) {
926    if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
927        (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
928        splice (@{$section_ptr->{'metadata'}}, $i, 1);
929    } else {
930        $i++;
931    }
932    }
933}
934
935sub delete_all_metadata {
936    my $self = shift (@_);
937    my ($section) = @_;
938
939    my $section_ptr = $self->_lookup_section($section);
940    if (!defined $section_ptr) {
941    print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
942    return;
943    }
944   
945    $section_ptr->{'metadata'} = [];
946}
947
948sub set_metadata_element {
949    my $self = shift (@_);
950    my ($section, $field, $value) = @_;
951
952    $self->set_utf8_metadata_element ($section, $field,
953                      &unicode::ascii2utf8(\$value));
954}
955
956# set_utf8_metadata_element assumes the text has already been
957# converted to the UTF-8 encoding.
958sub set_utf8_metadata_element {
959    my $self = shift (@_);
960    my ($section, $field, $value) = @_;
961   
962    $self->delete_metadata ($section, $field);
963    $self->add_utf8_metadata ($section, $field, $value);
964}
965
966
967# add_metadata assumes the text is in (extended) ascii form. For
968# text which has already been converted to the UTF-8 format use
969# add_utf8_metadata.
970sub add_metadata {
971    my $self = shift (@_);
972    my ($section, $field, $value) = @_;
973   
974    $self->add_utf8_metadata ($section, $field,
975                  &unicode::ascii2utf8(\$value));
976}
977
978sub add_utf8_metadata {
979    my $self = shift (@_);
980    my ($section, $field, $value) = @_;
981   
982    #    my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
983    #    my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
984    #    print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
985
986    my $section_ptr = $self->_lookup_section($section);
987    if (!defined $section_ptr) {
988    print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
989    return;
990    }
991    if (!defined $value) {
992    print STDERR "doc::add_utf8_metadata undefined value for $field\n";
993    return;
994    }
995    if (!defined $field) {
996    print STDERR "doc::add_utf8_metadata undefined metadata type \n";
997    return;
998    }
999   
1000    #print STDERR "###$field=$value\n";
1001
1002    # For now, supress this check.  Given that text data read in is now
1003    # Unicode aware, then the following block of code can (ironically enough)
1004    # cause our unicode compliant string to be re-encoded (leading to
1005    # a double-encoded UTF-8 string, which we definitely don't want!).
1006   
1007
1008    # double check that the value is utf-8
1009    #    if (!&unicode::check_is_utf8($value)) {
1010    #   print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1011    #   &unicode::ensure_utf8(\$value);
1012    #   print STDERR " Tried converting to utf8: $value\n";
1013    #    }
1014
1015    #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1016    if($field eq "Latitude" || $field eq "Longitude")
1017    {
1018        my $direction;
1019        if($value =~ m/^-/)
1020        {
1021            $direction = ($field eq "Latitude") ? "S" : "W";
1022        }
1023        else
1024        {
1025            $direction = ($field eq "Latitude") ? "N" : "E";
1026        }
1027       
1028        my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1029        if(defined $beforeDec && defined $afterDec)
1030        {
1031            my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1032            push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1033           
1034            for(my $i = 2; $i <= 4; $i++)
1035            {
1036                if(length($afterDec) >= $i)
1037                {
1038                    push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1039                }
1040            }
1041           
1042            #Only add the metadata if it has not already been added
1043            my $metaMap = $self->get_metadata_hashmap($section);
1044        }
1045    }
1046
1047    push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1048}
1049
1050
1051# methods for dealing with text
1052
1053# returns the text for a section
1054sub get_text {
1055    my $self = shift (@_);
1056    my ($section) = @_;
1057
1058    my $section_ptr = $self->_lookup_section($section);
1059    if (!defined $section_ptr) {
1060    print STDERR "doc::get_text couldn't find section " .
1061        "$section\n";
1062    return "";
1063    }
1064
1065    return $section_ptr->{'text'};
1066}
1067
1068# returns the (utf-8 encoded) length of the text for a section
1069sub get_text_length {
1070    my $self = shift (@_);
1071    my ($section) = @_;
1072
1073    my $section_ptr = $self->_lookup_section($section);
1074    if (!defined $section_ptr) {
1075    print STDERR "doc::get_text_length couldn't find section " .
1076        "$section\n";
1077    return 0;
1078    }
1079
1080    return length ($section_ptr->{'text'});
1081}
1082
1083# returns the total length for all the sections
1084sub get_total_text_length {
1085    my $self = shift (@_);
1086
1087    my $section = $self->get_top_section();
1088    my $length = 0;
1089    while (defined $section) {
1090    $length += $self->get_text_length($section);
1091    $section = $self->get_next_section($section);
1092    }
1093    return $length;
1094}
1095
1096sub delete_text {
1097    my $self = shift (@_);
1098    my ($section) = @_;
1099
1100    my $section_ptr = $self->_lookup_section($section);
1101    if (!defined $section_ptr) {
1102    print STDERR "doc::delete_text couldn't find section " .
1103        "$section\n";
1104    return;
1105    }
1106
1107    $section_ptr->{'text'} = "";
1108}
1109
1110# add_text assumes the text is in (extended) ascii form. For
1111# text which has been already converted to the UTF-8 format
1112# use add_utf8_text.
1113sub add_text {
1114    my $self = shift (@_);
1115    my ($section, $text) = @_;
1116
1117    # convert the text to UTF-8 encoded unicode characters
1118    # and add the text
1119    $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1120}
1121
1122
1123# add_utf8_text assumes the text to be added has already
1124# been converted to the UTF-8 encoding. For ascii text use
1125# add_text
1126sub add_utf8_text {
1127    my $self = shift (@_);
1128    my ($section, $text) = @_;
1129
1130    my $section_ptr = $self->_lookup_section($section);
1131    if (!defined $section_ptr) {
1132    print STDERR "doc::add_utf8_text couldn't find section " .
1133        "$section\n";
1134    return;
1135    }
1136
1137    $section_ptr->{'text'} .= $text;
1138}
1139
1140# returns the Source meta, which is the utf8 filename generated.
1141# Added a separate method here for convenience
1142sub get_source {
1143    my $self = shift (@_);
1144    return $self->get_metadata_element ($self->get_top_section(), "Source");
1145}
1146
1147# returns the SourceFile meta, which is the url reference to the URL-encoded
1148# version of Source (the utf8 filename). Added a separate method here for convenience
1149sub get_sourcefile {
1150    my $self = shift (@_);
1151    return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1152}
1153
1154# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1155# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1156sub get_assocfile_from_sourcefile {
1157    my $self = shift (@_);
1158   
1159    # get the SourceFile meta, which is a *URL* to a file on the filesystem
1160    my $top_section = $self->get_top_section();
1161    my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1162
1163    # get the actual filename as it exists on the filesystem which this url refers to
1164    $source_file = &unicode::url_to_filename($source_file);
1165    my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1166    return $assocfilename;
1167}
1168
1169# methods for dealing with associated files
1170
1171# a file is associated with a document, NOT a section.
1172# if section is defined it is noted in the data structure
1173# only so that files associated from a particular section
1174# may be removed later (using delete_section_assoc_files)
1175sub associate_file {
1176    my $self = shift (@_);
1177    my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1178    $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1179
1180    # remove all associated files with the same name
1181    $self->delete_assoc_file ($assoc_filename);
1182
1183    # Too harsh a requirement
1184    # Definitely get HTML docs, for example, with some missing
1185    # support files
1186#    if (!&util::fd_exists($real_filename)) {
1187#   print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1188#   exit -1;
1189#    }
1190
1191#    print STDERR "**** is the following a UTF8 rep of *real* filename?\n   $real_filename\n";
1192#    print STDERR "****##### so, ensure it is before storing?!?!?\n";
1193##    my $utf8_filename = Encode::encode("utf8",$filename);
1194
1195    push (@{$self->{'associated_files'}},
1196      [$real_filename, $assoc_filename, $mime_type, $section]);
1197}
1198
1199# returns a list of associated files in the form
1200#   [[real_filename, assoc_filename, mimetype], ...]
1201sub get_assoc_files {
1202    my $self = shift (@_);
1203
1204    return $self->{'associated_files'};
1205}
1206
1207# the following two methods used to keep track of original associated files
1208# for incremental building. eg a txt file used by an item file does not end
1209# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1210# file for incremental build
1211sub associate_source_file {
1212    my $self = shift (@_);
1213    my ($full_filename) = @_;
1214
1215    push (@{$self->{'source_assoc_files'}}, $full_filename);
1216
1217}
1218
1219sub get_source_assoc_files {
1220    my $self = shift (@_);
1221
1222    return $self->{'source_assoc_files'};
1223 
1224
1225}
1226sub metadata_file {
1227    my $self = shift (@_);
1228    my ($real_filename, $filename) = @_;
1229   
1230    push (@{$self->{'metadata_files'}},
1231      [$real_filename, $filename]);
1232}
1233
1234# used for writing out the archiveinf-doc info database, to list all the metadata files
1235sub get_meta_files {
1236    my $self = shift (@_);
1237
1238    return $self->{'metadata_files'};
1239}
1240
1241sub delete_section_assoc_files {
1242    my $self = shift (@_);
1243    my ($section) = @_;
1244
1245    my $i=0;
1246    while ($i < scalar (@{$self->{'associated_files'}})) {
1247    if (defined $self->{'associated_files'}->[$i]->[3] &&
1248        $self->{'associated_files'}->[$i]->[3] eq $section) {
1249        splice (@{$self->{'associated_files'}}, $i, 1);
1250    } else {
1251        $i++;
1252    }
1253    }
1254}
1255
1256sub delete_assoc_file {
1257    my $self = shift (@_);
1258    my ($assoc_filename) = @_;
1259
1260    my $i=0;
1261    while ($i < scalar (@{$self->{'associated_files'}})) {
1262    if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1263        splice (@{$self->{'associated_files'}}, $i, 1);
1264    } else {
1265        $i++;
1266    }
1267    }
1268}
1269
1270sub reset_nextsection_ptr {
1271    my $self = shift (@_);
1272    my ($section) = @_;
1273   
1274    my $section_ptr = $self->_lookup_section($section);
1275    $section_ptr->{'next_subsection'} = 1;
1276}
1277
12781;
Note: See TracBrowser for help on using the browser.