root/main/trunk/greenstone2/perllib/doc.pm @ 33126

Revision 33126, 46.4 KB (checked in by wy59, 6 months ago)

Incremental changes following previous commit. This time the code is more backwards compatible with older collections that contain only Latitude and Longitude meta. In such cases, we now ADDITIONALLY add Coordinate (and Coordshort) meta, while still also outputting Lat and Lng meta. The JS and XSL code however now prefers to work with Coordinate meta where present.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr   te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use FileUtils;
40use ghtml;
41use File::stat;
42##use hashdoc;
43use docprint;
44use JSON;
45
46# We just need pi from the Trig lib
47# Import constants pi2, pip2, pip4 (2*pi, pi/2, pi/4).
48use Math::Trig ':pi';
49
50# the document type may be indexed_doc, nonindexed_doc, or
51# classification
52
53our $OIDcount = 0;
54
55# rename_method can be 'url', 'none', 'base64'
56sub new {
57    my $class = shift (@_);
58    my ($source_filename, $doc_type, $rename_method) = @_;
59
60
61    my $self = bless {'associated_files'=>[],
62              'subsection_order'=>[],
63              'next_subsection'=>1,
64              'subsections'=>{},
65              'metadata'=>[],
66              'text'=>"",
67              'OIDtype'=>"hash"}, $class;
68
69    # used to set lastmodified here, but this can screw up the HASH ids, so
70    # the docsave processor now calls set_lastmodified
71
72    $self->set_source_path($source_filename);
73   
74    if (defined $source_filename) {
75    $source_filename = &util::filename_within_collection($source_filename);
76    print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
77    $self->set_source_filename ($source_filename, $rename_method);
78    }
79
80    $self->set_doc_type ($doc_type) if defined $doc_type;
81
82    return $self;
83}
84
85
86sub set_source_path
87{
88    my $self = shift @_;
89    my ($source_filename) = @_;
90
91    if (defined $source_filename) {
92    # On Windows the source_filename can be in terse DOS format
93    # e.g. test~1.txt
94
95    $self->{'terse_source_path'} = $source_filename;
96
97        # Use the FileUtil library methods as they are aware of more special
98        # cases such as HDFS [jmt12]
99    if (&FileUtils::fileExists($source_filename))
100        {
101        # See if we can do better for Windows with a filename
102        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
103        require Win32;
104        $self->{'source_path'} = Win32::GetLongPathName($source_filename);
105        }
106        else {
107        # For Unix-based systems, there is no difference between the two
108        $self->{'source_path'} = $source_filename;
109        }
110    }
111    else {
112        print STDERR "Warning: In doc::set_source_path(), file\n";
113        print STDERR "           $source_filename\n";
114        print STDERR "         does not exist\n";
115       
116        # (default) Set it to whatever we were given
117        $self->{'source_path'} = $source_filename;
118    }   
119    }
120    else {
121    # Previous code for setting source_path allowed for
122    # it to be undefined, so continue this practice
123    $self->{'terse_source_path'} = undef;
124    $self->{'source_path'} = undef;
125    }
126}
127
128
129sub get_source_path
130{
131    my $self = shift @_;
132
133    return $self->{'terse_source_path'};
134}
135
136# set lastmodified for OAI purposes, added by GRB, moved by kjdon
137sub set_oailastmodified {
138    my $self = shift (@_);
139
140    my $source_path = $self->{'terse_source_path'};
141   
142    if (defined $source_path && (-e $source_path)) {
143    my $current_time = time;
144
145    my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
146        $wday, $yday, $isdst) = localtime($current_time);
147
148    my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
149
150    $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
151    $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
152    }
153}
154
155# no longer used for OAI purposes, since lastmodified is not what we want as the
156# Datestamp of a document. This doc metadata may be useful for general purposes.
157sub set_lastmodified {
158    my $self = shift (@_);
159
160    my $source_path = $self->{'terse_source_path'};
161   
162    if (defined $source_path && (-e $source_path)) {
163
164        my $file_stat = stat($source_path);
165    my $mtime = $file_stat->mtime;
166    my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
167        $wday, $yday, $isdst) = localtime($mtime);
168
169    my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
170
171    $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
172    $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
173    }
174}
175
176# clone the $self object
177sub duplicate {
178    my $self = shift (@_);
179
180    my $newobj = {};
181   
182    foreach my $k (keys %$self) {
183    $newobj->{$k} = &clone ($self->{$k});
184    }
185
186    bless $newobj, ref($self);
187    return $newobj;
188}
189
190sub clone {
191    my ($from) = @_;
192    my $type = ref ($from);
193
194    if ($type eq "HASH") {
195    my $to = {};
196    foreach my $key (keys %$from) {
197        $to->{$key} = &clone ($from->{$key});
198    }
199    return $to;
200    } elsif ($type eq "ARRAY") {
201    my $to = [];
202    foreach my $v (@$from) {
203        push (@$to, &clone ($v));
204    }
205    return $to;
206    } else {
207    return $from;
208    }
209}
210
211sub set_OIDtype {
212    my $self = shift (@_);
213    my ($type, $metadata) = @_;
214
215    if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
216    $self->{'OIDtype'} = $type;
217    } else {
218    $self->{'OIDtype'} = "hash";
219    }
220
221    if ($type =~ /^assigned$/) {
222    if (defined $metadata) {
223        $self->{'OIDmetadata'} = $metadata;
224    } else {
225        $self->{'OIDmetadata'} = "dc.Identifier";
226    }
227    }
228}
229
230# rename_method can be 'url', 'none', 'base64'
231sub set_source_filename {
232    my $self = shift (@_);
233    my ($source_filename, $rename_method) = @_;
234
235    # Since the gsdlsourcefilename element goes into the doc.xml it has
236    # to be utf8. However, it should also *represent* the source filename
237    # (in the import directory) which may not be utf8 at all.
238    # For instance, if this meta element (gsdlsourcefilename) will be used
239    # by other applications that parse doc.xml in order to locate
240    # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
241    # encode the real filename as this is a binary-to-text encoding meaning
242    # that the resulting string is ASCII (utf8). Decoding will give the original.
243   
244#    print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
245
246    # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
247    $source_filename = $self->encode_filename($source_filename, $rename_method);
248#    my ($srcfilename,$dirname,$suffix)
249#   = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
250#    print STDERR "-> $srcfilename -> ";
251#    $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
252#    $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
253#    print STDERR "$source_filename\n";
254   
255    $self->set_utf8_metadata_element ($self->get_top_section(),
256                 "gsdlsourcefilename",
257                 $source_filename);
258}
259
260sub encode_filename {
261    my $self = shift (@_);
262    my ($source_filename, $rename_method) = @_;
263
264     my ($srcfilename,$dirname,$suffix)
265    = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
266#    print STDERR "-> $srcfilename -> ";
267    $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
268    $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
269
270    return $source_filename;
271}
272
273sub set_converted_filename {
274    my $self = shift (@_);
275    my ($converted_filename) = @_;
276
277    # we know the converted filename is utf8
278    $self->set_utf8_metadata_element ($self->get_top_section(),
279                 "gsdlconvertedfilename",
280                 $converted_filename);
281}
282
283# returns the source_filename as it was provided
284sub get_unmodified_source_filename {
285    my $self = shift (@_);
286
287    return $self->{'terse_source_path'};
288}
289
290# returns the source_filename with whatever rename_method was given
291sub get_source_filename {
292    my $self = shift (@_);
293
294    return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
295}
296
297
298
299# returns converted filename if available else returns source filename
300sub get_filename_for_hashing {
301    my $self = shift (@_);
302
303    my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
304
305    if (!defined $filename) {
306    my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
307    # if NULPlug processed file, then don't give a filename
308    if (defined $plugin_name && $plugin_name eq "NULPlug") {
309        $filename = undef;
310    } else { # returns the URL encoded source filename!
311        $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
312    }
313    }
314
315    if (!&FileUtils::isFilenameAbsolute($filename)) {
316    $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
317    }
318
319    return $filename;
320}
321
322sub set_doc_type {
323    my $self = shift (@_);
324    my ($doc_type) = @_;
325
326    $self->set_metadata_element ($self->get_top_section(),
327                 "gsdldoctype",
328                 $doc_type);
329}
330
331# returns the gsdldoctype as it was provided
332# the default of "indexed_doc" is used if no document
333# type was provided
334sub get_doc_type {
335    my $self = shift (@_);
336
337    my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
338    return $doc_type if (defined $doc_type);
339    return "indexed_doc";
340}
341
342
343# look up the reference to the a particular section
344sub _lookup_section {
345    my $self = shift (@_);
346    my ($section) = @_;
347
348    my ($num);
349    my $sectionref = $self;
350
351    while (defined $section && $section ne "") {
352   
353    ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
354   
355    $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
356   
357    $section = "" unless defined $section;
358   
359
360    if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
361        $sectionref = $sectionref->{'subsections'}->{$num};
362    } else {
363        return undef;
364    }
365    }
366   
367    return $sectionref;
368}
369
370# calculate OID by hashing the contents of the document
371sub _calc_OID {
372    my $self = shift (@_);
373    my ($filename) = @_;
374
375
376    my $osexe = &util::get_os_exe();
377
378    my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
379                       $ENV{'GSDLOS'},"hashfile$osexe");
380
381    &util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
382    # The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
383
384    # A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
385    # (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
386    # E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
387
388    my $result = "NULL";
389
390   
391    if (-e "$hashfile_exe") {
392#   $result = `\"$hashfile_exe\" \"$filename\"`;
393#   $result = `hashfile$osexe \"$filename\" 2>&1`;
394    $result = `hashfile$osexe \"$filename\"`;
395
396    ($result) = $result =~ /:\s*([0-9a-f]+)/i;
397    } else {
398    print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
399    }
400    return "HASH$result";
401}
402
403# methods dealing with OID, not groups of them.
404
405# if $OID is not provided one is calculated
406sub set_OID {
407    my $self = shift (@_);
408    my ($OID) = @_;
409   
410    my $use_hash_oid = 0;
411    # if an OID wasn't provided calculate one
412    if (!defined $OID) {
413    $OID = "NULL";
414    if ($self->{'OIDtype'} =~ /^hash/) {
415        $use_hash_oid = 1;
416    } elsif ($self->{'OIDtype'} eq "incremental") {
417        $OID = "D" . $OIDcount;
418        $OIDcount ++;
419    } elsif ($self->{'OIDtype'} eq "filename") {
420        my $filename = $self->get_source_filename();
421        $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
422        $OID = &util::tidy_up_oid($OID);
423    } elsif ($self->{'OIDtype'} eq "full_filename") {
424        my $source_filename = $self->get_source_filename();
425        my $dirsep = &util::get_os_dirsep();
426
427        $source_filename =~ s/^import$dirsep//;
428        $source_filename =~ s/$dirsep/-/g;
429        $source_filename =~ s/\./_/g;
430
431        $OID = $source_filename;
432        $OID = &util::tidy_up_oid($OID);
433    } elsif ($self->{'OIDtype'} eq "dirname") {
434        my $filename = $self->get_source_filename();
435        if (defined($filename)) { # && -e $filename) {
436        # get the immediate parent directory
437        $OID = &File::Basename::dirname($filename);
438        if (defined $OID) {
439            $OID = &File::Basename::basename($OID);
440            $OID = &util::tidy_up_oid($OID);
441        } else {
442            print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
443            $use_hash_oid = 1;
444        }
445        } else {
446        print STDERR "Failed to find a filename, generating hash id\n";
447        $use_hash_oid = 1;
448        }   
449       
450    } elsif ($self->{'OIDtype'} eq "assigned") {
451        my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
452        if (defined $identifier && $identifier ne "") {
453        $OID = $identifier;
454        $OID = &util::tidy_up_oid($OID);
455        } else {
456        # need a hash id
457        print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
458        $use_hash_oid = 1;
459        }
460               
461    } else {
462        $use_hash_oid = 1;
463    }
464
465    if ($use_hash_oid) {
466        my $hash_on_file = 1;
467        my $hash_on_ga_xml = 0;
468
469        if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
470        $hash_on_file = 0;
471        $hash_on_ga_xml = 1;
472        }
473
474        if ($self->{'OIDtype'} eq "hash_on_full_filename") {
475        $hash_on_file = 0;
476        $hash_on_ga_xml = 0;
477
478        my $source_filename = $self->get_source_filename();
479        my $dirsep = &util::get_os_dirsep();
480
481        $source_filename =~ s/^import$dirsep//;
482        $source_filename =~ s/$dirsep/-/g;
483        $source_filename =~ s/\./_/g;
484       
485        # If the filename is very short then (handled naively)
486        # this can cause conjestion in the hash-values
487        # computed, leading documents sharing the same leading
488        # Hex values in the computed has.
489        #
490        # The solution taken here is to replace the name of
491        # the file name a sufficient number of times (up to
492        # the character limit defined in 'rep_limit' and
493        # make that the content that is hashed on
494
495        # *** Think twice before changing the following value
496        # as it will break backward compatability of computed
497        # document HASH values
498
499        my $rep_limit  = 256;
500        my $hash_content = undef;
501
502        if (length($source_filename)<$rep_limit) {
503            my $rep_string = "$source_filename|";
504            my $rs_len = length($rep_string);
505
506            my $clone_times = int(($rep_limit-1)/$rs_len) +1;
507           
508            $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
509        }
510        else {
511            $hash_content = $source_filename;
512        }
513
514        my $filename = &util::get_tmp_filename();
515        if (!open (OUTFILE, ">:utf8", $filename)) {
516            print STDERR "doc::set_OID could not write to $filename\n";
517        } else {
518            print OUTFILE $hash_content;
519            close (OUTFILE);
520        }
521        $OID = $self->_calc_OID ($filename);
522
523        &FileUtils::removeFiles ($filename);
524        }
525
526        if ($hash_on_file) {
527        # "hash" OID - feed file to hashfile.exe
528        my $filename = $self->get_filename_for_hashing();
529       
530        # -z: don't want to hash on the file if it is zero size
531        if (defined($filename) && -e $filename && !-z $filename) {
532            $OID = $self->_calc_OID ($filename);
533        } else {
534            $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
535        }
536        }
537
538        if ($hash_on_ga_xml) {
539        # In addition being asked to explicity calculate the has based on the GA file,
540        # can also end up coming into this block is doing 'hash_on_file' but the file
541        # itself is of zero bytes (as could be the case with 'doc.nul' file
542
543        my $filename = &util::get_tmp_filename();
544        if (!open (OUTFILE, ">:utf8", $filename)) {
545            print STDERR "doc::set_OID could not write to $filename\n";
546        } else {
547            my $doc_text = &docprint::get_section_xml($self);
548            print OUTFILE $doc_text;
549            close (OUTFILE);
550        }
551        $OID = $self->_calc_OID ($filename);
552        &FileUtils::removeFiles($filename);
553        }
554    }
555    }
556    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
557}
558
559# this uses hashdoc (embedded c thingy) which is faster but still
560# needs a little work to be sufficiently stable
561sub ___set_OID {
562    my $self = shift (@_);
563    my ($OID) = @_;
564
565    # if an OID wasn't provided then calculate hash value based on document
566    if (!defined $OID)
567    {
568    my $hash_text = &docprint::get_section_xml($self);
569    my $hash_len = length($hash_text);
570
571        $OID = &hashdoc::buffer($hash_text,$hash_len);
572    }
573
574    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
575}
576
577# returns the OID for this document
578sub get_OID {
579    my $self = shift (@_);
580    my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
581    return $OID if (defined $OID);
582    return "NULL";
583}
584
585sub delete_OID {
586    my $self = shift (@_);
587   
588    $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
589}
590
591
592# methods for manipulating section names
593
594# returns the name of the top-most section (the top
595# level of the document
596sub get_top_section {
597    my $self = shift (@_);
598   
599    return "";
600}
601
602# returns a section
603sub get_parent_section {
604    my $self = shift (@_);
605    my ($section) = @_;
606
607    $section =~ s/(^|\.)\d+$//;
608
609    return $section;
610}
611
612# returns the first child section (or the end child
613# if there isn't any)
614sub get_begin_child {
615    my $self = shift (@_);
616    my ($section) = @_;
617
618    my $section_ptr = $self->_lookup_section($section);
619    return "" unless defined $section_ptr;
620
621    if (defined $section_ptr->{'subsection_order'}->[0]) {
622    return "$section.$section_ptr->{'subsection_order'}->[0]";
623    }
624
625    return $self->get_end_child ($section);
626}
627
628# returns the next child of a parent section
629sub get_next_child {
630    my $self = shift (@_);
631    my ($section) = @_;
632   
633    my $parent_section = $self->get_parent_section($section);
634    my $parent_section_ptr = $self->_lookup_section($parent_section);
635    return undef unless defined $parent_section_ptr;
636
637    my ($section_num) = $section =~ /(\d+)$/;
638    return undef unless defined $section_num;
639
640    my $i = 0;
641    my $section_order = $parent_section_ptr->{'subsection_order'};
642    while ($i < scalar(@$section_order)) {
643    last if $section_order->[$i] eq $section_num;
644    $i++;
645    }
646
647    $i++; # the next child
648    if ($i < scalar(@$section_order)) {
649    return $section_order->[$i] if $parent_section eq "";
650    return "$parent_section.$section_order->[$i]";
651    }
652
653    # no more sections in this level
654    return undef;
655}
656
657# returns a reference to a list of children
658sub get_children {
659    my $self = shift (@_);
660    my ($section) = @_;
661
662    my $section_ptr = $self->_lookup_section($section);
663    return [] unless defined $section_ptr;
664
665    my @children = @{$section_ptr->{'subsection_order'}};
666
667    map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
668    return \@children;
669}
670
671# returns the child section one past the last one (which
672# is coded as "0")
673sub get_end_child {
674    my $self = shift (@_);
675    my ($section) = @_;
676
677    return $section . ".0" unless $section eq "";
678    return "0";
679}
680
681# returns the next section in book order
682sub get_next_section {
683    my $self = shift (@_);
684    my ($section) = @_;
685
686    return undef unless defined $section;
687
688    my $section_ptr = $self->_lookup_section($section);
689    return undef unless defined $section_ptr;
690
691    # first try to find first child
692    if (defined $section_ptr->{'subsection_order'}->[0]) {
693    return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
694    return "$section.$section_ptr->{'subsection_order'}->[0]";
695    }
696
697    do {
698    # try to find sibling
699    my $next_child = $self->get_next_child ($section);
700    return $next_child if (defined $next_child);
701
702    # move up one level
703    $section = $self->get_parent_section ($section);
704    } while $section =~ /\d/;
705
706    return undef;
707}
708
709sub is_leaf_section {
710    my $self = shift (@_);
711    my ($section) = @_;
712
713    my $section_ptr = $self->_lookup_section($section);
714    return 1 unless defined $section_ptr;
715
716    return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
717}
718
719# methods for dealing with sections
720
721# returns the name of the inserted section
722sub insert_section {
723    my $self = shift (@_);
724    my ($before_section) = @_;
725
726    # get the child to insert before and its parent section
727    my $parent_section = "";
728    my $before_child = "0";
729    my @before_section = split (/\./, $before_section);
730    if (scalar(@before_section) > 0) {
731    $before_child = pop (@before_section);
732    $parent_section = join (".", @before_section);
733    }
734
735    my $parent_section_ptr = $self->_lookup_section($parent_section);
736    if (!defined $parent_section_ptr) {
737    print STDERR "doc::insert_section couldn't find parent section " .
738        "$parent_section\n";
739    return;
740    }
741
742    # get the next section number
743    my $section_num = $parent_section_ptr->{'next_subsection'}++;
744
745    my $i = 0;
746    while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
747       $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
748    $i++;
749    }
750   
751    # insert the section number into the order list
752    splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
753
754    # add this section to the parent section
755    my $section_ptr = {'subsection_order'=>[],
756               'next_subsection'=>1,
757               'subsections'=>{},
758               'metadata'=>[],
759               'text'=>""};
760    $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
761
762    # work out the full section number
763    my $section = $parent_section;
764    $section .= "." unless $section eq "";
765    $section .= $section_num;
766   
767    return $section;
768}
769
770# creates a pre-named section
771sub create_named_section {
772    my $self = shift (@_);
773    my ($mastersection) = @_;
774
775    my ($num);
776    my $section = $mastersection;
777    my $sectionref = $self;
778
779    while ($section ne "") {
780    ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
781    $num =~ s/^0+(\d)/$1/; # remove leading 0s
782    $section = "" unless defined $section;
783   
784    if (defined $num) {
785        if (!defined $sectionref->{'subsections'}->{$num}) {
786        push (@{$sectionref->{'subsection_order'}}, $num);
787        $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
788                            'next_subsection'=>1,
789                            'subsections'=>{},
790                            'metadata'=>[],
791                            'text'=>""};
792        if ($num >= $sectionref->{'next_subsection'}) {
793            $sectionref->{'next_subsection'} = $num + 1;
794        }
795        }
796        $sectionref = $sectionref->{'subsections'}->{$num};
797
798    } else {
799        print STDERR "doc::create_named_section couldn't create section ";
800        print STDERR "$mastersection\n";
801        last;
802    }
803    }
804}
805
806# returns a reference to a list of subsections
807sub list_subsections {
808    my $self = shift (@_);
809    my ($section) = @_;
810
811    my $section_ptr = $self->_lookup_section ($section);
812    if (!defined $section_ptr) {
813    print STDERR "doc::list_subsections couldn't find section $section\n";
814    return [];
815    }
816
817    return [@{$section_ptr->{'subsection_order'}}];
818}
819
820sub delete_section {
821    my $self = shift (@_);
822    my ($section) = @_;
823
824#    my $section_ptr = {'subsection_order'=>[],
825#              'next_subsection'=>1,
826#              'subsections'=>{},
827#              'metadata'=>[],
828#              'text'=>""};
829
830    # if this is the top section reset everything
831    if ($section eq "") {
832    $self->{'subsection_order'} = [];
833    $self->{'subsections'} = {};
834    $self->{'metadata'} = [];
835    $self->{'text'} = "";
836    return;
837    }
838
839    # find the parent of the section to delete
840    my $parent_section = "";
841    my $child = "0";
842    my @section = split (/\./, $section);
843    if (scalar(@section) > 0) {
844    $child = pop (@section);
845    $parent_section = join (".", @section);
846    }
847
848    my $parent_section_ptr = $self->_lookup_section($parent_section);
849    if (!defined $parent_section_ptr) {
850    print STDERR "doc::delete_section couldn't find parent section " .
851        "$parent_section\n";
852    return;
853    }
854
855    # remove this section from the subsection_order list
856    my $i = 0;
857    while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
858    if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
859        splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
860        last;
861    }
862    $i++;
863    }
864
865    # remove this section from the subsection hash
866    if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
867    undef $parent_section_ptr->{'subsections'}->{$child};
868    }
869}
870
871#--
872# methods for dealing with metadata
873
874# set_metadata_element and get_metadata_element are for metadata
875# which should only have one value. add_meta_data and get_metadata
876# are for metadata which can have more than one value.
877
878# returns the first metadata value which matches field
879
880# This version of get metadata element works much like the one above,
881# except it allows for the namespace portion of a metadata element to
882# be ignored, thus if you are searching for dc.Title, the first piece
883# of matching metadata ending with the name Title (once any namespace
884# is removed) would be returned.
885# 28-11-2003 John Thompson
886sub get_metadata_element {
887    my $self = shift (@_);
888    my ($section, $field, $ignore_namespace) = @_;
889    my ($data);
890
891    $ignore_namespace = 0 unless defined $ignore_namespace;
892
893    my $section_ptr = $self->_lookup_section($section);
894    if (!defined $section_ptr) {
895    print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
896    return;
897    }
898
899    # Remove any namespace if we are being told to ignore them
900    if($ignore_namespace) {
901    $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
902    }
903
904    foreach $data (@{$section_ptr->{'metadata'}}) {
905
906    my $data_name = $data->[0];
907
908    # Remove any namespace if we are being told to ignore them
909    if($ignore_namespace) {
910        $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
911    }
912    # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
913    $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
914   
915    return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
916    }
917   
918    return undef; # was not found
919}
920
921# returns a list of the form [value1, value2, ...]
922sub get_metadata {
923    my $self = shift (@_);
924    my ($section, $field, $ignore_namespace) = @_;
925    my ($data);
926
927    $ignore_namespace = 0 unless defined $ignore_namespace;
928
929    my $section_ptr = $self->_lookup_section($section);
930    if (!defined $section_ptr) {
931        print STDERR "doc::get_metadata couldn't find section ",
932        $section, "\n";
933        return;
934    }
935
936    # Remove any namespace if we are being told to ignore them
937    if($ignore_namespace) {
938    $field =~ s/^.*\.//;
939    }
940
941    my @metadata = ();
942    foreach $data (@{$section_ptr->{'metadata'}}) {
943
944    my $data_name = $data->[0];
945
946    # Remove any namespace if we are being told to ignore them
947    if($ignore_namespace) {
948        $data_name =~ s/^.*\.//;
949    }   
950    # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
951    $data_name =~ s/^ex\.([^.]+)$/$1/;
952
953        push (@metadata, $data->[1]) if ($data_name eq $field);
954    }
955
956    return \@metadata;
957}
958
959sub get_metadata_hashmap {
960    my $self = shift (@_);
961    my ($section, $opt_namespace) = @_;
962   
963    my $section_ptr = $self->_lookup_section($section);
964    if (!defined $section_ptr) {
965            print STDERR "doc::get_metadata couldn't find section ",
966            $section, "\n";
967            return;
968        }
969
970    my $metadata_hashmap = {};
971    foreach my $data (@{$section_ptr->{'metadata'}}) {
972            my $metaname = $data->[0];
973         
974            if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
975                if (!defined $metadata_hashmap->{$metaname}) {
976                    $metadata_hashmap->{$metaname} = [];
977                  }
978                my $metaval_list = $metadata_hashmap->{$metaname};
979                push(@$metaval_list, $data->[1]);
980              }
981          }
982   
983    return $metadata_hashmap;
984}
985
986# returns a list of the form [[field,value],[field,value],...]
987sub get_all_metadata {
988    my $self = shift (@_);
989    my ($section) = @_;
990
991    my $section_ptr = $self->_lookup_section($section);
992    if (!defined $section_ptr) {
993    print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
994    return;
995    }
996   
997    return $section_ptr->{'metadata'};
998}
999
1000# $value is optional
1001sub delete_metadata {
1002    my $self = shift (@_);
1003    my ($section, $field, $value) = @_;
1004
1005    my $section_ptr = $self->_lookup_section($section);
1006    if (!defined $section_ptr) {
1007    print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
1008    return;
1009    }
1010
1011    my $i = 0;
1012    while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1013    if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1014        (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1015        splice (@{$section_ptr->{'metadata'}}, $i, 1);
1016    } else {
1017        $i++;
1018    }
1019    }
1020}
1021
1022sub delete_all_metadata {
1023    my $self = shift (@_);
1024    my ($section) = @_;
1025
1026    my $section_ptr = $self->_lookup_section($section);
1027    if (!defined $section_ptr) {
1028    print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1029    return;
1030    }
1031   
1032    $section_ptr->{'metadata'} = [];
1033}
1034
1035sub set_metadata_element {
1036    my $self = shift (@_);
1037    my ($section, $field, $value) = @_;
1038
1039    $self->set_utf8_metadata_element ($section, $field,
1040                      &unicode::ascii2utf8(\$value));
1041}
1042
1043# set_utf8_metadata_element assumes the text has already been
1044# converted to the UTF-8 encoding.
1045sub set_utf8_metadata_element {
1046    my $self = shift (@_);
1047    my ($section, $field, $value) = @_;
1048   
1049    $self->delete_metadata ($section, $field);
1050    $self->add_utf8_metadata ($section, $field, $value);
1051}
1052
1053
1054# add_metadata assumes the text is in (extended) ascii form. For
1055# text which has already been converted to the UTF-8 format use
1056# add_utf8_metadata.
1057sub add_metadata {
1058    my $self = shift (@_);
1059    my ($section, $field, $value) = @_;
1060   
1061    $self->add_utf8_metadata ($section, $field,
1062                  &unicode::ascii2utf8(\$value));
1063}
1064
1065sub add_utf8_metadata {
1066    my $self = shift (@_);
1067    my ($section, $field, $value) = @_;
1068   
1069    #    my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1070    #    my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1071    #    print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1072
1073    my $section_ptr = $self->_lookup_section($section);
1074    if (!defined $section_ptr) {
1075    print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1076    return;
1077    }
1078    if (!defined $value) {
1079    print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1080    return;
1081    }
1082    if (!defined $field) {
1083    print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1084    return;
1085    }
1086   
1087    #print STDERR "###$field=$value\n";
1088
1089    # For now, supress this check.  Given that text data read in is now
1090    # Unicode aware, then the following block of code can (ironically enough)
1091    # cause our unicode compliant string to be re-encoded (leading to
1092    # a double-encoded UTF-8 string, which we definitely don't want!).
1093   
1094
1095    # double check that the value is utf-8
1096    #    if (!&unicode::check_is_utf8($value)) {
1097    #   print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1098    #   &unicode::ensure_utf8(\$value);
1099    #   print STDERR " Tried converting to utf8: $value\n";
1100    #    }
1101
1102    #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1103    if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1104    {
1105            my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1106        if (defined $mdprefix) {
1107            # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1108            push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1109        }
1110
1111        my $direction;
1112        if($value =~ m/^-/)
1113        {
1114            $direction = ($field eq "Latitude") ? "S" : "W";
1115        }
1116        else
1117        {
1118            $direction = ($field eq "Latitude") ? "N" : "E";
1119        }
1120       
1121        my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1122        if(defined $beforeDec && defined $afterDec)
1123        {
1124            my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1125            push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1126           
1127            for(my $i = 2; $i <= 4; $i++)
1128            {
1129                if(length($afterDec) >= $i)
1130                {
1131                    push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1132                }
1133            }
1134           
1135            #Only add the metadata if it has not already been added
1136            my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no side-effects.)
1137        }
1138       
1139        if($field =~ m/^(.+\.)?Longitude$/) {
1140            # if we are dealing with Longitude meta, we should 1. have Latitude meta; 2. already have processed Latitude meta
1141            # in that case, add both Lat and Lng of this section as a Coordinate meta
1142            my $latitude = $self->get_metadata_element ($section, "Latitude");         
1143            # TODO: would like all Longitude info together followed by all Coordinate info, but the following will add all coord info meta and end of this function will add Longitude meta
1144            $self->processCoordinate($section_ptr, $latitude, $value); # value is Longitude
1145        }
1146    }
1147
1148    elsif($field eq "GPS.mapOverlay") { # then the value is a JSON string
1149        print STDERR "GPS.mapOverlay before val: " . $value . "\n";
1150       
1151        # TODO html decode?
1152        $value =~ s@&#091;@[@g;
1153        $value =~ s@&#093;@]@g;
1154        $value =~ s@&quot;@"@g;
1155        print STDERR "GPS.mapOverlay after val: " . $value . "\n";
1156
1157        my $json_array = decode_json $value;
1158        #my $json = JSON->new->allow_nonref;
1159        #&printAllShapes($json, $json_array);
1160
1161        foreach my $shape (@$json_array) {     
1162
1163            my $type = $shape->{"type"};
1164            print STDERR "Type : " . $type . "\n";
1165       
1166            if($type eq "circle") {
1167                #print STDERR "Found a circle:\n" . &printShape($json, $shape);
1168       
1169                # work out bounding box
1170                # SCARY!
1171                # want the inverse of this useful page:
1172                # https://stackoverflow.com/questions/639695/how-to-convert-latitude-or-longitude-to-meters
1173                # https://www.geodatasource.com/developers/javascript       
1174               
1175                           
1176                # for now, just process the circle centre
1177                #my $centre = $shape->{"center"};               
1178                #$self->processLatOrLng($section_ptr, "Latitude", $centre->{"lat"});
1179                #$self->processLatOrLng($section_ptr, "Longitude", $centre->{"lng"});
1180
1181               
1182                # Dr Bainbridge wants us to use: https://gis.stackexchange.com/questions/5821/calculating-latitude-longitude-x-miles-from-point
1183                # But we're using the rule of thumb here, since for N,E,S,W it works out the same:
1184                # https://gis.stackexchange.com/questions/2951/algorithm-for-offsetting-a-latitude-longitude-by-some-amount-of-meters
1185                # which states
1186                # "If your displacements aren't too great (less than a few kilometers) and you're not right at the poles,
1187                # use the quick and dirty estimate that 111,111 meters (111.111 km) in the y direction is 1 degree
1188                # (of latitude) and 111,111 * cos(latitude) meters in the x direction is 1 degree (of longitude)."
1189                my $centre_lat = $shape->{"center"}->{"lat"};
1190                my $centre_lng = $shape->{"center"}->{"lng"};
1191                my $radius = $shape->{"radius"}; # in metres!
1192
1193                print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
1194
1195                my $lat_north = $centre_lat + ($radius/111111);
1196                my $lat_south = $centre_lat - ($radius/111111);
1197               
1198                print STDERR "### lat_north:  $lat_north\n";
1199                print STDERR "### lat_south:  $lat_south\n";
1200
1201                # our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
1202                # all expect their angle to be in radians. So need to convert from degree to radians before we can take the cose of it.
1203                my $centre_lat_radians = $self->degreesToRadians($centre_lat);
1204                my $cos_in_radians = cos($centre_lat_radians);             
1205                print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
1206                my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
1207                my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
1208                print STDERR "### lng_east  $lng_east\n";
1209                print STDERR "### lng_west  $lng_west\n";
1210
1211
1212
1213                my $cos_lat = cos($centre_lat);             
1214                print STDERR "cos $centre_lat is $cos_lat\n";
1215
1216                $self->processCoordinate($section_ptr, $lat_north, $lng_east);
1217                $self->processCoordinate($section_ptr, $lat_south, $lng_east);
1218                $self->processCoordinate($section_ptr, $lat_south, $lng_west);
1219                $self->processCoordinate($section_ptr, $lat_north, $lng_west);
1220           
1221            }
1222            elsif ($type eq "marker") {
1223                print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
1224                print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
1225                $self->processCoordinate($section_ptr, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});               
1226            }
1227            elsif ($type eq "polyline" || $type eq "polygon") {
1228                my $path_array = $shape->{"path"};
1229                foreach my $position (@$path_array) {                                   
1230                    $self->processCoordinate($section_ptr, $position->{"lat"}, $position->{"lng"});
1231                }
1232            }
1233            elsif ($type eq "rectangle") {
1234       
1235                my $bounds = $shape->{"bounds"};               
1236           
1237                $self->processCoordinate($section_ptr, $bounds->{"north"}, $bounds->{"east"});
1238                $self->processCoordinate($section_ptr, $bounds->{"south"}, $bounds->{"east"});
1239                $self->processCoordinate($section_ptr, $bounds->{"south"}, $bounds->{"west"});
1240                $self->processCoordinate($section_ptr, $bounds->{"north"}, $bounds->{"west"});
1241            }   
1242       
1243        } # end for on each shape in GPS.mapOverlay
1244
1245    } # end GPS.mapOverlay meta
1246
1247    push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1248}
1249
1250# https://en.wikipedia.org/wiki/Radian
1251sub degreesToRadians
1252{
1253    my $self = shift (@_);
1254    my ($degrees) = @_;
1255
1256    return $degrees * pi /180; # returns radians
1257}
1258
1259sub radiansToDegrees
1260{
1261    my $self = shift (@_);
1262    my ($radians) = @_;
1263
1264    return $radians * 180 / pi; # returns degrees
1265}
1266
1267sub printAllShapes {
1268    my ($json, $json_array) = @_;
1269   
1270   
1271    #my $pretty_print_shape = $json->pretty->encode( $json_array->[0] );
1272    foreach my $shape (@$json_array) {
1273        my $pretty_print_shape = $json->pretty->encode( $shape );
1274        print STDERR "Shape: $pretty_print_shape\n";
1275        #&printShape($shape);
1276    }
1277   
1278}   
1279
1280sub processCoordinate {
1281    my $self = shift (@_);
1282    my ($section_ptr, $latitude, $longitude) = @_;
1283
1284    my $lat_direction = ($latitude =~ m/^-/) ? "S" : "N";
1285    my $lng_direction = ($longitude =~ m/^-/) ? "W" : "E";
1286   
1287    # have to store (lat, lng) in pairs, when there are so many coords to store
1288    #push (@{$section_ptr->{'metadata'}}, ["Latitude", $latitude]);
1289    #push (@{$section_ptr->{'metadata'}}, ["Longitude", $longitude]);
1290
1291    push (@{$section_ptr->{'metadata'}}, ["Coordinate", "$latitude $longitude"]); # "$latitude$lat_direction $longitude$lng_direction"
1292
1293    my ($latBeforeDec, $latAfterDec);
1294    my ($lngBeforeDec, $lngAfterDec);
1295
1296    if($latitude !~ m/\./) {
1297        $latBeforeDec = $latitude;
1298        $latAfterDec = "";
1299    } else {
1300        ($latBeforeDec, $latAfterDec) = ($latitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1301    }
1302    if($longitude !~ m/\./) {
1303        $lngBeforeDec = $longitude;
1304        $lngAfterDec = "";
1305    } else {
1306        ($lngBeforeDec, $lngAfterDec) = ($longitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1307    }   
1308   
1309    #if(defined $beforeDec && defined $afterDec)
1310    #{
1311        my $name = "CoordShort";
1312        push (@{$section_ptr->{'metadata'}}, [$name, "$latBeforeDec$lat_direction $lngBeforeDec$lng_direction"]);
1313           
1314        for(my $i = 2; $i <= 4; $i++)
1315        {
1316            my $latDecPlaces = (length($latAfterDec) >= $i) ? substr($latAfterDec, 0, $i) : "";         
1317            my $lngDecPlaces = (length($lngAfterDec) >= $i) ? substr($lngAfterDec, 0, $i) : "";
1318
1319            push (@{$section_ptr->{'metadata'}}, [$name,
1320                    $latBeforeDec . $lat_direction. $latDecPlaces . " " . $lngBeforeDec . $lng_direction. $lngDecPlaces]);
1321           
1322        }
1323           
1324        #Only add the metadata if it has not already been added
1325        #my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no side-effects.)
1326    #}
1327
1328
1329}
1330
1331
1332# methods for dealing with text
1333
1334# returns the text for a section
1335sub get_text {
1336    my $self = shift (@_);
1337    my ($section) = @_;
1338
1339    my $section_ptr = $self->_lookup_section($section);
1340    if (!defined $section_ptr) {
1341    print STDERR "doc::get_text couldn't find section " .
1342        "$section\n";
1343    return "";
1344    }
1345
1346    return $section_ptr->{'text'};
1347}
1348
1349# returns the (utf-8 encoded) length of the text for a section
1350sub get_text_length {
1351    my $self = shift (@_);
1352    my ($section) = @_;
1353
1354    my $section_ptr = $self->_lookup_section($section);
1355    if (!defined $section_ptr) {
1356    print STDERR "doc::get_text_length couldn't find section " .
1357        "$section\n";
1358    return 0;
1359    }
1360
1361    return length ($section_ptr->{'text'});
1362}
1363
1364# returns the total length for all the sections
1365sub get_total_text_length {
1366    my $self = shift (@_);
1367
1368    my $section = $self->get_top_section();
1369    my $length = 0;
1370    while (defined $section) {
1371    $length += $self->get_text_length($section);
1372    $section = $self->get_next_section($section);
1373    }
1374    return $length;
1375}
1376
1377sub delete_text {
1378    my $self = shift (@_);
1379    my ($section) = @_;
1380
1381    my $section_ptr = $self->_lookup_section($section);
1382    if (!defined $section_ptr) {
1383    print STDERR "doc::delete_text couldn't find section " .
1384        "$section\n";
1385    return;
1386    }
1387
1388    $section_ptr->{'text'} = "";
1389}
1390
1391# add_text assumes the text is in (extended) ascii form. For
1392# text which has been already converted to the UTF-8 format
1393# use add_utf8_text.
1394sub add_text {
1395    my $self = shift (@_);
1396    my ($section, $text) = @_;
1397
1398    # convert the text to UTF-8 encoded unicode characters
1399    # and add the text
1400    $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1401}
1402
1403
1404# add_utf8_text assumes the text to be added has already
1405# been converted to the UTF-8 encoding. For ascii text use
1406# add_text
1407# Pass by value version (internally calls pass by ref version
1408# to avoid code duplication)
1409sub add_utf8_text {
1410    my $self = shift (@_);
1411    my ($section, $text) = @_;
1412   
1413    $self->add_utf8_textref($section, \$text);
1414}
1415
1416# Pass by reference version, used by GreenstoneSQLPlugin for fulltext
1417sub add_utf8_textref {
1418    my $self = shift (@_);
1419    my ($section, $text_ref) = @_;
1420
1421    my $section_ptr = $self->_lookup_section($section);
1422    if (!defined $section_ptr) {
1423    print STDERR "doc::add_utf8_textref couldn't find section " .
1424        "$section\n";
1425    return;
1426    }
1427
1428    $section_ptr->{'text'} .= $$text_ref;
1429}
1430
1431# returns the Source meta, which is the utf8 filename generated.
1432# Added a separate method here for convenience
1433sub get_source {
1434    my $self = shift (@_);
1435    return $self->get_metadata_element ($self->get_top_section(), "Source");
1436}
1437
1438# returns the SourceFile meta, which is the url reference to the URL-encoded
1439# version of Source (the utf8 filename). Added a separate method here for convenience
1440sub get_sourcefile {
1441    my $self = shift (@_);
1442    return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1443}
1444
1445# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1446# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1447sub get_assocfile_from_sourcefile {
1448    my $self = shift (@_);
1449   
1450    # get the SourceFile meta, which is a *URL* to a file on the filesystem
1451    my $top_section = $self->get_top_section();
1452    my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1453
1454    # get the actual filename as it exists on the filesystem which this url refers to
1455    $source_file = &unicode::url_to_filename($source_file);
1456    my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1457    return $assocfilename;
1458}
1459
1460# methods for dealing with associated files
1461
1462# a file is associated with a document, NOT a section.
1463# if section is defined it is noted in the data structure
1464# only so that files associated from a particular section
1465# may be removed later (using delete_section_assoc_files)
1466sub associate_file {
1467    my $self = shift (@_);
1468    my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1469    $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1470
1471    # remove all associated files with the same name
1472    $self->delete_assoc_file ($assoc_filename);
1473
1474    # Too harsh a requirement
1475    # Definitely get HTML docs, for example, with some missing
1476    # support files
1477#    if (!&util::fd_exists($real_filename)) {
1478#   print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1479#   exit -1;
1480#    }
1481
1482#    print STDERR "**** is the following a UTF8 rep of *real* filename?\n   $real_filename\n";
1483#    print STDERR "****##### so, ensure it is before storing?!?!?\n";
1484##    my $utf8_filename = Encode::encode("utf8",$filename);
1485
1486    push (@{$self->{'associated_files'}},
1487      [$real_filename, $assoc_filename, $mime_type, $section]);
1488}
1489
1490# returns a list of associated files in the form
1491#   [[real_filename, assoc_filename, mimetype], ...]
1492sub get_assoc_files {
1493    my $self = shift (@_);
1494
1495    return $self->{'associated_files'};
1496}
1497
1498# the following two methods used to keep track of original associated files
1499# for incremental building. eg a txt file used by an item file does not end
1500# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1501# file for incremental build
1502sub associate_source_file {
1503    my $self = shift (@_);
1504    my ($full_filename) = @_;
1505
1506    push (@{$self->{'source_assoc_files'}}, $full_filename);
1507
1508}
1509
1510sub get_source_assoc_files {
1511    my $self = shift (@_);
1512
1513    return $self->{'source_assoc_files'};
1514 
1515
1516}
1517sub metadata_file {
1518    my $self = shift (@_);
1519    my ($real_filename, $filename) = @_;
1520   
1521    push (@{$self->{'metadata_files'}},
1522      [$real_filename, $filename]);
1523}
1524
1525# used for writing out the archiveinf-doc info database, to list all the metadata files
1526sub get_meta_files {
1527    my $self = shift (@_);
1528
1529    return $self->{'metadata_files'};
1530}
1531
1532sub delete_section_assoc_files {
1533    my $self = shift (@_);
1534    my ($section) = @_;
1535
1536    my $i=0;
1537    while ($i < scalar (@{$self->{'associated_files'}})) {
1538    if (defined $self->{'associated_files'}->[$i]->[3] &&
1539        $self->{'associated_files'}->[$i]->[3] eq $section) {
1540        splice (@{$self->{'associated_files'}}, $i, 1);
1541    } else {
1542        $i++;
1543    }
1544    }
1545}
1546
1547sub delete_assoc_file {
1548    my $self = shift (@_);
1549    my ($assoc_filename) = @_;
1550
1551    my $i=0;
1552    while ($i < scalar (@{$self->{'associated_files'}})) {
1553    if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1554        splice (@{$self->{'associated_files'}}, $i, 1);
1555    } else {
1556        $i++;
1557    }
1558    }
1559}
1560
1561sub reset_nextsection_ptr {
1562    my $self = shift (@_);
1563    my ($section) = @_;
1564   
1565    my $section_ptr = $self->_lookup_section($section);
1566    $section_ptr->{'next_subsection'} = 1;
1567}
1568
15691;
Note: See TracBrowser for help on using the browser.