root/main/trunk/greenstone2/perllib/doc.pm @ 33128

Revision 33128, 47.5 KB (checked in by wy59, 5 months ago)

Improvements to Coordinate support AND bugfixes. BUT not all the fixes may be ideal, many marked with TODO. 1. Now we support an Array of coordinates. At present these are only displayed as Markers, but in future shapes should appear as shapes. 2. Bugfixes include: (a) expanding sections wasn't working when we had hierarchical docs with Coordinate data, because map-scripts 'overrode' the toggleSection function but no longer did any of the doc expanding behaviour that document_scripts.js used to do. This was not a problem with the ImagesGPS collection, simply because that did not have hierarchical/sectionalised documents. (b) Perl: A previous commit output duplicate Coordinates into the index. Now this doesn't happen. Fix works but may not be ideal. 3. Perl: (a) Reserved index names CD, CS for Coordinate and CoordShort?. Note however that LAT and LNG were never added to reserve list for index names. (b) Now doc.pm::processCoord() takes a section parameter and works out the section-ptr from that.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr   te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use FileUtils;
40use ghtml;
41use File::stat;
42##use hashdoc;
43use docprint;
44use JSON;
45
46# We just need pi from the Trig lib
47# Import constants pi2, pip2, pip4 (2*pi, pi/2, pi/4).
48use Math::Trig ':pi';
49
50# the document type may be indexed_doc, nonindexed_doc, or
51# classification
52
53our $OIDcount = 0;
54
55# rename_method can be 'url', 'none', 'base64'
56sub new {
57    my $class = shift (@_);
58    my ($source_filename, $doc_type, $rename_method) = @_;
59
60
61    my $self = bless {'associated_files'=>[],
62              'subsection_order'=>[],
63              'next_subsection'=>1,
64              'subsections'=>{},
65              'metadata'=>[],
66              'text'=>"",
67              'OIDtype'=>"hash"}, $class;
68
69    # used to set lastmodified here, but this can screw up the HASH ids, so
70    # the docsave processor now calls set_lastmodified
71
72    $self->set_source_path($source_filename);
73   
74    if (defined $source_filename) {
75    $source_filename = &util::filename_within_collection($source_filename);
76    print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
77    $self->set_source_filename ($source_filename, $rename_method);
78    }
79
80    $self->set_doc_type ($doc_type) if defined $doc_type;
81
82    return $self;
83}
84
85
86sub set_source_path
87{
88    my $self = shift @_;
89    my ($source_filename) = @_;
90
91    if (defined $source_filename) {
92    # On Windows the source_filename can be in terse DOS format
93    # e.g. test~1.txt
94
95    $self->{'terse_source_path'} = $source_filename;
96
97        # Use the FileUtil library methods as they are aware of more special
98        # cases such as HDFS [jmt12]
99    if (&FileUtils::fileExists($source_filename))
100        {
101        # See if we can do better for Windows with a filename
102        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
103        require Win32;
104        $self->{'source_path'} = Win32::GetLongPathName($source_filename);
105        }
106        else {
107        # For Unix-based systems, there is no difference between the two
108        $self->{'source_path'} = $source_filename;
109        }
110    }
111    else {
112        print STDERR "Warning: In doc::set_source_path(), file\n";
113        print STDERR "           $source_filename\n";
114        print STDERR "         does not exist\n";
115       
116        # (default) Set it to whatever we were given
117        $self->{'source_path'} = $source_filename;
118    }   
119    }
120    else {
121    # Previous code for setting source_path allowed for
122    # it to be undefined, so continue this practice
123    $self->{'terse_source_path'} = undef;
124    $self->{'source_path'} = undef;
125    }
126}
127
128
129sub get_source_path
130{
131    my $self = shift @_;
132
133    return $self->{'terse_source_path'};
134}
135
136# set lastmodified for OAI purposes, added by GRB, moved by kjdon
137sub set_oailastmodified {
138    my $self = shift (@_);
139
140    my $source_path = $self->{'terse_source_path'};
141   
142    if (defined $source_path && (-e $source_path)) {
143    my $current_time = time;
144
145    my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
146        $wday, $yday, $isdst) = localtime($current_time);
147
148    my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
149
150    $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
151    $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
152    }
153}
154
155# no longer used for OAI purposes, since lastmodified is not what we want as the
156# Datestamp of a document. This doc metadata may be useful for general purposes.
157sub set_lastmodified {
158    my $self = shift (@_);
159
160    my $source_path = $self->{'terse_source_path'};
161   
162    if (defined $source_path && (-e $source_path)) {
163
164        my $file_stat = stat($source_path);
165    my $mtime = $file_stat->mtime;
166    my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
167        $wday, $yday, $isdst) = localtime($mtime);
168
169    my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
170
171    $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
172    $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
173    }
174}
175
176# clone the $self object
177sub duplicate {
178    my $self = shift (@_);
179
180    my $newobj = {};
181   
182    foreach my $k (keys %$self) {
183    $newobj->{$k} = &clone ($self->{$k});
184    }
185
186    bless $newobj, ref($self);
187    return $newobj;
188}
189
190sub clone {
191    my ($from) = @_;
192    my $type = ref ($from);
193
194    if ($type eq "HASH") {
195    my $to = {};
196    foreach my $key (keys %$from) {
197        $to->{$key} = &clone ($from->{$key});
198    }
199    return $to;
200    } elsif ($type eq "ARRAY") {
201    my $to = [];
202    foreach my $v (@$from) {
203        push (@$to, &clone ($v));
204    }
205    return $to;
206    } else {
207    return $from;
208    }
209}
210
211sub set_OIDtype {
212    my $self = shift (@_);
213    my ($type, $metadata) = @_;
214
215    if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
216    $self->{'OIDtype'} = $type;
217    } else {
218    $self->{'OIDtype'} = "hash";
219    }
220
221    if ($type =~ /^assigned$/) {
222    if (defined $metadata) {
223        $self->{'OIDmetadata'} = $metadata;
224    } else {
225        $self->{'OIDmetadata'} = "dc.Identifier";
226    }
227    }
228}
229
230# rename_method can be 'url', 'none', 'base64'
231sub set_source_filename {
232    my $self = shift (@_);
233    my ($source_filename, $rename_method) = @_;
234
235    # Since the gsdlsourcefilename element goes into the doc.xml it has
236    # to be utf8. However, it should also *represent* the source filename
237    # (in the import directory) which may not be utf8 at all.
238    # For instance, if this meta element (gsdlsourcefilename) will be used
239    # by other applications that parse doc.xml in order to locate
240    # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
241    # encode the real filename as this is a binary-to-text encoding meaning
242    # that the resulting string is ASCII (utf8). Decoding will give the original.
243   
244#    print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
245
246    # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
247    $source_filename = $self->encode_filename($source_filename, $rename_method);
248#    my ($srcfilename,$dirname,$suffix)
249#   = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
250#    print STDERR "-> $srcfilename -> ";
251#    $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
252#    $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
253#    print STDERR "$source_filename\n";
254   
255    $self->set_utf8_metadata_element ($self->get_top_section(),
256                 "gsdlsourcefilename",
257                 $source_filename);
258}
259
260sub encode_filename {
261    my $self = shift (@_);
262    my ($source_filename, $rename_method) = @_;
263
264     my ($srcfilename,$dirname,$suffix)
265    = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
266#    print STDERR "-> $srcfilename -> ";
267    $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
268    $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
269
270    return $source_filename;
271}
272
273sub set_converted_filename {
274    my $self = shift (@_);
275    my ($converted_filename) = @_;
276
277    # we know the converted filename is utf8
278    $self->set_utf8_metadata_element ($self->get_top_section(),
279                 "gsdlconvertedfilename",
280                 $converted_filename);
281}
282
283# returns the source_filename as it was provided
284sub get_unmodified_source_filename {
285    my $self = shift (@_);
286
287    return $self->{'terse_source_path'};
288}
289
290# returns the source_filename with whatever rename_method was given
291sub get_source_filename {
292    my $self = shift (@_);
293
294    return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
295}
296
297
298
299# returns converted filename if available else returns source filename
300sub get_filename_for_hashing {
301    my $self = shift (@_);
302
303    my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
304
305    if (!defined $filename) {
306    my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
307    # if NULPlug processed file, then don't give a filename
308    if (defined $plugin_name && $plugin_name eq "NULPlug") {
309        $filename = undef;
310    } else { # returns the URL encoded source filename!
311        $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
312    }
313    }
314
315    if (!&FileUtils::isFilenameAbsolute($filename)) {
316    $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
317    }
318
319    return $filename;
320}
321
322sub set_doc_type {
323    my $self = shift (@_);
324    my ($doc_type) = @_;
325
326    $self->set_metadata_element ($self->get_top_section(),
327                 "gsdldoctype",
328                 $doc_type);
329}
330
331# returns the gsdldoctype as it was provided
332# the default of "indexed_doc" is used if no document
333# type was provided
334sub get_doc_type {
335    my $self = shift (@_);
336
337    my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
338    return $doc_type if (defined $doc_type);
339    return "indexed_doc";
340}
341
342
343# look up the reference to the a particular section
344sub _lookup_section {
345    my $self = shift (@_);
346    my ($section) = @_;
347
348    my ($num);
349    my $sectionref = $self;
350
351    while (defined $section && $section ne "") {
352   
353    ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
354   
355    $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
356   
357    $section = "" unless defined $section;
358   
359
360    if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
361        $sectionref = $sectionref->{'subsections'}->{$num};
362    } else {
363        return undef;
364    }
365    }
366   
367    return $sectionref;
368}
369
370# calculate OID by hashing the contents of the document
371sub _calc_OID {
372    my $self = shift (@_);
373    my ($filename) = @_;
374
375
376    my $osexe = &util::get_os_exe();
377
378    my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
379                       $ENV{'GSDLOS'},"hashfile$osexe");
380
381    &util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
382    # The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
383
384    # A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
385    # (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
386    # E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
387
388    my $result = "NULL";
389
390   
391    if (-e "$hashfile_exe") {
392#   $result = `\"$hashfile_exe\" \"$filename\"`;
393#   $result = `hashfile$osexe \"$filename\" 2>&1`;
394    $result = `hashfile$osexe \"$filename\"`;
395
396    ($result) = $result =~ /:\s*([0-9a-f]+)/i;
397    } else {
398    print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
399    }
400    return "HASH$result";
401}
402
403# methods dealing with OID, not groups of them.
404
405# if $OID is not provided one is calculated
406sub set_OID {
407    my $self = shift (@_);
408    my ($OID) = @_;
409   
410    my $use_hash_oid = 0;
411    # if an OID wasn't provided calculate one
412    if (!defined $OID) {
413    $OID = "NULL";
414    if ($self->{'OIDtype'} =~ /^hash/) {
415        $use_hash_oid = 1;
416    } elsif ($self->{'OIDtype'} eq "incremental") {
417        $OID = "D" . $OIDcount;
418        $OIDcount ++;
419    } elsif ($self->{'OIDtype'} eq "filename") {
420        my $filename = $self->get_source_filename();
421        $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
422        $OID = &util::tidy_up_oid($OID);
423    } elsif ($self->{'OIDtype'} eq "full_filename") {
424        my $source_filename = $self->get_source_filename();
425        my $dirsep = &util::get_os_dirsep();
426
427        $source_filename =~ s/^import$dirsep//;
428        $source_filename =~ s/$dirsep/-/g;
429        $source_filename =~ s/\./_/g;
430
431        $OID = $source_filename;
432        $OID = &util::tidy_up_oid($OID);
433    } elsif ($self->{'OIDtype'} eq "dirname") {
434        my $filename = $self->get_source_filename();
435        if (defined($filename)) { # && -e $filename) {
436        # get the immediate parent directory
437        $OID = &File::Basename::dirname($filename);
438        if (defined $OID) {
439            $OID = &File::Basename::basename($OID);
440            $OID = &util::tidy_up_oid($OID);
441        } else {
442            print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
443            $use_hash_oid = 1;
444        }
445        } else {
446        print STDERR "Failed to find a filename, generating hash id\n";
447        $use_hash_oid = 1;
448        }   
449       
450    } elsif ($self->{'OIDtype'} eq "assigned") {
451        my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
452        if (defined $identifier && $identifier ne "") {
453        $OID = $identifier;
454        $OID = &util::tidy_up_oid($OID);
455        } else {
456        # need a hash id
457        print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
458        $use_hash_oid = 1;
459        }
460               
461    } else {
462        $use_hash_oid = 1;
463    }
464
465    if ($use_hash_oid) {
466        my $hash_on_file = 1;
467        my $hash_on_ga_xml = 0;
468
469        if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
470        $hash_on_file = 0;
471        $hash_on_ga_xml = 1;
472        }
473
474        if ($self->{'OIDtype'} eq "hash_on_full_filename") {
475        $hash_on_file = 0;
476        $hash_on_ga_xml = 0;
477
478        my $source_filename = $self->get_source_filename();
479        my $dirsep = &util::get_os_dirsep();
480
481        $source_filename =~ s/^import$dirsep//;
482        $source_filename =~ s/$dirsep/-/g;
483        $source_filename =~ s/\./_/g;
484       
485        # If the filename is very short then (handled naively)
486        # this can cause conjestion in the hash-values
487        # computed, leading documents sharing the same leading
488        # Hex values in the computed has.
489        #
490        # The solution taken here is to replace the name of
491        # the file name a sufficient number of times (up to
492        # the character limit defined in 'rep_limit' and
493        # make that the content that is hashed on
494
495        # *** Think twice before changing the following value
496        # as it will break backward compatability of computed
497        # document HASH values
498
499        my $rep_limit  = 256;
500        my $hash_content = undef;
501
502        if (length($source_filename)<$rep_limit) {
503            my $rep_string = "$source_filename|";
504            my $rs_len = length($rep_string);
505
506            my $clone_times = int(($rep_limit-1)/$rs_len) +1;
507           
508            $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
509        }
510        else {
511            $hash_content = $source_filename;
512        }
513
514        my $filename = &util::get_tmp_filename();
515        if (!open (OUTFILE, ">:utf8", $filename)) {
516            print STDERR "doc::set_OID could not write to $filename\n";
517        } else {
518            print OUTFILE $hash_content;
519            close (OUTFILE);
520        }
521        $OID = $self->_calc_OID ($filename);
522
523        &FileUtils::removeFiles ($filename);
524        }
525
526        if ($hash_on_file) {
527        # "hash" OID - feed file to hashfile.exe
528        my $filename = $self->get_filename_for_hashing();
529       
530        # -z: don't want to hash on the file if it is zero size
531        if (defined($filename) && -e $filename && !-z $filename) {
532            $OID = $self->_calc_OID ($filename);
533        } else {
534            $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
535        }
536        }
537
538        if ($hash_on_ga_xml) {
539        # In addition being asked to explicity calculate the has based on the GA file,
540        # can also end up coming into this block is doing 'hash_on_file' but the file
541        # itself is of zero bytes (as could be the case with 'doc.nul' file
542
543        my $filename = &util::get_tmp_filename();
544        if (!open (OUTFILE, ">:utf8", $filename)) {
545            print STDERR "doc::set_OID could not write to $filename\n";
546        } else {
547            my $doc_text = &docprint::get_section_xml($self);
548            print OUTFILE $doc_text;
549            close (OUTFILE);
550        }
551        $OID = $self->_calc_OID ($filename);
552        &FileUtils::removeFiles($filename);
553        }
554    }
555    }
556    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
557}
558
559# this uses hashdoc (embedded c thingy) which is faster but still
560# needs a little work to be sufficiently stable
561sub ___set_OID {
562    my $self = shift (@_);
563    my ($OID) = @_;
564
565    # if an OID wasn't provided then calculate hash value based on document
566    if (!defined $OID)
567    {
568    my $hash_text = &docprint::get_section_xml($self);
569    my $hash_len = length($hash_text);
570
571        $OID = &hashdoc::buffer($hash_text,$hash_len);
572    }
573
574    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
575}
576
577# returns the OID for this document
578sub get_OID {
579    my $self = shift (@_);
580    my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
581    return $OID if (defined $OID);
582    return "NULL";
583}
584
585sub delete_OID {
586    my $self = shift (@_);
587   
588    $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
589}
590
591
592# methods for manipulating section names
593
594# returns the name of the top-most section (the top
595# level of the document
596sub get_top_section {
597    my $self = shift (@_);
598   
599    return "";
600}
601
602# returns a section
603sub get_parent_section {
604    my $self = shift (@_);
605    my ($section) = @_;
606
607    $section =~ s/(^|\.)\d+$//;
608
609    return $section;
610}
611
612# returns the first child section (or the end child
613# if there isn't any)
614sub get_begin_child {
615    my $self = shift (@_);
616    my ($section) = @_;
617
618    my $section_ptr = $self->_lookup_section($section);
619    return "" unless defined $section_ptr;
620
621    if (defined $section_ptr->{'subsection_order'}->[0]) {
622    return "$section.$section_ptr->{'subsection_order'}->[0]";
623    }
624
625    return $self->get_end_child ($section);
626}
627
628# returns the next child of a parent section
629sub get_next_child {
630    my $self = shift (@_);
631    my ($section) = @_;
632   
633    my $parent_section = $self->get_parent_section($section);
634    my $parent_section_ptr = $self->_lookup_section($parent_section);
635    return undef unless defined $parent_section_ptr;
636
637    my ($section_num) = $section =~ /(\d+)$/;
638    return undef unless defined $section_num;
639
640    my $i = 0;
641    my $section_order = $parent_section_ptr->{'subsection_order'};
642    while ($i < scalar(@$section_order)) {
643    last if $section_order->[$i] eq $section_num;
644    $i++;
645    }
646
647    $i++; # the next child
648    if ($i < scalar(@$section_order)) {
649    return $section_order->[$i] if $parent_section eq "";
650    return "$parent_section.$section_order->[$i]";
651    }
652
653    # no more sections in this level
654    return undef;
655}
656
657# returns a reference to a list of children
658sub get_children {
659    my $self = shift (@_);
660    my ($section) = @_;
661
662    my $section_ptr = $self->_lookup_section($section);
663    return [] unless defined $section_ptr;
664
665    my @children = @{$section_ptr->{'subsection_order'}};
666
667    map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
668    return \@children;
669}
670
671# returns the child section one past the last one (which
672# is coded as "0")
673sub get_end_child {
674    my $self = shift (@_);
675    my ($section) = @_;
676
677    return $section . ".0" unless $section eq "";
678    return "0";
679}
680
681# returns the next section in book order
682sub get_next_section {
683    my $self = shift (@_);
684    my ($section) = @_;
685
686    return undef unless defined $section;
687
688    my $section_ptr = $self->_lookup_section($section);
689    return undef unless defined $section_ptr;
690
691    # first try to find first child
692    if (defined $section_ptr->{'subsection_order'}->[0]) {
693    return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
694    return "$section.$section_ptr->{'subsection_order'}->[0]";
695    }
696
697    do {
698    # try to find sibling
699    my $next_child = $self->get_next_child ($section);
700    return $next_child if (defined $next_child);
701
702    # move up one level
703    $section = $self->get_parent_section ($section);
704    } while $section =~ /\d/;
705
706    return undef;
707}
708
709sub is_leaf_section {
710    my $self = shift (@_);
711    my ($section) = @_;
712
713    my $section_ptr = $self->_lookup_section($section);
714    return 1 unless defined $section_ptr;
715
716    return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
717}
718
719# methods for dealing with sections
720
721# returns the name of the inserted section
722sub insert_section {
723    my $self = shift (@_);
724    my ($before_section) = @_;
725
726    # get the child to insert before and its parent section
727    my $parent_section = "";
728    my $before_child = "0";
729    my @before_section = split (/\./, $before_section);
730    if (scalar(@before_section) > 0) {
731    $before_child = pop (@before_section);
732    $parent_section = join (".", @before_section);
733    }
734
735    my $parent_section_ptr = $self->_lookup_section($parent_section);
736    if (!defined $parent_section_ptr) {
737    print STDERR "doc::insert_section couldn't find parent section " .
738        "$parent_section\n";
739    return;
740    }
741
742    # get the next section number
743    my $section_num = $parent_section_ptr->{'next_subsection'}++;
744
745    my $i = 0;
746    while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
747       $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
748    $i++;
749    }
750   
751    # insert the section number into the order list
752    splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
753
754    # add this section to the parent section
755    my $section_ptr = {'subsection_order'=>[],
756               'next_subsection'=>1,
757               'subsections'=>{},
758               'metadata'=>[],
759               'text'=>""};
760    $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
761
762    # work out the full section number
763    my $section = $parent_section;
764    $section .= "." unless $section eq "";
765    $section .= $section_num;
766   
767    return $section;
768}
769
770# creates a pre-named section
771sub create_named_section {
772    my $self = shift (@_);
773    my ($mastersection) = @_;
774
775    my ($num);
776    my $section = $mastersection;
777    my $sectionref = $self;
778
779    while ($section ne "") {
780    ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
781    $num =~ s/^0+(\d)/$1/; # remove leading 0s
782    $section = "" unless defined $section;
783   
784    if (defined $num) {
785        if (!defined $sectionref->{'subsections'}->{$num}) {
786        push (@{$sectionref->{'subsection_order'}}, $num);
787        $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
788                            'next_subsection'=>1,
789                            'subsections'=>{},
790                            'metadata'=>[],
791                            'text'=>""};
792        if ($num >= $sectionref->{'next_subsection'}) {
793            $sectionref->{'next_subsection'} = $num + 1;
794        }
795        }
796        $sectionref = $sectionref->{'subsections'}->{$num};
797
798    } else {
799        print STDERR "doc::create_named_section couldn't create section ";
800        print STDERR "$mastersection\n";
801        last;
802    }
803    }
804}
805
806# returns a reference to a list of subsections
807sub list_subsections {
808    my $self = shift (@_);
809    my ($section) = @_;
810
811    my $section_ptr = $self->_lookup_section ($section);
812    if (!defined $section_ptr) {
813    print STDERR "doc::list_subsections couldn't find section $section\n";
814    return [];
815    }
816
817    return [@{$section_ptr->{'subsection_order'}}];
818}
819
820sub delete_section {
821    my $self = shift (@_);
822    my ($section) = @_;
823
824#    my $section_ptr = {'subsection_order'=>[],
825#              'next_subsection'=>1,
826#              'subsections'=>{},
827#              'metadata'=>[],
828#              'text'=>""};
829
830    # if this is the top section reset everything
831    if ($section eq "") {
832    $self->{'subsection_order'} = [];
833    $self->{'subsections'} = {};
834    $self->{'metadata'} = [];
835    $self->{'text'} = "";
836    return;
837    }
838
839    # find the parent of the section to delete
840    my $parent_section = "";
841    my $child = "0";
842    my @section = split (/\./, $section);
843    if (scalar(@section) > 0) {
844    $child = pop (@section);
845    $parent_section = join (".", @section);
846    }
847
848    my $parent_section_ptr = $self->_lookup_section($parent_section);
849    if (!defined $parent_section_ptr) {
850    print STDERR "doc::delete_section couldn't find parent section " .
851        "$parent_section\n";
852    return;
853    }
854
855    # remove this section from the subsection_order list
856    my $i = 0;
857    while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
858    if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
859        splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
860        last;
861    }
862    $i++;
863    }
864
865    # remove this section from the subsection hash
866    if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
867    undef $parent_section_ptr->{'subsections'}->{$child};
868    }
869}
870
871#--
872# methods for dealing with metadata
873
874# set_metadata_element and get_metadata_element are for metadata
875# which should only have one value. add_meta_data and get_metadata
876# are for metadata which can have more than one value.
877
878# returns the first metadata value which matches field
879
880# This version of get metadata element works much like the one above,
881# except it allows for the namespace portion of a metadata element to
882# be ignored, thus if you are searching for dc.Title, the first piece
883# of matching metadata ending with the name Title (once any namespace
884# is removed) would be returned.
885# 28-11-2003 John Thompson
886sub get_metadata_element {
887    my $self = shift (@_);
888    my ($section, $field, $ignore_namespace) = @_;
889    my ($data);
890
891    $ignore_namespace = 0 unless defined $ignore_namespace;
892
893    my $section_ptr = $self->_lookup_section($section);
894    if (!defined $section_ptr) {
895    print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
896    return;
897    }
898
899    # Remove any namespace if we are being told to ignore them
900    if($ignore_namespace) {
901    $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
902    }
903
904    foreach $data (@{$section_ptr->{'metadata'}}) {
905
906    my $data_name = $data->[0];
907
908    # Remove any namespace if we are being told to ignore them
909    if($ignore_namespace) {
910        $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
911    }
912    # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
913    $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
914   
915    return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
916    }
917   
918    return undef; # was not found
919}
920
921# returns a list of the form [value1, value2, ...]
922sub get_metadata {
923    my $self = shift (@_);
924    my ($section, $field, $ignore_namespace) = @_;
925    my ($data);
926
927    $ignore_namespace = 0 unless defined $ignore_namespace;
928
929    my $section_ptr = $self->_lookup_section($section);
930    if (!defined $section_ptr) {
931        print STDERR "doc::get_metadata couldn't find section ",
932        $section, "\n";
933        return;
934    }
935
936    # Remove any namespace if we are being told to ignore them
937    if($ignore_namespace) {
938    $field =~ s/^.*\.//;
939    }
940
941    my @metadata = ();
942    foreach $data (@{$section_ptr->{'metadata'}}) {
943
944    my $data_name = $data->[0];
945
946    # Remove any namespace if we are being told to ignore them
947    if($ignore_namespace) {
948        $data_name =~ s/^.*\.//;
949    }   
950    # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
951    $data_name =~ s/^ex\.([^.]+)$/$1/;
952
953        push (@metadata, $data->[1]) if ($data_name eq $field);
954    }
955
956    return \@metadata;
957}
958
959sub get_metadata_hashmap {
960    my $self = shift (@_);
961    my ($section, $opt_namespace) = @_;
962   
963    my $section_ptr = $self->_lookup_section($section);
964    if (!defined $section_ptr) {
965            print STDERR "doc::get_metadata couldn't find section ",
966            $section, "\n";
967            return;
968        }
969
970    my $metadata_hashmap = {};
971    foreach my $data (@{$section_ptr->{'metadata'}}) {
972            my $metaname = $data->[0];
973         
974            if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
975                if (!defined $metadata_hashmap->{$metaname}) {
976                    $metadata_hashmap->{$metaname} = [];
977                  }
978                my $metaval_list = $metadata_hashmap->{$metaname};
979                push(@$metaval_list, $data->[1]);
980              }
981          }
982   
983    return $metadata_hashmap;
984}
985
986# returns a list of the form [[field,value],[field,value],...]
987sub get_all_metadata {
988    my $self = shift (@_);
989    my ($section) = @_;
990
991    my $section_ptr = $self->_lookup_section($section);
992    if (!defined $section_ptr) {
993    print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
994    return;
995    }
996   
997    return $section_ptr->{'metadata'};
998}
999
1000# $value is optional
1001sub delete_metadata {
1002    my $self = shift (@_);
1003    my ($section, $field, $value) = @_;
1004
1005    my $section_ptr = $self->_lookup_section($section);
1006    if (!defined $section_ptr) {
1007    print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
1008    return;
1009    }
1010
1011    my $i = 0;
1012    while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1013    if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1014        (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1015        splice (@{$section_ptr->{'metadata'}}, $i, 1);
1016    } else {
1017        $i++;
1018    }
1019    }
1020}
1021
1022sub delete_all_metadata {
1023    my $self = shift (@_);
1024    my ($section) = @_;
1025
1026    my $section_ptr = $self->_lookup_section($section);
1027    if (!defined $section_ptr) {
1028    print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1029    return;
1030    }
1031   
1032    $section_ptr->{'metadata'} = [];
1033}
1034
1035sub set_metadata_element {
1036    my $self = shift (@_);
1037    my ($section, $field, $value) = @_;
1038
1039    $self->set_utf8_metadata_element ($section, $field,
1040                      &unicode::ascii2utf8(\$value));
1041}
1042
1043# set_utf8_metadata_element assumes the text has already been
1044# converted to the UTF-8 encoding.
1045sub set_utf8_metadata_element {
1046    my $self = shift (@_);
1047    my ($section, $field, $value) = @_;
1048   
1049    $self->delete_metadata ($section, $field);
1050    $self->add_utf8_metadata ($section, $field, $value);
1051}
1052
1053
1054# add_metadata assumes the text is in (extended) ascii form. For
1055# text which has already been converted to the UTF-8 format use
1056# add_utf8_metadata.
1057sub add_metadata {
1058    my $self = shift (@_);
1059    my ($section, $field, $value) = @_;
1060   
1061    $self->add_utf8_metadata ($section, $field,
1062                  &unicode::ascii2utf8(\$value));
1063}
1064
1065sub add_utf8_metadata {
1066    my $self = shift (@_);
1067    my ($section, $field, $value) = @_;
1068   
1069    #    my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1070    #    my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1071    #    print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1072
1073    my $section_ptr = $self->_lookup_section($section);
1074    if (!defined $section_ptr) {
1075    print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1076    return;
1077    }
1078    if (!defined $value) {
1079    print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1080    return;
1081    }
1082    if (!defined $field) {
1083    print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1084    return;
1085    }
1086   
1087    #print STDERR "###$field=$value\n";
1088
1089    # For now, supress this check.  Given that text data read in is now
1090    # Unicode aware, then the following block of code can (ironically enough)
1091    # cause our unicode compliant string to be re-encoded (leading to
1092    # a double-encoded UTF-8 string, which we definitely don't want!).
1093   
1094
1095    # double check that the value is utf-8
1096    #    if (!&unicode::check_is_utf8($value)) {
1097    #   print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1098    #   &unicode::ensure_utf8(\$value);
1099    #   print STDERR " Tried converting to utf8: $value\n";
1100    #    }
1101
1102    #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1103    if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1104    {
1105            my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1106        if (defined $mdprefix) {
1107            # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1108            push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1109        }
1110
1111        my $direction;
1112        if($value =~ m/^-/)
1113        {
1114            $direction = ($field eq "Latitude") ? "S" : "W";
1115        }
1116        else
1117        {
1118            $direction = ($field eq "Latitude") ? "N" : "E";
1119        }
1120       
1121        my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1122        if(defined $beforeDec && defined $afterDec)
1123        {
1124            my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1125            push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1126           
1127            for(my $i = 2; $i <= 4; $i++)
1128            {
1129                if(length($afterDec) >= $i)
1130                {
1131                    push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1132                }
1133            }
1134           
1135            #Only add the metadata if it has not already been added
1136            my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no side-effects.)
1137        }
1138       
1139        if($field =~ m/^(.+\.)?Longitude$/) {
1140            # if we are dealing with Longitude meta, we should 1. have Latitude meta too; 2. already have processed Latitude meta
1141            # in that case, add both Lat and Lng of this section as a Coordinate meta
1142            my $latitude = $self->get_metadata_element ($section, "Latitude");         
1143            # TODO: would like all Longitude info together followed by all Coordinate info, but the following will add all coord info meta and end of this function will add Longitude meta
1144            $self->processCoordinate($section, $latitude, $value); # value is Longitude
1145        }
1146    }
1147
1148    elsif($field eq "GPS.mapOverlay") { # then the value is a JSON string
1149   
1150        # TODO:
1151        # If we already have Coordinate meta for this section of the document (as can happen during buildcol.pl),
1152        # let's ASSUME this means we've already processed GPS.mapOverlay meta into Coordinate meta for this section (can have happened during import.pl)
1153        # to avoid adding duplicate Coordinates meta, which then end up duplicated in the index
1154        # Of course, the assumption is not always true! We could have an image with embedded Lat and Lng meta,
1155        # and the same image doc's section could have GPS.mapOverlay meta (from shapes) added via the doc editor.
1156        # This very function would then have converted Lat/Lng into Coordinate meta (just in the if stmt above) and added it to the section.
1157        # And then by the time we process this section's GPS.mapOverlay meta here, we would notice the section has Coordinate meta already,
1158        # and therefore skip converting the GPS.mapOverlay meta into Coordinate meta! What to dooooo?
1159        # So the return statement immediately below is a temporary solution, until we find a better one that will always work.
1160        my $metaMap = $self->get_metadata_hashmap($section);
1161        if($metaMap->{'Coordinate'}) {
1162            return;
1163        }
1164       
1165        print STDERR "GPS.mapOverlay before val: " . $value . "\n";
1166       
1167        # TODO html decode?
1168        $value =~ s@&#091;@[@g;
1169        $value =~ s@&#093;@]@g;
1170        $value =~ s@&quot;@"@g;
1171        print STDERR "GPS.mapOverlay after val: " . $value . "\n";
1172
1173        my $json_array = decode_json $value;
1174        #my $json = JSON->new->allow_nonref;
1175        #&printAllShapes($json, $json_array);
1176
1177        foreach my $shape (@$json_array) {     
1178
1179            my $type = $shape->{"type"};
1180            print STDERR "Type : " . $type . "\n";
1181       
1182            if($type eq "circle") {
1183                #print STDERR "Found a circle:\n" . &printShape($json, $shape);
1184       
1185                # work out bounding box
1186                # SCARY!
1187                # want the inverse of this useful page:
1188                # https://stackoverflow.com/questions/639695/how-to-convert-latitude-or-longitude-to-meters
1189                # https://www.geodatasource.com/developers/javascript       
1190               
1191                           
1192                # for now, just process the circle centre
1193                #my $centre = $shape->{"center"};               
1194                #$self->processLatOrLng($section_ptr, "Latitude", $centre->{"lat"});
1195                #$self->processLatOrLng($section_ptr, "Longitude", $centre->{"lng"});
1196
1197               
1198                # Dr Bainbridge wants us to use: https://gis.stackexchange.com/questions/5821/calculating-latitude-longitude-x-miles-from-point
1199                # But we're using the rule of thumb here, since for N,E,S,W it works out the same:
1200                # https://gis.stackexchange.com/questions/2951/algorithm-for-offsetting-a-latitude-longitude-by-some-amount-of-meters
1201                # which states
1202                # "If your displacements aren't too great (less than a few kilometers) and you're not right at the poles,
1203                # use the quick and dirty estimate that 111,111 meters (111.111 km) in the y direction is 1 degree
1204                # (of latitude) and 111,111 * cos(latitude) meters in the x direction is 1 degree (of longitude)."
1205                my $centre_lat = $shape->{"center"}->{"lat"};
1206                my $centre_lng = $shape->{"center"}->{"lng"};
1207                my $radius = $shape->{"radius"}; # in metres!
1208
1209                print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
1210
1211                my $lat_north = $centre_lat + ($radius/111111);
1212                my $lat_south = $centre_lat - ($radius/111111);
1213               
1214                print STDERR "### lat_north:  $lat_north\n";
1215                print STDERR "### lat_south:  $lat_south\n";
1216
1217                # our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
1218                # all expect their angle to be in radians. So need to convert from degree to radians before we can take the cose of it.
1219                my $centre_lat_radians = $self->degreesToRadians($centre_lat);
1220                my $cos_in_radians = cos($centre_lat_radians);             
1221                print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
1222                my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
1223                my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
1224                print STDERR "### lng_east  $lng_east\n";
1225                print STDERR "### lng_west  $lng_west\n";
1226
1227
1228
1229                my $cos_lat = cos($centre_lat);             
1230                print STDERR "cos $centre_lat is $cos_lat\n";
1231
1232                $self->processCoordinate($section, $lat_north, $lng_east);
1233                $self->processCoordinate($section, $lat_south, $lng_east);
1234                $self->processCoordinate($section, $lat_south, $lng_west);
1235                $self->processCoordinate($section, $lat_north, $lng_west);
1236           
1237            }
1238            elsif ($type eq "marker") {
1239                print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
1240                print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
1241                $self->processCoordinate($section, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});               
1242            }
1243            elsif ($type eq "polyline" || $type eq "polygon") {
1244                my $path_array = $shape->{"path"};
1245                foreach my $position (@$path_array) {                                   
1246                    $self->processCoordinate($section, $position->{"lat"}, $position->{"lng"});
1247                }
1248            }
1249            elsif ($type eq "rectangle") {
1250       
1251                my $bounds = $shape->{"bounds"};               
1252           
1253                $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"east"});
1254                $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"east"});
1255                $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"west"});
1256                $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"west"});
1257            }   
1258       
1259        } # end for on each shape in GPS.mapOverlay
1260
1261    } # end GPS.mapOverlay meta
1262
1263    push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1264}
1265
1266# https://en.wikipedia.org/wiki/Radian
1267sub degreesToRadians
1268{
1269    my $self = shift (@_);
1270    my ($degrees) = @_;
1271
1272    return $degrees * pi /180; # returns radians
1273}
1274
1275sub radiansToDegrees
1276{
1277    my $self = shift (@_);
1278    my ($radians) = @_;
1279
1280    return $radians * 180 / pi; # returns degrees
1281}
1282
1283sub printAllShapes {
1284    my ($json, $json_array) = @_;
1285   
1286   
1287    #my $pretty_print_shape = $json->pretty->encode( $json_array->[0] );
1288    foreach my $shape (@$json_array) {
1289        my $pretty_print_shape = $json->pretty->encode( $shape );
1290        print STDERR "Shape: $pretty_print_shape\n";
1291        #&printShape($shape);
1292    }
1293   
1294}   
1295
1296sub processCoordinate {
1297    my $self = shift (@_);
1298    my ($section, $latitude, $longitude) = @_;
1299
1300    my $section_ptr = $self->_lookup_section($section);
1301
1302    my $lat_direction = ($latitude =~ m/^-/) ? "S" : "N";
1303    my $lng_direction = ($longitude =~ m/^-/) ? "W" : "E";
1304   
1305    # have to store (lat, lng) in pairs, when there are so many coords to store
1306    #push (@{$section_ptr->{'metadata'}}, ["Latitude", $latitude]);
1307    #push (@{$section_ptr->{'metadata'}}, ["Longitude", $longitude]);
1308
1309    push (@{$section_ptr->{'metadata'}}, ["Coordinate", "$latitude $longitude"]); # "$latitude$lat_direction $longitude$lng_direction"
1310
1311    my ($latBeforeDec, $latAfterDec);
1312    my ($lngBeforeDec, $lngAfterDec);
1313
1314    if($latitude !~ m/\./) {
1315        $latBeforeDec = $latitude;
1316        $latAfterDec = "";
1317    } else {
1318        ($latBeforeDec, $latAfterDec) = ($latitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1319    }
1320    if($longitude !~ m/\./) {
1321        $lngBeforeDec = $longitude;
1322        $lngAfterDec = "";
1323    } else {
1324        ($lngBeforeDec, $lngAfterDec) = ($longitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1325    }   
1326   
1327    #if(defined $beforeDec && defined $afterDec)
1328    #{
1329        my $name = "CoordShort";
1330        push (@{$section_ptr->{'metadata'}}, [$name, "$latBeforeDec$lat_direction $lngBeforeDec$lng_direction"]);
1331           
1332        for(my $i = 2; $i <= 4; $i++)
1333        {
1334            my $latDecPlaces = (length($latAfterDec) >= $i) ? substr($latAfterDec, 0, $i) : "";         
1335            my $lngDecPlaces = (length($lngAfterDec) >= $i) ? substr($lngAfterDec, 0, $i) : "";
1336
1337            push (@{$section_ptr->{'metadata'}}, [$name,
1338                    $latBeforeDec . $lat_direction. $latDecPlaces . " " . $lngBeforeDec . $lng_direction. $lngDecPlaces]);
1339           
1340        }
1341           
1342        #Only add the metadata if it has not already been added
1343        #my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no side-effects.)
1344    #}
1345
1346
1347}
1348
1349
1350# methods for dealing with text
1351
1352# returns the text for a section
1353sub get_text {
1354    my $self = shift (@_);
1355    my ($section) = @_;
1356
1357    my $section_ptr = $self->_lookup_section($section);
1358    if (!defined $section_ptr) {
1359    print STDERR "doc::get_text couldn't find section " .
1360        "$section\n";
1361    return "";
1362    }
1363
1364    return $section_ptr->{'text'};
1365}
1366
1367# returns the (utf-8 encoded) length of the text for a section
1368sub get_text_length {
1369    my $self = shift (@_);
1370    my ($section) = @_;
1371
1372    my $section_ptr = $self->_lookup_section($section);
1373    if (!defined $section_ptr) {
1374    print STDERR "doc::get_text_length couldn't find section " .
1375        "$section\n";
1376    return 0;
1377    }
1378
1379    return length ($section_ptr->{'text'});
1380}
1381
1382# returns the total length for all the sections
1383sub get_total_text_length {
1384    my $self = shift (@_);
1385
1386    my $section = $self->get_top_section();
1387    my $length = 0;
1388    while (defined $section) {
1389    $length += $self->get_text_length($section);
1390    $section = $self->get_next_section($section);
1391    }
1392    return $length;
1393}
1394
1395sub delete_text {
1396    my $self = shift (@_);
1397    my ($section) = @_;
1398
1399    my $section_ptr = $self->_lookup_section($section);
1400    if (!defined $section_ptr) {
1401    print STDERR "doc::delete_text couldn't find section " .
1402        "$section\n";
1403    return;
1404    }
1405
1406    $section_ptr->{'text'} = "";
1407}
1408
1409# add_text assumes the text is in (extended) ascii form. For
1410# text which has been already converted to the UTF-8 format
1411# use add_utf8_text.
1412sub add_text {
1413    my $self = shift (@_);
1414    my ($section, $text) = @_;
1415
1416    # convert the text to UTF-8 encoded unicode characters
1417    # and add the text
1418    $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1419}
1420
1421
1422# add_utf8_text assumes the text to be added has already
1423# been converted to the UTF-8 encoding. For ascii text use
1424# add_text
1425# Pass by value version (internally calls pass by ref version
1426# to avoid code duplication)
1427sub add_utf8_text {
1428    my $self = shift (@_);
1429    my ($section, $text) = @_;
1430   
1431    $self->add_utf8_textref($section, \$text);
1432}
1433
1434# Pass by reference version, used by GreenstoneSQLPlugin for fulltext
1435sub add_utf8_textref {
1436    my $self = shift (@_);
1437    my ($section, $text_ref) = @_;
1438
1439    my $section_ptr = $self->_lookup_section($section);
1440    if (!defined $section_ptr) {
1441    print STDERR "doc::add_utf8_textref couldn't find section " .
1442        "$section\n";
1443    return;
1444    }
1445
1446    $section_ptr->{'text'} .= $$text_ref;
1447}
1448
1449# returns the Source meta, which is the utf8 filename generated.
1450# Added a separate method here for convenience
1451sub get_source {
1452    my $self = shift (@_);
1453    return $self->get_metadata_element ($self->get_top_section(), "Source");
1454}
1455
1456# returns the SourceFile meta, which is the url reference to the URL-encoded
1457# version of Source (the utf8 filename). Added a separate method here for convenience
1458sub get_sourcefile {
1459    my $self = shift (@_);
1460    return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1461}
1462
1463# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1464# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1465sub get_assocfile_from_sourcefile {
1466    my $self = shift (@_);
1467   
1468    # get the SourceFile meta, which is a *URL* to a file on the filesystem
1469    my $top_section = $self->get_top_section();
1470    my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1471
1472    # get the actual filename as it exists on the filesystem which this url refers to
1473    $source_file = &unicode::url_to_filename($source_file);
1474    my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1475    return $assocfilename;
1476}
1477
1478# methods for dealing with associated files
1479
1480# a file is associated with a document, NOT a section.
1481# if section is defined it is noted in the data structure
1482# only so that files associated from a particular section
1483# may be removed later (using delete_section_assoc_files)
1484sub associate_file {
1485    my $self = shift (@_);
1486    my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1487    $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1488
1489    # remove all associated files with the same name
1490    $self->delete_assoc_file ($assoc_filename);
1491
1492    # Too harsh a requirement
1493    # Definitely get HTML docs, for example, with some missing
1494    # support files
1495#    if (!&util::fd_exists($real_filename)) {
1496#   print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1497#   exit -1;
1498#    }
1499
1500#    print STDERR "**** is the following a UTF8 rep of *real* filename?\n   $real_filename\n";
1501#    print STDERR "****##### so, ensure it is before storing?!?!?\n";
1502##    my $utf8_filename = Encode::encode("utf8",$filename);
1503
1504    push (@{$self->{'associated_files'}},
1505      [$real_filename, $assoc_filename, $mime_type, $section]);
1506}
1507
1508# returns a list of associated files in the form
1509#   [[real_filename, assoc_filename, mimetype], ...]
1510sub get_assoc_files {
1511    my $self = shift (@_);
1512
1513    return $self->{'associated_files'};
1514}
1515
1516# the following two methods used to keep track of original associated files
1517# for incremental building. eg a txt file used by an item file does not end
1518# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1519# file for incremental build
1520sub associate_source_file {
1521    my $self = shift (@_);
1522    my ($full_filename) = @_;
1523
1524    push (@{$self->{'source_assoc_files'}}, $full_filename);
1525
1526}
1527
1528sub get_source_assoc_files {
1529    my $self = shift (@_);
1530
1531    return $self->{'source_assoc_files'};
1532 
1533
1534}
1535sub metadata_file {
1536    my $self = shift (@_);
1537    my ($real_filename, $filename) = @_;
1538   
1539    push (@{$self->{'metadata_files'}},
1540      [$real_filename, $filename]);
1541}
1542
1543# used for writing out the archiveinf-doc info database, to list all the metadata files
1544sub get_meta_files {
1545    my $self = shift (@_);
1546
1547    return $self->{'metadata_files'};
1548}
1549
1550sub delete_section_assoc_files {
1551    my $self = shift (@_);
1552    my ($section) = @_;
1553
1554    my $i=0;
1555    while ($i < scalar (@{$self->{'associated_files'}})) {
1556    if (defined $self->{'associated_files'}->[$i]->[3] &&
1557        $self->{'associated_files'}->[$i]->[3] eq $section) {
1558        splice (@{$self->{'associated_files'}}, $i, 1);
1559    } else {
1560        $i++;
1561    }
1562    }
1563}
1564
1565sub delete_assoc_file {
1566    my $self = shift (@_);
1567    my ($assoc_filename) = @_;
1568
1569    my $i=0;
1570    while ($i < scalar (@{$self->{'associated_files'}})) {
1571    if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1572        splice (@{$self->{'associated_files'}}, $i, 1);
1573    } else {
1574        $i++;
1575    }
1576    }
1577}
1578
1579sub reset_nextsection_ptr {
1580    my $self = shift (@_);
1581    my ($section) = @_;
1582   
1583    my $section_ptr = $self->_lookup_section($section);
1584    $section_ptr->{'next_subsection'} = 1;
1585}
1586
15871;
Note: See TracBrowser for help on using the browser.