root/main/trunk/greenstone2/perllib/doc.pm @ 33313

Revision 33313, 50.1 KB (checked in by ak19, 3 months ago)

Minor. Changes to comment

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr   te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32    die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33    unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use FileUtils;
40use ghtml;
41use File::stat;
42##use hashdoc;
43use docprint;
44use JSON;
45
46# We just need pi from the Trig lib
47# Import constants pi2, pip2, pip4 (2*pi, pi/2, pi/4).
48use Math::Trig ':pi';
49
50# the document type may be indexed_doc, nonindexed_doc, or
51# classification
52
53our $OIDcount = 0;
54
55
56# Declare the Package (level) Variable that will tell us whether we're in the import or buildcol phase.
57# This var will be set by import.pl, buildcol.pl and export.pl to the then appropriate value.
58# https://perlmaven.com/package-variables-and-lexical-variables-in-perl
59# But: https://stackoverflow.com/questions/7542120/perl-how-to-make-variables-from-requiring-script-available-in-required-script
60# https://stackoverflow.com/questions/33230015/how-do-i-use-a-variable-in-another-perl-script
61#$doc::cmd_line_mode = undef;
62# Decided to follow the usage of $OIDcount above to declare the necessary package-level variables 'globally' accessible
63# actually accessible only for those packages that import this one (with 'use doc')
64my $cmd_line_mode = undef;
65
66# processor_mode keeps track of which buildcol pass we're at: dummy, text (sidx/didx passes) or infodb
67my $processor_mode = undef;
68
69# rename_method can be 'url', 'none', 'base64'
70sub new {
71    my $class = shift (@_);
72    my ($source_filename, $doc_type, $rename_method) = @_;
73
74
75    my $self = bless {'associated_files'=>[],
76              'subsection_order'=>[],
77              'next_subsection'=>1,
78              'subsections'=>{},
79              'metadata'=>[],
80              'text'=>"",
81              'OIDtype'=>"hash"}, $class;
82
83    # used to set lastmodified here, but this can screw up the HASH ids, so
84    # the docsave processor now calls set_lastmodified
85
86    $self->set_source_path($source_filename);
87   
88    if (defined $source_filename) {
89    $source_filename = &util::filename_within_collection($source_filename);
90    print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
91    $self->set_source_filename ($source_filename, $rename_method);
92    }
93
94    $self->set_doc_type ($doc_type) if defined $doc_type;
95
96    return $self;
97}
98
99
100sub set_source_path
101{
102    my $self = shift @_;
103    my ($source_filename) = @_;
104
105    if (defined $source_filename) {
106    # On Windows the source_filename can be in terse DOS format
107    # e.g. test~1.txt
108
109    $self->{'terse_source_path'} = $source_filename;
110
111        # Use the FileUtil library methods as they are aware of more special
112        # cases such as HDFS [jmt12]
113    if (&FileUtils::fileExists($source_filename))
114        {
115        # See if we can do better for Windows with a filename
116        if ($ENV{'GSDLOS'} =~ /^windows$/i) {
117        require Win32;
118        $self->{'source_path'} = Win32::GetLongPathName($source_filename);
119        }
120        else {
121        # For Unix-based systems, there is no difference between the two
122        $self->{'source_path'} = $source_filename;
123        }
124    }
125    else {
126        print STDERR "Warning: In doc::set_source_path(), file\n";
127        print STDERR "           $source_filename\n";
128        print STDERR "         does not exist\n";
129       
130        # (default) Set it to whatever we were given
131        $self->{'source_path'} = $source_filename;
132    }   
133    }
134    else {
135    # Previous code for setting source_path allowed for
136    # it to be undefined, so continue this practice
137    $self->{'terse_source_path'} = undef;
138    $self->{'source_path'} = undef;
139    }
140}
141
142
143sub get_source_path
144{
145    my $self = shift @_;
146
147    return $self->{'terse_source_path'};
148}
149
150# set lastmodified for OAI purposes, added by GRB, moved by kjdon
151sub set_oailastmodified {
152    my $self = shift (@_);
153
154    my $source_path = $self->{'terse_source_path'};
155   
156    if (defined $source_path && (-e $source_path)) {
157    my $current_time = time;
158
159    my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
160        $wday, $yday, $isdst) = localtime($current_time);
161
162    my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
163
164    $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
165    $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
166    }
167}
168
169# no longer used for OAI purposes, since lastmodified is not what we want as the
170# Datestamp of a document. This doc metadata may be useful for general purposes.
171sub set_lastmodified {
172    my $self = shift (@_);
173
174    my $source_path = $self->{'terse_source_path'};
175   
176    if (defined $source_path && (-e $source_path)) {
177
178        my $file_stat = stat($source_path);
179    my $mtime = $file_stat->mtime;
180    my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
181        $wday, $yday, $isdst) = localtime($mtime);
182
183    my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
184
185    $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
186    $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
187    }
188}
189
190# clone the $self object
191sub duplicate {
192    my $self = shift (@_);
193
194    my $newobj = {};
195   
196    foreach my $k (keys %$self) {
197    $newobj->{$k} = &clone ($self->{$k});
198    }
199
200    bless $newobj, ref($self);
201    return $newobj;
202}
203
204sub clone {
205    my ($from) = @_;
206    my $type = ref ($from);
207
208    if ($type eq "HASH") {
209    my $to = {};
210    foreach my $key (keys %$from) {
211        $to->{$key} = &clone ($from->{$key});
212    }
213    return $to;
214    } elsif ($type eq "ARRAY") {
215    my $to = [];
216    foreach my $v (@$from) {
217        push (@$to, &clone ($v));
218    }
219    return $to;
220    } else {
221    return $from;
222    }
223}
224
225sub set_OIDtype {
226    my $self = shift (@_);
227    my ($type, $metadata) = @_;
228
229    if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
230    $self->{'OIDtype'} = $type;
231    } else {
232    $self->{'OIDtype'} = "hash";
233    }
234
235    if ($type =~ /^assigned$/) {
236    if (defined $metadata) {
237        $self->{'OIDmetadata'} = $metadata;
238    } else {
239        $self->{'OIDmetadata'} = "dc.Identifier";
240    }
241    }
242}
243
244# rename_method can be 'url', 'none', 'base64'
245sub set_source_filename {
246    my $self = shift (@_);
247    my ($source_filename, $rename_method) = @_;
248
249    # Since the gsdlsourcefilename element goes into the doc.xml it has
250    # to be utf8. However, it should also *represent* the source filename
251    # (in the import directory) which may not be utf8 at all.
252    # For instance, if this meta element (gsdlsourcefilename) will be used
253    # by other applications that parse doc.xml in order to locate
254    # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
255    # encode the real filename as this is a binary-to-text encoding meaning
256    # that the resulting string is ASCII (utf8). Decoding will give the original.
257   
258#    print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
259
260    # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
261    $source_filename = $self->encode_filename($source_filename, $rename_method);
262#    my ($srcfilename,$dirname,$suffix)
263#   = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
264#    print STDERR "-> $srcfilename -> ";
265#    $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
266#    $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
267#    print STDERR "$source_filename\n";
268   
269    $self->set_utf8_metadata_element ($self->get_top_section(),
270                 "gsdlsourcefilename",
271                 $source_filename);
272}
273
274sub encode_filename {
275    my $self = shift (@_);
276    my ($source_filename, $rename_method) = @_;
277
278     my ($srcfilename,$dirname,$suffix)
279    = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
280#    print STDERR "-> $srcfilename -> ";
281    $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
282    $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
283
284    return $source_filename;
285}
286
287sub set_converted_filename {
288    my $self = shift (@_);
289    my ($converted_filename) = @_;
290
291    # we know the converted filename is utf8
292    $self->set_utf8_metadata_element ($self->get_top_section(),
293                 "gsdlconvertedfilename",
294                 $converted_filename);
295}
296
297# returns the source_filename as it was provided
298sub get_unmodified_source_filename {
299    my $self = shift (@_);
300
301    return $self->{'terse_source_path'};
302}
303
304# returns the source_filename with whatever rename_method was given
305sub get_source_filename {
306    my $self = shift (@_);
307
308    return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
309}
310
311
312
313# returns converted filename if available else returns source filename
314sub get_filename_for_hashing {
315    my $self = shift (@_);
316
317    my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
318
319    if (!defined $filename) {
320    my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
321    # if NULPlug processed file, then don't give a filename
322    if (defined $plugin_name && $plugin_name eq "NULPlug") {
323        $filename = undef;
324    } else { # returns the URL encoded source filename!
325        $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
326    }
327    }
328
329    if (!&FileUtils::isFilenameAbsolute($filename)) {
330    $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
331    }
332
333    return $filename;
334}
335
336sub set_doc_type {
337    my $self = shift (@_);
338    my ($doc_type) = @_;
339
340    $self->set_metadata_element ($self->get_top_section(),
341                 "gsdldoctype",
342                 $doc_type);
343}
344
345# returns the gsdldoctype as it was provided
346# the default of "indexed_doc" is used if no document
347# type was provided
348sub get_doc_type {
349    my $self = shift (@_);
350
351    my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
352    return $doc_type if (defined $doc_type);
353    return "indexed_doc";
354}
355
356
357# look up the reference to the a particular section
358sub _lookup_section {
359    my $self = shift (@_);
360    my ($section) = @_;
361
362    my ($num);
363    my $sectionref = $self;
364
365    while (defined $section && $section ne "") {
366   
367    ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
368   
369    $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
370   
371    $section = "" unless defined $section;
372   
373
374    if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
375        $sectionref = $sectionref->{'subsections'}->{$num};
376    } else {
377        return undef;
378    }
379    }
380   
381    return $sectionref;
382}
383
384# calculate OID by hashing the contents of the document
385sub _calc_OID {
386    my $self = shift (@_);
387    my ($filename) = @_;
388
389
390    my $osexe = &util::get_os_exe();
391
392    my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
393                       $ENV{'GSDLOS'},"hashfile$osexe");
394
395    &util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
396    # The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
397
398    # A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
399    # (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
400    # E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
401
402    my $result = "NULL";
403
404   
405    if (-e "$hashfile_exe") {
406#   $result = `\"$hashfile_exe\" \"$filename\"`;
407#   $result = `hashfile$osexe \"$filename\" 2>&1`;
408    $result = `hashfile$osexe \"$filename\"`;
409
410    ($result) = $result =~ /:\s*([0-9a-f]+)/i;
411    } else {
412    print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
413    }
414    return "HASH$result";
415}
416
417# methods dealing with OID, not groups of them.
418
419# if $OID is not provided one is calculated
420sub set_OID {
421    my $self = shift (@_);
422    my ($OID) = @_;
423   
424    my $use_hash_oid = 0;
425    # if an OID wasn't provided calculate one
426    if (!defined $OID) {
427    $OID = "NULL";
428    if ($self->{'OIDtype'} =~ /^hash/) {
429        $use_hash_oid = 1;
430    } elsif ($self->{'OIDtype'} eq "incremental") {
431        $OID = "D" . $OIDcount;
432        $OIDcount ++;
433    } elsif ($self->{'OIDtype'} eq "filename") {
434        my $filename = $self->get_source_filename();
435        $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
436        $OID = &util::tidy_up_oid($OID);
437    } elsif ($self->{'OIDtype'} eq "full_filename") {
438        my $source_filename = $self->get_source_filename();
439        my $dirsep = &util::get_os_dirsep();
440
441        $source_filename =~ s/^import$dirsep//;
442        $source_filename =~ s/$dirsep/-/g;
443        $source_filename =~ s/\./_/g;
444
445        $OID = $source_filename;
446        $OID = &util::tidy_up_oid($OID);
447    } elsif ($self->{'OIDtype'} eq "dirname") {
448        my $filename = $self->get_source_filename();
449        if (defined($filename)) { # && -e $filename) {
450        # get the immediate parent directory
451        $OID = &File::Basename::dirname($filename);
452        if (defined $OID) {
453            $OID = &File::Basename::basename($OID);
454            $OID = &util::tidy_up_oid($OID);
455        } else {
456            print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
457            $use_hash_oid = 1;
458        }
459        } else {
460        print STDERR "Failed to find a filename, generating hash id\n";
461        $use_hash_oid = 1;
462        }   
463       
464    } elsif ($self->{'OIDtype'} eq "assigned") {
465        my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
466        if (defined $identifier && $identifier ne "") {
467        $OID = $identifier;
468        $OID = &util::tidy_up_oid($OID);
469        } else {
470        # need a hash id
471        print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
472        $use_hash_oid = 1;
473        }
474               
475    } else {
476        $use_hash_oid = 1;
477    }
478
479    if ($use_hash_oid) {
480        my $hash_on_file = 1;
481        my $hash_on_ga_xml = 0;
482
483        if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
484        $hash_on_file = 0;
485        $hash_on_ga_xml = 1;
486        }
487
488        if ($self->{'OIDtype'} eq "hash_on_full_filename") {
489        $hash_on_file = 0;
490        $hash_on_ga_xml = 0;
491
492        my $source_filename = $self->get_source_filename();
493        my $dirsep = &util::get_os_dirsep();
494
495        $source_filename =~ s/^import$dirsep//;
496        $source_filename =~ s/$dirsep/-/g;
497        $source_filename =~ s/\./_/g;
498       
499        # If the filename is very short then (handled naively)
500        # this can cause conjestion in the hash-values
501        # computed, leading documents sharing the same leading
502        # Hex values in the computed has.
503        #
504        # The solution taken here is to replace the name of
505        # the file name a sufficient number of times (up to
506        # the character limit defined in 'rep_limit' and
507        # make that the content that is hashed on
508
509        # *** Think twice before changing the following value
510        # as it will break backward compatability of computed
511        # document HASH values
512
513        my $rep_limit  = 256;
514        my $hash_content = undef;
515
516        if (length($source_filename)<$rep_limit) {
517            my $rep_string = "$source_filename|";
518            my $rs_len = length($rep_string);
519
520            my $clone_times = int(($rep_limit-1)/$rs_len) +1;
521           
522            $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
523        }
524        else {
525            $hash_content = $source_filename;
526        }
527
528        my $filename = &util::get_tmp_filename();
529        if (!open (OUTFILE, ">:utf8", $filename)) {
530            print STDERR "doc::set_OID could not write to $filename\n";
531        } else {
532            print OUTFILE $hash_content;
533            close (OUTFILE);
534        }
535        $OID = $self->_calc_OID ($filename);
536
537        &FileUtils::removeFiles ($filename);
538        }
539
540        if ($hash_on_file) {
541        # "hash" OID - feed file to hashfile.exe
542        my $filename = $self->get_filename_for_hashing();
543       
544        # -z: don't want to hash on the file if it is zero size
545        if (defined($filename) && -e $filename && !-z $filename) {
546            $OID = $self->_calc_OID ($filename);
547        } else {
548            $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
549        }
550        }
551
552        if ($hash_on_ga_xml) {
553        # In addition being asked to explicity calculate the has based on the GA file,
554        # can also end up coming into this block is doing 'hash_on_file' but the file
555        # itself is of zero bytes (as could be the case with 'doc.nul' file
556
557        my $filename = &util::get_tmp_filename();
558        if (!open (OUTFILE, ">:utf8", $filename)) {
559            print STDERR "doc::set_OID could not write to $filename\n";
560        } else {
561            my $doc_text = &docprint::get_section_xml($self);
562            print OUTFILE $doc_text;
563            close (OUTFILE);
564        }
565        $OID = $self->_calc_OID ($filename);
566        &FileUtils::removeFiles($filename);
567        }
568    }
569    }
570    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
571}
572
573# this uses hashdoc (embedded c thingy) which is faster but still
574# needs a little work to be sufficiently stable
575sub ___set_OID {
576    my $self = shift (@_);
577    my ($OID) = @_;
578
579    # if an OID wasn't provided then calculate hash value based on document
580    if (!defined $OID)
581    {
582    my $hash_text = &docprint::get_section_xml($self);
583    my $hash_len = length($hash_text);
584
585        $OID = &hashdoc::buffer($hash_text,$hash_len);
586    }
587
588    $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
589}
590
591# returns the OID for this document
592sub get_OID {
593    my $self = shift (@_);
594    my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
595    return $OID if (defined $OID);
596    return "NULL";
597}
598
599sub delete_OID {
600    my $self = shift (@_);
601   
602    $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
603}
604
605
606# methods for manipulating section names
607
608# returns the name of the top-most section (the top
609# level of the document
610sub get_top_section {
611    my $self = shift (@_);
612   
613    return "";
614}
615
616# returns a section
617sub get_parent_section {
618    my $self = shift (@_);
619    my ($section) = @_;
620
621    $section =~ s/(^|\.)\d+$//;
622
623    return $section;
624}
625
626# returns the first child section (or the end child
627# if there isn't any)
628sub get_begin_child {
629    my $self = shift (@_);
630    my ($section) = @_;
631
632    my $section_ptr = $self->_lookup_section($section);
633    return "" unless defined $section_ptr;
634
635    if (defined $section_ptr->{'subsection_order'}->[0]) {
636    return "$section.$section_ptr->{'subsection_order'}->[0]";
637    }
638
639    return $self->get_end_child ($section);
640}
641
642# returns the next child of a parent section
643sub get_next_child {
644    my $self = shift (@_);
645    my ($section) = @_;
646   
647    my $parent_section = $self->get_parent_section($section);
648    my $parent_section_ptr = $self->_lookup_section($parent_section);
649    return undef unless defined $parent_section_ptr;
650
651    my ($section_num) = $section =~ /(\d+)$/;
652    return undef unless defined $section_num;
653
654    my $i = 0;
655    my $section_order = $parent_section_ptr->{'subsection_order'};
656    while ($i < scalar(@$section_order)) {
657    last if $section_order->[$i] eq $section_num;
658    $i++;
659    }
660
661    $i++; # the next child
662    if ($i < scalar(@$section_order)) {
663    return $section_order->[$i] if $parent_section eq "";
664    return "$parent_section.$section_order->[$i]";
665    }
666
667    # no more sections in this level
668    return undef;
669}
670
671# returns a reference to a list of children
672sub get_children {
673    my $self = shift (@_);
674    my ($section) = @_;
675
676    my $section_ptr = $self->_lookup_section($section);
677    return [] unless defined $section_ptr;
678
679    my @children = @{$section_ptr->{'subsection_order'}};
680
681    map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
682    return \@children;
683}
684
685# returns the child section one past the last one (which
686# is coded as "0")
687sub get_end_child {
688    my $self = shift (@_);
689    my ($section) = @_;
690
691    return $section . ".0" unless $section eq "";
692    return "0";
693}
694
695# returns the next section in book order
696sub get_next_section {
697    my $self = shift (@_);
698    my ($section) = @_;
699
700    return undef unless defined $section;
701
702    my $section_ptr = $self->_lookup_section($section);
703    return undef unless defined $section_ptr;
704
705    # first try to find first child
706    if (defined $section_ptr->{'subsection_order'}->[0]) {
707    return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
708    return "$section.$section_ptr->{'subsection_order'}->[0]";
709    }
710
711    do {
712    # try to find sibling
713    my $next_child = $self->get_next_child ($section);
714    return $next_child if (defined $next_child);
715
716    # move up one level
717    $section = $self->get_parent_section ($section);
718    } while $section =~ /\d/;
719
720    return undef;
721}
722
723sub is_leaf_section {
724    my $self = shift (@_);
725    my ($section) = @_;
726
727    my $section_ptr = $self->_lookup_section($section);
728    return 1 unless defined $section_ptr;
729
730    return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
731}
732
733# methods for dealing with sections
734
735# returns the name of the inserted section
736sub insert_section {
737    my $self = shift (@_);
738    my ($before_section) = @_;
739
740    # get the child to insert before and its parent section
741    my $parent_section = "";
742    my $before_child = "0";
743    my @before_section = split (/\./, $before_section);
744    if (scalar(@before_section) > 0) {
745    $before_child = pop (@before_section);
746    $parent_section = join (".", @before_section);
747    }
748
749    my $parent_section_ptr = $self->_lookup_section($parent_section);
750    if (!defined $parent_section_ptr) {
751    print STDERR "doc::insert_section couldn't find parent section " .
752        "$parent_section\n";
753    return;
754    }
755
756    # get the next section number
757    my $section_num = $parent_section_ptr->{'next_subsection'}++;
758
759    my $i = 0;
760    while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
761       $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
762    $i++;
763    }
764   
765    # insert the section number into the order list
766    splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
767
768    # add this section to the parent section
769    my $section_ptr = {'subsection_order'=>[],
770               'next_subsection'=>1,
771               'subsections'=>{},
772               'metadata'=>[],
773               'text'=>""};
774    $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
775
776    # work out the full section number
777    my $section = $parent_section;
778    $section .= "." unless $section eq "";
779    $section .= $section_num;
780   
781    return $section;
782}
783
784# creates a pre-named section
785sub create_named_section {
786    my $self = shift (@_);
787    my ($mastersection) = @_;
788
789    my ($num);
790    my $section = $mastersection;
791    my $sectionref = $self;
792
793    while ($section ne "") {
794    ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
795    $num =~ s/^0+(\d)/$1/; # remove leading 0s
796    $section = "" unless defined $section;
797   
798    if (defined $num) {
799        if (!defined $sectionref->{'subsections'}->{$num}) {
800        push (@{$sectionref->{'subsection_order'}}, $num);
801        $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
802                            'next_subsection'=>1,
803                            'subsections'=>{},
804                            'metadata'=>[],
805                            'text'=>""};
806        if ($num >= $sectionref->{'next_subsection'}) {
807            $sectionref->{'next_subsection'} = $num + 1;
808        }
809        }
810        $sectionref = $sectionref->{'subsections'}->{$num};
811
812    } else {
813        print STDERR "doc::create_named_section couldn't create section ";
814        print STDERR "$mastersection\n";
815        last;
816    }
817    }
818}
819
820# returns a reference to a list of subsections
821sub list_subsections {
822    my $self = shift (@_);
823    my ($section) = @_;
824
825    my $section_ptr = $self->_lookup_section ($section);
826    if (!defined $section_ptr) {
827    print STDERR "doc::list_subsections couldn't find section $section\n";
828    return [];
829    }
830
831    return [@{$section_ptr->{'subsection_order'}}];
832}
833
834sub delete_section {
835    my $self = shift (@_);
836    my ($section) = @_;
837
838#    my $section_ptr = {'subsection_order'=>[],
839#              'next_subsection'=>1,
840#              'subsections'=>{},
841#              'metadata'=>[],
842#              'text'=>""};
843
844    # if this is the top section reset everything
845    if ($section eq "") {
846    $self->{'subsection_order'} = [];
847    $self->{'subsections'} = {};
848    $self->{'metadata'} = [];
849    $self->{'text'} = "";
850    return;
851    }
852
853    # find the parent of the section to delete
854    my $parent_section = "";
855    my $child = "0";
856    my @section = split (/\./, $section);
857    if (scalar(@section) > 0) {
858    $child = pop (@section);
859    $parent_section = join (".", @section);
860    }
861
862    my $parent_section_ptr = $self->_lookup_section($parent_section);
863    if (!defined $parent_section_ptr) {
864    print STDERR "doc::delete_section couldn't find parent section " .
865        "$parent_section\n";
866    return;
867    }
868
869    # remove this section from the subsection_order list
870    my $i = 0;
871    while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
872    if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
873        splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
874        last;
875    }
876    $i++;
877    }
878
879    # remove this section from the subsection hash
880    if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
881    undef $parent_section_ptr->{'subsections'}->{$child};
882    }
883}
884
885#--
886# methods for dealing with metadata
887
888# set_metadata_element and get_metadata_element are for metadata
889# which should only have one value. add_meta_data and get_metadata
890# are for metadata which can have more than one value.
891
892# returns the first metadata value which matches field
893
894# This version of get metadata element works much like the one above,
895# except it allows for the namespace portion of a metadata element to
896# be ignored, thus if you are searching for dc.Title, the first piece
897# of matching metadata ending with the name Title (once any namespace
898# is removed) would be returned.
899# 28-11-2003 John Thompson
900sub get_metadata_element {
901    my $self = shift (@_);
902    my ($section, $field, $ignore_namespace) = @_;
903    my ($data);
904
905    $ignore_namespace = 0 unless defined $ignore_namespace;
906
907    my $section_ptr = $self->_lookup_section($section);
908    if (!defined $section_ptr) {
909    print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
910    return;
911    }
912
913    # Remove any namespace if we are being told to ignore them
914    if($ignore_namespace) {
915    $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
916    }
917
918    foreach $data (@{$section_ptr->{'metadata'}}) {
919
920    my $data_name = $data->[0];
921
922    # Remove any namespace if we are being told to ignore them
923    if($ignore_namespace) {
924        $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
925    }
926    # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
927    $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
928   
929    return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
930    }
931   
932    return undef; # was not found
933}
934
935# returns a list of the form [value1, value2, ...]
936sub get_metadata {
937    my $self = shift (@_);
938    my ($section, $field, $ignore_namespace) = @_;
939    my ($data);
940
941    $ignore_namespace = 0 unless defined $ignore_namespace;
942
943    my $section_ptr = $self->_lookup_section($section);
944    if (!defined $section_ptr) {
945        print STDERR "doc::get_metadata couldn't find section ",
946        $section, "\n";
947        return;
948    }
949
950    # Remove any namespace if we are being told to ignore them
951    if($ignore_namespace) {
952    $field =~ s/^.*\.//;
953    }
954
955    my @metadata = ();
956    foreach $data (@{$section_ptr->{'metadata'}}) {
957
958    my $data_name = $data->[0];
959
960    # Remove any namespace if we are being told to ignore them
961    if($ignore_namespace) {
962        $data_name =~ s/^.*\.//;
963    }   
964    # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
965    $data_name =~ s/^ex\.([^.]+)$/$1/;
966
967        push (@metadata, $data->[1]) if ($data_name eq $field);
968    }
969
970    return \@metadata;
971}
972
973sub get_metadata_hashmap {
974    my $self = shift (@_);
975    my ($section, $opt_namespace) = @_;
976   
977    my $section_ptr = $self->_lookup_section($section);
978    if (!defined $section_ptr) {
979            print STDERR "doc::get_metadata couldn't find section ",
980            $section, "\n";
981            return;
982        }
983
984    my $metadata_hashmap = {};
985    foreach my $data (@{$section_ptr->{'metadata'}}) {
986            my $metaname = $data->[0];
987         
988            if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
989                if (!defined $metadata_hashmap->{$metaname}) {
990                    $metadata_hashmap->{$metaname} = [];
991                  }
992                my $metaval_list = $metadata_hashmap->{$metaname};
993                push(@$metaval_list, $data->[1]);
994              }
995          }
996   
997    return $metadata_hashmap;
998}
999
1000# returns a list of the form [[field,value],[field,value],...]
1001sub get_all_metadata {
1002    my $self = shift (@_);
1003    my ($section) = @_;
1004
1005    my $section_ptr = $self->_lookup_section($section);
1006    if (!defined $section_ptr) {
1007    print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
1008    return;
1009    }
1010   
1011    return $section_ptr->{'metadata'};
1012}
1013
1014# $value is optional
1015sub delete_metadata {
1016    my $self = shift (@_);
1017    my ($section, $field, $value) = @_;
1018
1019    my $section_ptr = $self->_lookup_section($section);
1020    if (!defined $section_ptr) {
1021    print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
1022    return;
1023    }
1024
1025    my $i = 0;
1026    while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1027    if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1028        (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1029        splice (@{$section_ptr->{'metadata'}}, $i, 1);
1030    } else {
1031        $i++;
1032    }
1033    }
1034}
1035
1036sub delete_all_metadata {
1037    my $self = shift (@_);
1038    my ($section) = @_;
1039
1040    my $section_ptr = $self->_lookup_section($section);
1041    if (!defined $section_ptr) {
1042    print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1043    return;
1044    }
1045   
1046    $section_ptr->{'metadata'} = [];
1047}
1048
1049sub set_metadata_element {
1050    my $self = shift (@_);
1051    my ($section, $field, $value) = @_;
1052
1053    $self->set_utf8_metadata_element ($section, $field,
1054                      &unicode::ascii2utf8(\$value));
1055}
1056
1057# set_utf8_metadata_element assumes the text has already been
1058# converted to the UTF-8 encoding.
1059sub set_utf8_metadata_element {
1060    my $self = shift (@_);
1061    my ($section, $field, $value) = @_;
1062   
1063    $self->delete_metadata ($section, $field);
1064    $self->add_utf8_metadata ($section, $field, $value);
1065}
1066
1067
1068# add_metadata assumes the text is in (extended) ascii form. For
1069# text which has already been converted to the UTF-8 format use
1070# add_utf8_metadata.
1071sub add_metadata {
1072    my $self = shift (@_);
1073    my ($section, $field, $value) = @_;
1074   
1075    $self->add_utf8_metadata ($section, $field,
1076                  &unicode::ascii2utf8(\$value));
1077}
1078
1079sub add_utf8_metadata {
1080    my $self = shift (@_);
1081    my ($section, $field, $value) = @_;
1082   
1083    #    my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1084    #    my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1085    #    print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1086
1087    my $section_ptr = $self->_lookup_section($section);
1088    if (!defined $section_ptr) {
1089    print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1090    return;
1091    }
1092    if (!defined $value) {
1093    print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1094    return;
1095    }
1096    if (!defined $field) {
1097    print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1098    return;
1099    }
1100   
1101    #print STDERR "###$field=$value\n";
1102
1103    # For now, supress this check.  Given that text data read in is now
1104    # Unicode aware, then the following block of code can (ironically enough)
1105    # cause our unicode compliant string to be re-encoded (leading to
1106    # a double-encoded UTF-8 string, which we definitely don't want!).
1107   
1108
1109    # double check that the value is utf-8
1110    #    if (!&unicode::check_is_utf8($value)) {
1111    #   print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1112    #   &unicode::ensure_utf8(\$value);
1113    #   print STDERR " Tried converting to utf8: $value\n";
1114    #    }
1115
1116    #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1117    if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1118    {
1119            my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1120        if (defined $mdprefix) {
1121            # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1122            push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1123        }
1124
1125        my $direction;
1126        if($value =~ m/^-/)
1127        {
1128            $direction = ($field eq "Latitude") ? "S" : "W";
1129        }
1130        else
1131        {
1132            $direction = ($field eq "Latitude") ? "N" : "E";
1133        }
1134       
1135        my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1136        if(defined $beforeDec && defined $afterDec)
1137        {
1138            my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1139            push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1140           
1141            for(my $i = 2; $i <= 4; $i++)
1142            {
1143                if(length($afterDec) >= $i)
1144                {
1145                    push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1146                }
1147            }
1148           
1149            #Only add the metadata if it has not already been added
1150            my $metaMap = $self->get_metadata_hashmap($section); # metaMap not used and called function has no apparent side-effects,
1151                                                                 # but this line appears important for ensuring uniqueness of (Latitude, value) meta
1152                                                                 # in the section. Also for LatShort, Longitude, LngShort.
1153        }
1154       
1155        if($field =~ m/^(.+\.)?Longitude$/) {
1156            # if we are dealing with Longitude meta, we should 1. have Latitude meta too; 2. already have processed Latitude meta
1157            # in that case, add both Lat and Lng of this section as a Coordinate meta
1158            my $latitude = $self->get_metadata_element ($section, "Latitude");
1159            my $longitude = $value;
1160            # TODO: would like all Longitude meta placed together, followed by all Coordinate meta,
1161            # but the following will add all such coord meta to the doc object and the end of this function will add Longitude meta
1162            $self->processCoordinate($section, $latitude, $longitude);
1163        }
1164    }
1165   
1166    elsif($field eq "GPS.mapOverlay") { # then the $value is a JSON string
1167
1168        # In order to allow searching map-data-enriched documents by map shape descriptions,
1169        # and to run rawquery searches for other docs by proximity based on their map data,
1170        # need to store the shape descriptions and Coordinate info for shapes into the text index.
1171        # We add the description for each shape in the mapoverlay into the text index as GPSMapOverlayLabel
1172        # And we add Coordinate (CD) and CoordShort (CS) info for each shape in the mapoverlay in the format (Lat, Lng) as ("37S339 175E342")
1173        # where the digits before the N/S/E/W direction represents the whole number. And the digits after the direction are the
1174        # decimal places which can vary between 0 and 2 to 4 digits.
1175       
1176        # However, we only want to process GPS.mapOverlay during buildcol and only in the text indexing passes (e.g. sidx and didx for lucene)
1177        # and certainly never during the infodb pass of buildcol. The latter can end up duplicating Coordinate/CoordShort/GPSMapOverlayLabel
1178        # when rebuilding with the online doc editor, as that runs incremental-rebuild which then calls basebuilder::reconstruct_doc_objs_metadata()
1179        # on all docs NOT being incrementally rebuilt. That call would get meta from the infodb and use it to reconstruct the doc objects of docs
1180        # NOT being incrementally built. If the Coord and Label meta were written to the infodb, they would then be loaded back in memory when the
1181        # collection is incrementally rebuilt for those docs that don't need incremental processing. Then this function would once again add the
1182        # same meta into the infodb, thus duplicating what goes into the infodb. Hence, don't do all the following if doc::processor_mode =~ "infodb",
1183        # or anything other than a text mode.
1184
1185        # Note that for incremental rebuilding, the text pass can be called textreindex for instance (and infodb pass can be incinfodb).
1186        # So don't check for exact string match
1187
1188        if($doc::cmd_line_mode eq "buildcol" && $doc::processor_mode =~ m/^text/) { # currently known text processor_modes:
1189                                                                                    # text, textreindex, possibly textdelete (see ArchivesInfPlugin.pm::read() for last 2).
1190                        # OR: ..."buildcol" && $doc::processor_mode !~ m/infodb$/) # if dummy pass important
1191       
1192        ###print STDERR "GPS.mapOverlay before decoding, val = " . $value . "\n";
1193       
1194        # TODO: html decode?
1195        $value =~ s@&#091;@[@g;
1196        $value =~ s@&#093;@]@g;
1197        $value =~ s@&quot;@"@g;
1198        ###print STDERR "GPS.mapOverlay after decoding, val = " . $value . "\n";
1199
1200        my $json_array = decode_json $value;
1201       
1202        foreach my $shape (@$json_array) {
1203
1204            # Put each available shape description/label into this section's metadata with GPSMapOverlayLabel as metaname.
1205            # Just as for Coordinate meta, don't need to know which shape a label belongs too. This is just so each label
1206            # will be indexed, and therefore can be searched.
1207           
1208            my $description = $shape->{"description"};
1209            if($description) {
1210            push (@{$section_ptr->{'metadata'}}, ["GPSMapOverlayLabel", $description]);
1211            ###print STDERR "@@@@############################################ Just added description meta: " . $description . "\n";
1212            }
1213           
1214            my $type = $shape->{"type"};
1215            ###print STDERR "Shape type : " . $type . "\n";
1216       
1217            if($type eq "circle") {
1218                ###print STDERR "Found a circle:\n" . &printShape($json, $shape);
1219       
1220                # work out bounding box
1221                # want the inverse of this useful page:
1222                # https://stackoverflow.com/questions/639695/how-to-convert-latitude-or-longitude-to-meters
1223                # https://www.geodatasource.com/developers/javascript               
1224               
1225                # Dr Bainbridge wants us to use: https://gis.stackexchange.com/questions/5821/calculating-latitude-longitude-x-miles-from-point
1226                # Since for N,E,S,W the following works out the same as above link, we're using the rule of thumb in the following:
1227                # https://gis.stackexchange.com/questions/2951/algorithm-for-offsetting-a-latitude-longitude-by-some-amount-of-meters
1228                # which states
1229                # "If your displacements aren't too great (less than a few kilometers) and you're not right at the poles,
1230                # use the quick and dirty estimate that 111,111 meters (111.111 km) in the y direction is 1 degree
1231                # (of latitude) and 111,111 * cos(latitude) meters in the x direction is 1 degree (of longitude)."
1232                my $centre_lat = $shape->{"center"}->{"lat"};
1233                my $centre_lng = $shape->{"center"}->{"lng"};
1234                my $radius = $shape->{"radius"}; # in metres!
1235
1236                ###print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
1237
1238                my $lat_north = $centre_lat + ($radius/111111);
1239                my $lat_south = $centre_lat - ($radius/111111);
1240               
1241                ###print STDERR "### lat_north:  $lat_north\n";
1242                ###print STDERR "### lat_south:  $lat_south\n";
1243
1244                # our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
1245                # all expect their angle to be in radians. So need to convert from degree to radians before we can take the cos of it.
1246                my $centre_lat_radians = $self->degreesToRadians($centre_lat);
1247                my $cos_in_radians = cos($centre_lat_radians);             
1248                ###print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
1249                my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
1250                my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
1251                ###print STDERR "### lng_east  $lng_east\n";
1252                ###print STDERR "### lng_west  $lng_west\n";
1253
1254                my $cos_lat = cos($centre_lat);             
1255                ###print STDERR "cos $centre_lat is $cos_lat\n";
1256
1257                $self->processCoordinate($section, $lat_north, $lng_east);
1258                $self->processCoordinate($section, $lat_south, $lng_east);
1259                $self->processCoordinate($section, $lat_south, $lng_west);
1260                $self->processCoordinate($section, $lat_north, $lng_west);
1261           
1262            }
1263            elsif ($type eq "marker") {
1264                ###print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
1265                ###print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
1266                $self->processCoordinate($section, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});               
1267            }
1268            elsif ($type eq "polyline" || $type eq "polygon") {
1269                my $path_array = $shape->{"path"};
1270                foreach my $position (@$path_array) {                                   
1271                    $self->processCoordinate($section, $position->{"lat"}, $position->{"lng"});
1272                }
1273            }
1274            elsif ($type eq "rectangle") {
1275       
1276                my $bounds = $shape->{"bounds"};
1277                $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"east"});
1278                $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"east"});
1279                $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"west"});
1280                $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"west"});
1281            }   
1282       
1283        } # end for on each shape in GPS.mapOverlay
1284        } # end if(buildcol and text pass)
1285    } # end GPS.mapOverlay meta
1286
1287    push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1288}
1289
1290# https://en.wikipedia.org/wiki/Radian
1291sub degreesToRadians
1292{
1293    my $self = shift (@_);
1294    my ($degrees) = @_;
1295
1296    return $degrees * pi /180; # returns radians
1297}
1298
1299sub radiansToDegrees
1300{
1301    my $self = shift (@_);
1302    my ($radians) = @_;
1303
1304    return $radians * 180 / pi; # returns degrees
1305}
1306
1307# Call as:
1308# my $json = JSON->new->allow_nonref;
1309# &printAllShapes($json, $json_array);
1310sub printAllShapes {
1311    my ($json, $json_array) = @_;
1312   
1313   
1314    #my $pretty_print_shape = $json->pretty->encode( $json_array->[0] );
1315    foreach my $shape (@$json_array) {
1316        my $pretty_print_shape = $json->pretty->encode( $shape );
1317        print STDERR "Shape: $pretty_print_shape\n";
1318    }
1319   
1320}
1321
1322# For the (lat, lng) coordinate given,
1323# attaches Coordinate and multiple CoordShort (different precision level) metadata to the doc object
1324sub processCoordinate {
1325    my $self = shift (@_);
1326    my ($section, $latitude, $longitude) = @_;
1327
1328    my $section_ptr = $self->_lookup_section($section);
1329
1330    my $lat_direction = ($latitude =~ m/^-/) ? "S" : "N";
1331    my $lng_direction = ($longitude =~ m/^-/) ? "W" : "E";
1332   
1333    # have to store (lat, lng) in pairs, when there are so many coords to store
1334    #push (@{$section_ptr->{'metadata'}}, ["Latitude", $latitude]);
1335    #push (@{$section_ptr->{'metadata'}}, ["Longitude", $longitude]);
1336
1337    push (@{$section_ptr->{'metadata'}}, ["Coordinate", "$latitude $longitude"]); # "$latitude$lat_direction $longitude$lng_direction"
1338
1339    my ($latBeforeDec, $latAfterDec);
1340    my ($lngBeforeDec, $lngAfterDec);
1341
1342    if($latitude !~ m/\./) {
1343        $latBeforeDec = $latitude;
1344        $latAfterDec = "";
1345    } else {
1346        ($latBeforeDec, $latAfterDec) = ($latitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1347    }
1348    if($longitude !~ m/\./) {
1349        $lngBeforeDec = $longitude;
1350        $lngAfterDec = "";
1351    } else {
1352        ($lngBeforeDec, $lngAfterDec) = ($longitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1353    }   
1354   
1355
1356    my $name = "CoordShort";
1357    push (@{$section_ptr->{'metadata'}}, [$name, "$latBeforeDec$lat_direction $lngBeforeDec$lng_direction"]);
1358       
1359    for(my $i = 2; $i <= 4; $i++)
1360    {
1361        my $latDecPlaces = (length($latAfterDec) >= $i) ? substr($latAfterDec, 0, $i) : "";         
1362        my $lngDecPlaces = (length($lngAfterDec) >= $i) ? substr($lngAfterDec, 0, $i) : "";
1363
1364        push (@{$section_ptr->{'metadata'}}, [$name,
1365                $latBeforeDec . $lat_direction. $latDecPlaces . " " . $lngBeforeDec . $lng_direction. $lngDecPlaces]);
1366       
1367    }
1368       
1369    #Only add the metadata if it has not already been added
1370    #my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no apparent side-effects.)
1371}
1372
1373
1374# methods for dealing with text
1375
1376# returns the text for a section
1377sub get_text {
1378    my $self = shift (@_);
1379    my ($section) = @_;
1380
1381    my $section_ptr = $self->_lookup_section($section);
1382    if (!defined $section_ptr) {
1383    print STDERR "doc::get_text couldn't find section " .
1384        "$section\n";
1385    return "";
1386    }
1387
1388    return $section_ptr->{'text'};
1389}
1390
1391# returns the (utf-8 encoded) length of the text for a section
1392sub get_text_length {
1393    my $self = shift (@_);
1394    my ($section) = @_;
1395
1396    my $section_ptr = $self->_lookup_section($section);
1397    if (!defined $section_ptr) {
1398    print STDERR "doc::get_text_length couldn't find section " .
1399        "$section\n";
1400    return 0;
1401    }
1402
1403    return length ($section_ptr->{'text'});
1404}
1405
1406# returns the total length for all the sections
1407sub get_total_text_length {
1408    my $self = shift (@_);
1409
1410    my $section = $self->get_top_section();
1411    my $length = 0;
1412    while (defined $section) {
1413    $length += $self->get_text_length($section);
1414    $section = $self->get_next_section($section);
1415    }
1416    return $length;
1417}
1418
1419sub delete_text {
1420    my $self = shift (@_);
1421    my ($section) = @_;
1422
1423    my $section_ptr = $self->_lookup_section($section);
1424    if (!defined $section_ptr) {
1425    print STDERR "doc::delete_text couldn't find section " .
1426        "$section\n";
1427    return;
1428    }
1429
1430    $section_ptr->{'text'} = "";
1431}
1432
1433# add_text assumes the text is in (extended) ascii form. For
1434# text which has been already converted to the UTF-8 format
1435# use add_utf8_text.
1436sub add_text {
1437    my $self = shift (@_);
1438    my ($section, $text) = @_;
1439
1440    # convert the text to UTF-8 encoded unicode characters
1441    # and add the text
1442    $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1443}
1444
1445
1446# add_utf8_text assumes the text to be added has already
1447# been converted to the UTF-8 encoding. For ascii text use
1448# add_text
1449# Pass by value version (internally calls pass by ref version
1450# to avoid code duplication)
1451sub add_utf8_text {
1452    my $self = shift (@_);
1453    my ($section, $text) = @_;
1454   
1455    $self->add_utf8_textref($section, \$text);
1456}
1457
1458# Pass by reference version, used by GreenstoneSQLPlugin for fulltext
1459sub add_utf8_textref {
1460    my $self = shift (@_);
1461    my ($section, $text_ref) = @_;
1462
1463    my $section_ptr = $self->_lookup_section($section);
1464    if (!defined $section_ptr) {
1465    print STDERR "doc::add_utf8_textref couldn't find section " .
1466        "$section\n";
1467    return;
1468    }
1469
1470    $section_ptr->{'text'} .= $$text_ref;
1471}
1472
1473# returns the Source meta, which is the utf8 filename generated.
1474# Added a separate method here for convenience
1475sub get_source {
1476    my $self = shift (@_);
1477    return $self->get_metadata_element ($self->get_top_section(), "Source");
1478}
1479
1480# returns the SourceFile meta, which is the url reference to the URL-encoded
1481# version of Source (the utf8 filename). Added a separate method here for convenience
1482sub get_sourcefile {
1483    my $self = shift (@_);
1484    return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1485}
1486
1487# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1488# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1489sub get_assocfile_from_sourcefile {
1490    my $self = shift (@_);
1491   
1492    # get the SourceFile meta, which is a *URL* to a file on the filesystem
1493    my $top_section = $self->get_top_section();
1494    my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1495
1496    # get the actual filename as it exists on the filesystem which this url refers to
1497    $source_file = &unicode::url_to_filename($source_file);
1498    my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1499    return $assocfilename;
1500}
1501
1502# methods for dealing with associated files
1503
1504# a file is associated with a document, NOT a section.
1505# if section is defined it is noted in the data structure
1506# only so that files associated from a particular section
1507# may be removed later (using delete_section_assoc_files)
1508sub associate_file {
1509    my $self = shift (@_);
1510    my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1511    $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1512
1513    # remove all associated files with the same name
1514    $self->delete_assoc_file ($assoc_filename);
1515
1516    # Too harsh a requirement
1517    # Definitely get HTML docs, for example, with some missing
1518    # support files
1519#    if (!&util::fd_exists($real_filename)) {
1520#   print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1521#   exit -1;
1522#    }
1523
1524#    print STDERR "**** is the following a UTF8 rep of *real* filename?\n   $real_filename\n";
1525#    print STDERR "****##### so, ensure it is before storing?!?!?\n";
1526##    my $utf8_filename = Encode::encode("utf8",$filename);
1527
1528    push (@{$self->{'associated_files'}},
1529      [$real_filename, $assoc_filename, $mime_type, $section]);
1530}
1531
1532# returns a list of associated files in the form
1533#   [[real_filename, assoc_filename, mimetype], ...]
1534sub get_assoc_files {
1535    my $self = shift (@_);
1536
1537    return $self->{'associated_files'};
1538}
1539
1540# the following two methods used to keep track of original associated files
1541# for incremental building. eg a txt file used by an item file does not end
1542# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1543# file for incremental build
1544sub associate_source_file {
1545    my $self = shift (@_);
1546    my ($full_filename) = @_;
1547
1548    push (@{$self->{'source_assoc_files'}}, $full_filename);
1549
1550}
1551
1552sub get_source_assoc_files {
1553    my $self = shift (@_);
1554
1555    return $self->{'source_assoc_files'};
1556 
1557
1558}
1559sub metadata_file {
1560    my $self = shift (@_);
1561    my ($real_filename, $filename) = @_;
1562   
1563    push (@{$self->{'metadata_files'}},
1564      [$real_filename, $filename]);
1565}
1566
1567# used for writing out the archiveinf-doc info database, to list all the metadata files
1568sub get_meta_files {
1569    my $self = shift (@_);
1570
1571    return $self->{'metadata_files'};
1572}
1573
1574sub delete_section_assoc_files {
1575    my $self = shift (@_);
1576    my ($section) = @_;
1577
1578    my $i=0;
1579    while ($i < scalar (@{$self->{'associated_files'}})) {
1580    if (defined $self->{'associated_files'}->[$i]->[3] &&
1581        $self->{'associated_files'}->[$i]->[3] eq $section) {
1582        splice (@{$self->{'associated_files'}}, $i, 1);
1583    } else {
1584        $i++;
1585    }
1586    }
1587}
1588
1589sub delete_assoc_file {
1590    my $self = shift (@_);
1591    my ($assoc_filename) = @_;
1592
1593    my $i=0;
1594    while ($i < scalar (@{$self->{'associated_files'}})) {
1595    if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1596        splice (@{$self->{'associated_files'}}, $i, 1);
1597    } else {
1598        $i++;
1599    }
1600    }
1601}
1602
1603sub reset_nextsection_ptr {
1604    my $self = shift (@_);
1605    my ($section) = @_;
1606   
1607    my $section_ptr = $self->_lookup_section($section);
1608    $section_ptr->{'next_subsection'} = 1;
1609}
1610
16111;
Note: See TracBrowser for help on using the browser.