source: main/trunk/greenstone2/perllib/doc.pm@ 33756

Last change on this file since 33756 was 33756, checked in by ak19, 4 years ago

Attempted bugfix for ex meta not always loading in gli for docs that are in subdirs when filenames are base64 encoded. This commit only testedand works on linux for my basic tests with subdirs and without. 1. Perl now encodes all subdirs and the filename in gsdlsourcefilename (but as before, not file extension). Can't encode entire relative path starting with import in one go, as other parts of the perl code do comparisons and remove file GSDLIMPORTDIR prefixes. 2. Perl now also writes out the file rename method used, which can be none, url or base64, into doc.xml. 3. GLI now decodes each part of the gsdlsourcefilename relative path based on the file rename method. e.g. for import/subdir/filename.ext the import, subdir and filename are decoded to reconstitute the filename as it originally was, with file extension stuck back on. This has allowed GLI to finally detect the ex meta associated with a gsdlsourcefilename in cases of subdirs in import or when dealing with base64 encoded filenames. Still need to test more complex cases on linux, then windows too.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 51.8 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use FileUtils;
40use ghtml;
41use File::stat;
42##use hashdoc;
43use docprint;
44use JSON;
45
46# We just need pi from the Trig lib
47# Import constants pi2, pip2, pip4 (2*pi, pi/2, pi/4).
48# But doing
49# use Math::Trig ':pi';
50# resulted in errors on 32 bit linux VM with perl v8 of "bareword pi not allowed while strict subs in use"
51# So, now doing just
52use Math::Trig;
53
54# the document type may be indexed_doc, nonindexed_doc, or
55# classification
56
57our $OIDcount = 0;
58
59
60# Declare the Package (level) Variable that will tell us whether we're in the import or buildcol phase.
61# This var will be set by import.pl, buildcol.pl and export.pl to the then appropriate value.
62# https://perlmaven.com/package-variables-and-lexical-variables-in-perl
63# But: https://stackoverflow.com/questions/7542120/perl-how-to-make-variables-from-requiring-script-available-in-required-script
64# https://stackoverflow.com/questions/33230015/how-do-i-use-a-variable-in-another-perl-script
65#$doc::cmd_line_mode = undef;
66# Decided to follow the usage of $OIDcount above to declare the necessary package-level variables 'globally' accessible
67# actually accessible only for those packages that import this one (with 'use doc')
68my $cmd_line_mode = undef;
69
70# processor_mode keeps track of which buildcol pass we're at: dummy, text (sidx/didx passes) or infodb
71my $processor_mode = undef;
72
73# rename_method can be 'url', 'none', 'base64'
74sub new {
75 my $class = shift (@_);
76 my ($source_filename, $doc_type, $rename_method) = @_;
77
78
79 my $self = bless {'associated_files'=>[],
80 'subsection_order'=>[],
81 'next_subsection'=>1,
82 'subsections'=>{},
83 'metadata'=>[],
84 'text'=>"",
85 'OIDtype'=>"hash"}, $class;
86
87 # used to set lastmodified here, but this can screw up the HASH ids, so
88 # the docsave processor now calls set_lastmodified
89
90 $self->set_source_path($source_filename);
91
92 if (defined $source_filename) {
93 $source_filename = &util::filename_within_collection($source_filename);
94 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
95 $self->set_source_filename ($source_filename, $rename_method);
96 }
97
98 $self->set_doc_type ($doc_type) if defined $doc_type;
99
100 return $self;
101}
102
103
104sub set_source_path
105{
106 my $self = shift @_;
107 my ($source_filename) = @_;
108
109 if (defined $source_filename) {
110 # On Windows the source_filename can be in terse DOS format
111 # e.g. test~1.txt
112
113 $self->{'terse_source_path'} = $source_filename;
114
115 # Use the FileUtil library methods as they are aware of more special
116 # cases such as HDFS [jmt12]
117 if (&FileUtils::fileExists($source_filename))
118 {
119 # See if we can do better for Windows with a filename
120 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
121 require Win32;
122 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
123 }
124 else {
125 # For Unix-based systems, there is no difference between the two
126 $self->{'source_path'} = $source_filename;
127 }
128 }
129 else {
130 print STDERR "Warning: In doc::set_source_path(), file\n";
131 print STDERR " $source_filename\n";
132 print STDERR " does not exist\n";
133
134 # (default) Set it to whatever we were given
135 $self->{'source_path'} = $source_filename;
136 }
137 }
138 else {
139 # Previous code for setting source_path allowed for
140 # it to be undefined, so continue this practice
141 $self->{'terse_source_path'} = undef;
142 $self->{'source_path'} = undef;
143 }
144}
145
146
147sub get_source_path
148{
149 my $self = shift @_;
150
151 return $self->{'terse_source_path'};
152}
153
154# set lastmodified for OAI purposes, added by GRB, moved by kjdon
155sub set_oailastmodified {
156 my $self = shift (@_);
157
158 my $source_path = $self->{'terse_source_path'};
159
160 if (defined $source_path && (-e $source_path)) {
161 my $current_time = time;
162
163 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
164 $wday, $yday, $isdst) = localtime($current_time);
165
166 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
167
168 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
169 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
170 }
171}
172
173# no longer used for OAI purposes, since lastmodified is not what we want as the
174# Datestamp of a document. This doc metadata may be useful for general purposes.
175sub set_lastmodified {
176 my $self = shift (@_);
177
178 my $source_path = $self->{'terse_source_path'};
179
180 if (defined $source_path && (-e $source_path)) {
181
182 my $file_stat = stat($source_path);
183 my $mtime = $file_stat->mtime;
184 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
185 $wday, $yday, $isdst) = localtime($mtime);
186
187 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
188
189 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
190 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
191 }
192}
193
194# clone the $self object
195sub duplicate {
196 my $self = shift (@_);
197
198 my $newobj = {};
199
200 foreach my $k (keys %$self) {
201 $newobj->{$k} = &clone ($self->{$k});
202 }
203
204 bless $newobj, ref($self);
205 return $newobj;
206}
207
208sub clone {
209 my ($from) = @_;
210 my $type = ref ($from);
211
212 if ($type eq "HASH") {
213 my $to = {};
214 foreach my $key (keys %$from) {
215 $to->{$key} = &clone ($from->{$key});
216 }
217 return $to;
218 } elsif ($type eq "ARRAY") {
219 my $to = [];
220 foreach my $v (@$from) {
221 push (@$to, &clone ($v));
222 }
223 return $to;
224 } else {
225 return $from;
226 }
227}
228
229sub set_OIDtype {
230 my $self = shift (@_);
231 my ($type, $metadata) = @_;
232
233 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
234 $self->{'OIDtype'} = $type;
235 } else {
236 $self->{'OIDtype'} = "hash";
237 }
238
239 if ($type =~ /^assigned$/) {
240 if (defined $metadata) {
241 $self->{'OIDmetadata'} = $metadata;
242 } else {
243 $self->{'OIDmetadata'} = "dc.Identifier";
244 }
245 }
246}
247
248# rename_method can be 'url', 'none', 'base64'
249sub set_source_filename {
250 my $self = shift (@_);
251 my ($source_filename, $rename_method) = @_;
252
253 # Since the gsdlsourcefilename element goes into the doc.xml it has
254 # to be utf8. However, it should also *represent* the source filename
255 # (in the import directory) which may not be utf8 at all.
256 # For instance, if this meta element (gsdlsourcefilename) will be used
257 # by other applications that parse doc.xml in order to locate
258 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
259 # encode the real filename as this is a binary-to-text encoding meaning
260 # that the resulting string is ASCII (utf8). Decoding will give the original.
261
262# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
263## print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename\n";
264
265 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
266 $source_filename = $self->encode_filename($source_filename, $rename_method);
267# my ($srcfilename,$dirname,$suffix)
268# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
269# print STDERR "-> $srcfilename -> ";
270# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
271# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
272# print STDERR "$source_filename\n";
273
274## print STDERR "AFTER URL/base64 encoding gsdlsourcefilename: $source_filename\n";
275
276 $self->set_utf8_metadata_element ($self->get_top_section(),
277 "gsdlsourcefilename",
278 $source_filename);
279
280 # If we set the file renaming method, at least GLI can decode gsdlsourcefilename to get the
281 # original filename (relative path to file) back for GLI to notice what ex Meta is associated with it
282 $self->set_utf8_metadata_element($self->get_top_section(),
283 "gsdlsourcefilerenamemethod",
284 $rename_method);
285}
286
287sub encode_filename {
288 my $self = shift (@_);
289 my ($source_filename, $rename_method) = @_;
290
291 my ($srcfilename,$dirname,$suffix)
292 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
293# print STDERR "-> $srcfilename -> ";
294 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
295
296 # encode any subdirs of "import" also, but not import itself
297 my $sep = ($ENV{'GSDLOS'} =~ /^windows$/i) ? "\\" : "/";
298 my @dirs = ($sep eq "\\") ? split(/[\\\/]+/, $dirname) : split(/\//, $dirname);
299
300 my $dirpath = "";
301 foreach my $subdir (@dirs) {
302 ##print STDERR "@@@@ Found subdir: $subdir\n";
303
304 # The import folder can be called anything, including in non-ASCII encodings.
305 # Don't need to avoid encoding default import folder called "import", as it gets
306 # URL/base64 encoded to ITSELF.
307 # But can't encode (URL/base64 encode) any $ENV{'GSDLIMPORTDIR'} though if it's set,
308 # as BasePlugout::get_doc_dir() removes any $ENV{'GSDLIMPORTDIR'} prefix. Because if
309 # the $ENV{'GSDLIMPORTDIR'} part of gsdlsourcefilename is encoded here, the prefix
310 # won't match with $ENV{'GSDLIMPORTDIR'}
311
312 unless ($ENV{'GSDLIMPORTDIR'} && $subdir eq $ENV{'GSDLIMPORTDIR'}) {
313 $subdir = &util::rename_file($subdir, $rename_method);
314 }
315 $dirpath = &FileUtils::filenameConcatenate($dirpath, $subdir);
316 }
317 if($dirpath ne "") {
318 $dirname = $dirpath;
319 }
320
321 $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
322
323 return $source_filename;
324}
325
326sub set_converted_filename {
327 my $self = shift (@_);
328 my ($converted_filename) = @_;
329
330 # we know the converted filename is utf8
331 $self->set_utf8_metadata_element ($self->get_top_section(),
332 "gsdlconvertedfilename",
333 $converted_filename);
334}
335
336# returns the source_filename as it was provided
337sub get_unmodified_source_filename {
338 my $self = shift (@_);
339
340 return $self->{'terse_source_path'};
341}
342
343# returns the source_filename with whatever rename_method was given
344sub get_source_filename {
345 my $self = shift (@_);
346
347 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
348}
349
350
351
352# returns converted filename if available else returns source filename
353sub get_filename_for_hashing {
354 my $self = shift (@_);
355
356 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
357
358 if (!defined $filename) {
359 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
360 # if NULPlug processed file, then don't give a filename
361 if (defined $plugin_name && $plugin_name eq "NULPlug") {
362 $filename = undef;
363 } else { # returns the URL encoded source filename!
364 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
365 }
366 }
367
368 if (!&FileUtils::isFilenameAbsolute($filename)) {
369 $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
370 }
371
372 return $filename;
373}
374
375sub set_doc_type {
376 my $self = shift (@_);
377 my ($doc_type) = @_;
378
379 $self->set_metadata_element ($self->get_top_section(),
380 "gsdldoctype",
381 $doc_type);
382}
383
384# returns the gsdldoctype as it was provided
385# the default of "indexed_doc" is used if no document
386# type was provided
387sub get_doc_type {
388 my $self = shift (@_);
389
390 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
391 return $doc_type if (defined $doc_type);
392 return "indexed_doc";
393}
394
395
396# look up the reference to the a particular section
397sub _lookup_section {
398 my $self = shift (@_);
399 my ($section) = @_;
400
401 my ($num);
402 my $sectionref = $self;
403
404 while (defined $section && $section ne "") {
405
406 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
407
408 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
409
410 $section = "" unless defined $section;
411
412
413 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
414 $sectionref = $sectionref->{'subsections'}->{$num};
415 } else {
416 return undef;
417 }
418 }
419
420 return $sectionref;
421}
422
423# calculate OID by hashing the contents of the document
424sub _calc_OID {
425 my $self = shift (@_);
426 my ($filename) = @_;
427
428
429 my $osexe = &util::get_os_exe();
430
431 my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
432 $ENV{'GSDLOS'},"hashfile$osexe");
433
434 &util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
435 # The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
436
437 # A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
438 # (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
439 # E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
440
441 my $result = "NULL";
442
443
444 if (-e "$hashfile_exe") {
445# $result = `\"$hashfile_exe\" \"$filename\"`;
446# $result = `hashfile$osexe \"$filename\" 2>&1`;
447 $result = `hashfile$osexe \"$filename\"`;
448
449 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
450 } else {
451 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
452 }
453 return "HASH$result";
454}
455
456# methods dealing with OID, not groups of them.
457
458# if $OID is not provided one is calculated
459sub set_OID {
460 my $self = shift (@_);
461 my ($OID) = @_;
462
463 my $use_hash_oid = 0;
464 # if an OID wasn't provided calculate one
465 if (!defined $OID) {
466 $OID = "NULL";
467 if ($self->{'OIDtype'} =~ /^hash/) {
468 $use_hash_oid = 1;
469 } elsif ($self->{'OIDtype'} eq "incremental") {
470 $OID = "D" . $OIDcount;
471 $OIDcount ++;
472 } elsif ($self->{'OIDtype'} eq "filename") {
473 my $filename = $self->get_source_filename();
474 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
475 $OID = &util::tidy_up_oid($OID);
476 } elsif ($self->{'OIDtype'} eq "full_filename") {
477 my $source_filename = $self->get_source_filename();
478 my $dirsep = &util::get_os_dirsep();
479
480 $source_filename =~ s/^import$dirsep//;
481 $source_filename =~ s/$dirsep/-/g;
482 $source_filename =~ s/\./_/g;
483
484 $OID = $source_filename;
485 $OID = &util::tidy_up_oid($OID);
486 } elsif ($self->{'OIDtype'} eq "dirname") {
487 my $filename = $self->get_source_filename();
488 if (defined($filename)) { # && -e $filename) {
489 # get the immediate parent directory
490 $OID = &File::Basename::dirname($filename);
491 if (defined $OID) {
492 $OID = &File::Basename::basename($OID);
493 $OID = &util::tidy_up_oid($OID);
494 } else {
495 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
496 $use_hash_oid = 1;
497 }
498 } else {
499 print STDERR "Failed to find a filename, generating hash id\n";
500 $use_hash_oid = 1;
501 }
502
503 } elsif ($self->{'OIDtype'} eq "assigned") {
504 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
505 if (defined $identifier && $identifier ne "") {
506 $OID = $identifier;
507 $OID = &util::tidy_up_oid($OID);
508 } else {
509 # need a hash id
510 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
511 $use_hash_oid = 1;
512 }
513
514 } else {
515 $use_hash_oid = 1;
516 }
517
518 if ($use_hash_oid) {
519 my $hash_on_file = 1;
520 my $hash_on_ga_xml = 0;
521
522 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
523 $hash_on_file = 0;
524 $hash_on_ga_xml = 1;
525 }
526
527 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
528 $hash_on_file = 0;
529 $hash_on_ga_xml = 0;
530
531 my $source_filename = $self->get_source_filename();
532 my $dirsep = &util::get_os_dirsep();
533
534 $source_filename =~ s/^import$dirsep//;
535 $source_filename =~ s/$dirsep/-/g;
536 $source_filename =~ s/\./_/g;
537
538 # If the filename is very short then (handled naively)
539 # this can cause conjestion in the hash-values
540 # computed, leading documents sharing the same leading
541 # Hex values in the computed has.
542 #
543 # The solution taken here is to replace the name of
544 # the file name a sufficient number of times (up to
545 # the character limit defined in 'rep_limit' and
546 # make that the content that is hashed on
547
548 # *** Think twice before changing the following value
549 # as it will break backward compatability of computed
550 # document HASH values
551
552 my $rep_limit = 256;
553 my $hash_content = undef;
554
555 if (length($source_filename)<$rep_limit) {
556 my $rep_string = "$source_filename|";
557 my $rs_len = length($rep_string);
558
559 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
560
561 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
562 }
563 else {
564 $hash_content = $source_filename;
565 }
566
567 my $filename = &util::get_tmp_filename();
568 if (!open (OUTFILE, ">:utf8", $filename)) {
569 print STDERR "doc::set_OID could not write to $filename\n";
570 } else {
571 print OUTFILE $hash_content;
572 close (OUTFILE);
573 }
574 $OID = $self->_calc_OID ($filename);
575
576 &FileUtils::removeFiles ($filename);
577 }
578
579 if ($hash_on_file) {
580 # "hash" OID - feed file to hashfile.exe
581 my $filename = $self->get_filename_for_hashing();
582
583 # -z: don't want to hash on the file if it is zero size
584 if (defined($filename) && -e $filename && !-z $filename) {
585 $OID = $self->_calc_OID ($filename);
586 } else {
587 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
588 }
589 }
590
591 if ($hash_on_ga_xml) {
592 # In addition being asked to explicity calculate the has based on the GA file,
593 # can also end up coming into this block is doing 'hash_on_file' but the file
594 # itself is of zero bytes (as could be the case with 'doc.nul' file
595
596 my $filename = &util::get_tmp_filename();
597 if (!open (OUTFILE, ">:utf8", $filename)) {
598 print STDERR "doc::set_OID could not write to $filename\n";
599 } else {
600 my $doc_text = &docprint::get_section_xml($self);
601 print OUTFILE $doc_text;
602 close (OUTFILE);
603 }
604 $OID = $self->_calc_OID ($filename);
605 &FileUtils::removeFiles($filename);
606 }
607 }
608 }
609 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
610}
611
612# this uses hashdoc (embedded c thingy) which is faster but still
613# needs a little work to be sufficiently stable
614sub ___set_OID {
615 my $self = shift (@_);
616 my ($OID) = @_;
617
618 # if an OID wasn't provided then calculate hash value based on document
619 if (!defined $OID)
620 {
621 my $hash_text = &docprint::get_section_xml($self);
622 my $hash_len = length($hash_text);
623
624 $OID = &hashdoc::buffer($hash_text,$hash_len);
625 }
626
627 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
628}
629
630# returns the OID for this document
631sub get_OID {
632 my $self = shift (@_);
633 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
634 return $OID if (defined $OID);
635 return "NULL";
636}
637
638sub delete_OID {
639 my $self = shift (@_);
640
641 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
642}
643
644
645# methods for manipulating section names
646
647# returns the name of the top-most section (the top
648# level of the document
649sub get_top_section {
650 my $self = shift (@_);
651
652 return "";
653}
654
655# returns a section
656sub get_parent_section {
657 my $self = shift (@_);
658 my ($section) = @_;
659
660 $section =~ s/(^|\.)\d+$//;
661
662 return $section;
663}
664
665# returns the first child section (or the end child
666# if there isn't any)
667sub get_begin_child {
668 my $self = shift (@_);
669 my ($section) = @_;
670
671 my $section_ptr = $self->_lookup_section($section);
672 return "" unless defined $section_ptr;
673
674 if (defined $section_ptr->{'subsection_order'}->[0]) {
675 return "$section.$section_ptr->{'subsection_order'}->[0]";
676 }
677
678 return $self->get_end_child ($section);
679}
680
681# returns the next child of a parent section
682sub get_next_child {
683 my $self = shift (@_);
684 my ($section) = @_;
685
686 my $parent_section = $self->get_parent_section($section);
687 my $parent_section_ptr = $self->_lookup_section($parent_section);
688 return undef unless defined $parent_section_ptr;
689
690 my ($section_num) = $section =~ /(\d+)$/;
691 return undef unless defined $section_num;
692
693 my $i = 0;
694 my $section_order = $parent_section_ptr->{'subsection_order'};
695 while ($i < scalar(@$section_order)) {
696 last if $section_order->[$i] eq $section_num;
697 $i++;
698 }
699
700 $i++; # the next child
701 if ($i < scalar(@$section_order)) {
702 return $section_order->[$i] if $parent_section eq "";
703 return "$parent_section.$section_order->[$i]";
704 }
705
706 # no more sections in this level
707 return undef;
708}
709
710# returns a reference to a list of children
711sub get_children {
712 my $self = shift (@_);
713 my ($section) = @_;
714
715 my $section_ptr = $self->_lookup_section($section);
716 return [] unless defined $section_ptr;
717
718 my @children = @{$section_ptr->{'subsection_order'}};
719
720 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
721 return \@children;
722}
723
724# returns the child section one past the last one (which
725# is coded as "0")
726sub get_end_child {
727 my $self = shift (@_);
728 my ($section) = @_;
729
730 return $section . ".0" unless $section eq "";
731 return "0";
732}
733
734# returns the next section in book order
735sub get_next_section {
736 my $self = shift (@_);
737 my ($section) = @_;
738
739 return undef unless defined $section;
740
741 my $section_ptr = $self->_lookup_section($section);
742 return undef unless defined $section_ptr;
743
744 # first try to find first child
745 if (defined $section_ptr->{'subsection_order'}->[0]) {
746 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
747 return "$section.$section_ptr->{'subsection_order'}->[0]";
748 }
749
750 do {
751 # try to find sibling
752 my $next_child = $self->get_next_child ($section);
753 return $next_child if (defined $next_child);
754
755 # move up one level
756 $section = $self->get_parent_section ($section);
757 } while $section =~ /\d/;
758
759 return undef;
760}
761
762sub is_leaf_section {
763 my $self = shift (@_);
764 my ($section) = @_;
765
766 my $section_ptr = $self->_lookup_section($section);
767 return 1 unless defined $section_ptr;
768
769 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
770}
771
772# methods for dealing with sections
773
774# returns the name of the inserted section
775sub insert_section {
776 my $self = shift (@_);
777 my ($before_section) = @_;
778
779 # get the child to insert before and its parent section
780 my $parent_section = "";
781 my $before_child = "0";
782 my @before_section = split (/\./, $before_section);
783 if (scalar(@before_section) > 0) {
784 $before_child = pop (@before_section);
785 $parent_section = join (".", @before_section);
786 }
787
788 my $parent_section_ptr = $self->_lookup_section($parent_section);
789 if (!defined $parent_section_ptr) {
790 print STDERR "doc::insert_section couldn't find parent section " .
791 "$parent_section\n";
792 return;
793 }
794
795 # get the next section number
796 my $section_num = $parent_section_ptr->{'next_subsection'}++;
797
798 my $i = 0;
799 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
800 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
801 $i++;
802 }
803
804 # insert the section number into the order list
805 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
806
807 # add this section to the parent section
808 my $section_ptr = {'subsection_order'=>[],
809 'next_subsection'=>1,
810 'subsections'=>{},
811 'metadata'=>[],
812 'text'=>""};
813 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
814
815 # work out the full section number
816 my $section = $parent_section;
817 $section .= "." unless $section eq "";
818 $section .= $section_num;
819
820 return $section;
821}
822
823# creates a pre-named section
824sub create_named_section {
825 my $self = shift (@_);
826 my ($mastersection) = @_;
827
828 my ($num);
829 my $section = $mastersection;
830 my $sectionref = $self;
831
832 while ($section ne "") {
833 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
834 $num =~ s/^0+(\d)/$1/; # remove leading 0s
835 $section = "" unless defined $section;
836
837 if (defined $num) {
838 if (!defined $sectionref->{'subsections'}->{$num}) {
839 push (@{$sectionref->{'subsection_order'}}, $num);
840 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
841 'next_subsection'=>1,
842 'subsections'=>{},
843 'metadata'=>[],
844 'text'=>""};
845 if ($num >= $sectionref->{'next_subsection'}) {
846 $sectionref->{'next_subsection'} = $num + 1;
847 }
848 }
849 $sectionref = $sectionref->{'subsections'}->{$num};
850
851 } else {
852 print STDERR "doc::create_named_section couldn't create section ";
853 print STDERR "$mastersection\n";
854 last;
855 }
856 }
857}
858
859# returns a reference to a list of subsections
860sub list_subsections {
861 my $self = shift (@_);
862 my ($section) = @_;
863
864 my $section_ptr = $self->_lookup_section ($section);
865 if (!defined $section_ptr) {
866 print STDERR "doc::list_subsections couldn't find section $section\n";
867 return [];
868 }
869
870 return [@{$section_ptr->{'subsection_order'}}];
871}
872
873sub delete_section {
874 my $self = shift (@_);
875 my ($section) = @_;
876
877# my $section_ptr = {'subsection_order'=>[],
878# 'next_subsection'=>1,
879# 'subsections'=>{},
880# 'metadata'=>[],
881# 'text'=>""};
882
883 # if this is the top section reset everything
884 if ($section eq "") {
885 $self->{'subsection_order'} = [];
886 $self->{'subsections'} = {};
887 $self->{'metadata'} = [];
888 $self->{'text'} = "";
889 return;
890 }
891
892 # find the parent of the section to delete
893 my $parent_section = "";
894 my $child = "0";
895 my @section = split (/\./, $section);
896 if (scalar(@section) > 0) {
897 $child = pop (@section);
898 $parent_section = join (".", @section);
899 }
900
901 my $parent_section_ptr = $self->_lookup_section($parent_section);
902 if (!defined $parent_section_ptr) {
903 print STDERR "doc::delete_section couldn't find parent section " .
904 "$parent_section\n";
905 return;
906 }
907
908 # remove this section from the subsection_order list
909 my $i = 0;
910 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
911 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
912 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
913 last;
914 }
915 $i++;
916 }
917
918 # remove this section from the subsection hash
919 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
920 undef $parent_section_ptr->{'subsections'}->{$child};
921 }
922}
923
924#--
925# methods for dealing with metadata
926
927# set_metadata_element and get_metadata_element are for metadata
928# which should only have one value. add_meta_data and get_metadata
929# are for metadata which can have more than one value.
930
931# returns the first metadata value which matches field
932
933# This version of get metadata element works much like the one above,
934# except it allows for the namespace portion of a metadata element to
935# be ignored, thus if you are searching for dc.Title, the first piece
936# of matching metadata ending with the name Title (once any namespace
937# is removed) would be returned.
938# 28-11-2003 John Thompson
939sub get_metadata_element {
940 my $self = shift (@_);
941 my ($section, $field, $ignore_namespace) = @_;
942 my ($data);
943
944 $ignore_namespace = 0 unless defined $ignore_namespace;
945
946 my $section_ptr = $self->_lookup_section($section);
947 if (!defined $section_ptr) {
948 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
949 return;
950 }
951
952 # Remove any namespace if we are being told to ignore them
953 if($ignore_namespace) {
954 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
955 }
956
957 foreach $data (@{$section_ptr->{'metadata'}}) {
958
959 my $data_name = $data->[0];
960
961 # Remove any namespace if we are being told to ignore them
962 if($ignore_namespace) {
963 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
964 }
965 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
966 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
967
968 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
969 }
970
971 return undef; # was not found
972}
973
974# returns a list of the form [value1, value2, ...]
975sub get_metadata {
976 my $self = shift (@_);
977 my ($section, $field, $ignore_namespace) = @_;
978 my ($data);
979
980 $ignore_namespace = 0 unless defined $ignore_namespace;
981
982 my $section_ptr = $self->_lookup_section($section);
983 if (!defined $section_ptr) {
984 print STDERR "doc::get_metadata couldn't find section ",
985 $section, "\n";
986 return;
987 }
988
989 # Remove any namespace if we are being told to ignore them
990 if($ignore_namespace) {
991 $field =~ s/^.*\.//;
992 }
993
994 my @metadata = ();
995 foreach $data (@{$section_ptr->{'metadata'}}) {
996
997 my $data_name = $data->[0];
998
999 # Remove any namespace if we are being told to ignore them
1000 if($ignore_namespace) {
1001 $data_name =~ s/^.*\.//;
1002 }
1003 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
1004 $data_name =~ s/^ex\.([^.]+)$/$1/;
1005
1006 push (@metadata, $data->[1]) if ($data_name eq $field);
1007 }
1008
1009 return \@metadata;
1010}
1011
1012sub get_metadata_hashmap {
1013 my $self = shift (@_);
1014 my ($section, $opt_namespace) = @_;
1015
1016 my $section_ptr = $self->_lookup_section($section);
1017 if (!defined $section_ptr) {
1018 print STDERR "doc::get_metadata couldn't find section ",
1019 $section, "\n";
1020 return;
1021 }
1022
1023 my $metadata_hashmap = {};
1024 foreach my $data (@{$section_ptr->{'metadata'}}) {
1025 my $metaname = $data->[0];
1026
1027 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
1028 if (!defined $metadata_hashmap->{$metaname}) {
1029 $metadata_hashmap->{$metaname} = [];
1030 }
1031 my $metaval_list = $metadata_hashmap->{$metaname};
1032 push(@$metaval_list, $data->[1]);
1033 }
1034 }
1035
1036 return $metadata_hashmap;
1037}
1038
1039# returns a list of the form [[field,value],[field,value],...]
1040sub get_all_metadata {
1041 my $self = shift (@_);
1042 my ($section) = @_;
1043
1044 my $section_ptr = $self->_lookup_section($section);
1045 if (!defined $section_ptr) {
1046 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
1047 return;
1048 }
1049
1050 return $section_ptr->{'metadata'};
1051}
1052
1053# $value is optional
1054sub delete_metadata {
1055 my $self = shift (@_);
1056 my ($section, $field, $value) = @_;
1057
1058 my $section_ptr = $self->_lookup_section($section);
1059 if (!defined $section_ptr) {
1060 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
1061 return;
1062 }
1063
1064 my $i = 0;
1065 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1066 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1067 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1068 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1069 } else {
1070 $i++;
1071 }
1072 }
1073}
1074
1075sub delete_all_metadata {
1076 my $self = shift (@_);
1077 my ($section) = @_;
1078
1079 my $section_ptr = $self->_lookup_section($section);
1080 if (!defined $section_ptr) {
1081 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1082 return;
1083 }
1084
1085 $section_ptr->{'metadata'} = [];
1086}
1087
1088sub set_metadata_element {
1089 my $self = shift (@_);
1090 my ($section, $field, $value) = @_;
1091
1092 $self->set_utf8_metadata_element ($section, $field,
1093 &unicode::ascii2utf8(\$value));
1094}
1095
1096# set_utf8_metadata_element assumes the text has already been
1097# converted to the UTF-8 encoding.
1098sub set_utf8_metadata_element {
1099 my $self = shift (@_);
1100 my ($section, $field, $value) = @_;
1101
1102 $self->delete_metadata ($section, $field);
1103 $self->add_utf8_metadata ($section, $field, $value);
1104}
1105
1106
1107# add_metadata assumes the text is in (extended) ascii form. For
1108# text which has already been converted to the UTF-8 format use
1109# add_utf8_metadata.
1110sub add_metadata {
1111 my $self = shift (@_);
1112 my ($section, $field, $value) = @_;
1113
1114 $self->add_utf8_metadata ($section, $field,
1115 &unicode::ascii2utf8(\$value));
1116}
1117
1118sub add_utf8_metadata {
1119 my $self = shift (@_);
1120 my ($section, $field, $value) = @_;
1121
1122 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1123 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1124 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1125
1126 my $section_ptr = $self->_lookup_section($section);
1127 if (!defined $section_ptr) {
1128 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1129 return;
1130 }
1131 if (!defined $value) {
1132 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1133 return;
1134 }
1135 if (!defined $field) {
1136 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1137 return;
1138 }
1139
1140 #print STDERR "###$field=$value\n";
1141
1142 # For now, supress this check. Given that text data read in is now
1143 # Unicode aware, then the following block of code can (ironically enough)
1144 # cause our unicode compliant string to be re-encoded (leading to
1145 # a double-encoded UTF-8 string, which we definitely don't want!).
1146
1147
1148 # double check that the value is utf-8
1149 # if (!&unicode::check_is_utf8($value)) {
1150 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1151 # &unicode::ensure_utf8(\$value);
1152 # print STDERR " Tried converting to utf8: $value\n";
1153 # }
1154
1155 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1156 if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1157 {
1158 my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1159 if (defined $mdprefix) {
1160 # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1161 push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1162 }
1163
1164 my $direction;
1165 if($value =~ m/^-/)
1166 {
1167 $direction = ($field eq "Latitude") ? "S" : "W";
1168 }
1169 else
1170 {
1171 $direction = ($field eq "Latitude") ? "N" : "E";
1172 }
1173
1174 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1175 if(defined $beforeDec && defined $afterDec)
1176 {
1177 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1178 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1179
1180 for(my $i = 2; $i <= 4; $i++)
1181 {
1182 if(length($afterDec) >= $i)
1183 {
1184 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1185 }
1186 }
1187
1188 #Only add the metadata if it has not already been added
1189 my $metaMap = $self->get_metadata_hashmap($section); # metaMap not used and called function has no apparent side-effects,
1190 # but this line appears important for ensuring uniqueness of (Latitude, value) meta
1191 # in the section. Also for LatShort, Longitude, LngShort.
1192 }
1193
1194 if($field =~ m/^(.+\.)?Longitude$/) {
1195 # if we are dealing with Longitude meta, we should 1. have Latitude meta too; 2. already have processed Latitude meta
1196 # in that case, add both Lat and Lng of this section as a Coordinate meta
1197 my $latitude = $self->get_metadata_element ($section, "Latitude");
1198 my $longitude = $value;
1199 # TODO: would like all Longitude meta placed together, followed by all Coordinate meta,
1200 # but the following will add all such coord meta to the doc object and the end of this function will add Longitude meta
1201 $self->processCoordinate($section, $latitude, $longitude);
1202 }
1203 }
1204
1205 elsif($field eq "GPS.mapOverlay") { # then the $value is a JSON string
1206
1207 # In order to allow searching map-data-enriched documents by map shape descriptions,
1208 # and to run rawquery searches for other docs by proximity based on their map data,
1209 # need to store the shape descriptions and Coordinate info for shapes into the text index.
1210 # We add the description for each shape in the mapoverlay into the text index as GPSMapOverlayLabel
1211 # And we add Coordinate (CD) and CoordShort (CS) info for each shape in the mapoverlay in the format (Lat, Lng) as ("37S339 175E342")
1212 # where the digits before the N/S/E/W direction represents the whole number. And the digits after the direction are the
1213 # decimal places which can vary between 0 and 2 to 4 digits.
1214
1215 # However, we only want to process GPS.mapOverlay during buildcol and only in the text indexing passes (e.g. sidx and didx for lucene)
1216 # and certainly never during the infodb pass of buildcol. The latter can end up duplicating Coordinate/CoordShort/GPSMapOverlayLabel
1217 # when rebuilding with the online doc editor, as that runs incremental-rebuild which then calls basebuilder::reconstruct_doc_objs_metadata()
1218 # on all docs NOT being incrementally rebuilt. That call would get meta from the infodb and use it to reconstruct the doc objects of docs
1219 # NOT being incrementally built. If the Coord and Label meta were written to the infodb, they would then be loaded back in memory when the
1220 # collection is incrementally rebuilt for those docs that don't need incremental processing. Then this function would once again add the
1221 # same meta into the infodb, thus duplicating what goes into the infodb. Hence, don't do all the following if doc::processor_mode =~ "infodb",
1222 # or anything other than a text mode.
1223
1224 # Note that for incremental rebuilding, the text pass can be called textreindex for instance (and infodb pass can be incinfodb).
1225 # So don't check for exact string match
1226
1227 if($doc::cmd_line_mode eq "buildcol" && $doc::processor_mode =~ m/^text/) { # currently known text processor_modes:
1228 # text, textreindex, possibly textdelete (see ArchivesInfPlugin.pm::read() for last 2).
1229 # OR: ..."buildcol" && $doc::processor_mode !~ m/infodb$/) # if dummy pass important
1230
1231 ###print STDERR "GPS.mapOverlay before decoding, val = " . $value . "\n";
1232
1233 # TODO: html decode?
1234 $value =~ s@&#091;@[@g;
1235 $value =~ s@&#093;@]@g;
1236 $value =~ s@&quot;@"@g;
1237 ###print STDERR "GPS.mapOverlay after decoding, val = " . $value . "\n";
1238
1239 my $json_array = decode_json $value;
1240
1241 foreach my $shape (@$json_array) {
1242
1243 # Put each available shape description/label into this section's metadata with GPSMapOverlayLabel as metaname.
1244 # Just as for Coordinate meta, don't need to know which shape a label belongs too. This is just so each label
1245 # will be indexed, and therefore can be searched.
1246
1247 my $description = $shape->{"description"};
1248 if($description) {
1249 push (@{$section_ptr->{'metadata'}}, ["GPSMapOverlayLabel", $description]);
1250 ###print STDERR "@@@@############################################ Just added description meta: " . $description . "\n";
1251 }
1252
1253 my $type = $shape->{"type"};
1254 ###print STDERR "Shape type : " . $type . "\n";
1255
1256 if($type eq "circle") {
1257 ###print STDERR "Found a circle:\n" . &printShape($json, $shape);
1258
1259 # work out bounding box
1260 # want the inverse of this useful page:
1261 # https://stackoverflow.com/questions/639695/how-to-convert-latitude-or-longitude-to-meters
1262 # https://www.geodatasource.com/developers/javascript
1263
1264 # Dr Bainbridge wants us to use: https://gis.stackexchange.com/questions/5821/calculating-latitude-longitude-x-miles-from-point
1265 # Since for N,E,S,W the following works out the same as above link, we're using the rule of thumb in the following:
1266 # https://gis.stackexchange.com/questions/2951/algorithm-for-offsetting-a-latitude-longitude-by-some-amount-of-meters
1267 # which states
1268 # "If your displacements aren't too great (less than a few kilometers) and you're not right at the poles,
1269 # use the quick and dirty estimate that 111,111 meters (111.111 km) in the y direction is 1 degree
1270 # (of latitude) and 111,111 * cos(latitude) meters in the x direction is 1 degree (of longitude)."
1271 my $centre_lat = $shape->{"center"}->{"lat"};
1272 my $centre_lng = $shape->{"center"}->{"lng"};
1273 my $radius = $shape->{"radius"}; # in metres!
1274
1275 ###print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
1276
1277 my $lat_north = $centre_lat + ($radius/111111);
1278 my $lat_south = $centre_lat - ($radius/111111);
1279
1280 ###print STDERR "### lat_north: $lat_north\n";
1281 ###print STDERR "### lat_south: $lat_south\n";
1282
1283 # our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
1284 # all expect their angle to be in radians. So need to convert from degree to radians before we can take the cos of it.
1285 my $centre_lat_radians = $self->degreesToRadians($centre_lat);
1286 my $cos_in_radians = cos($centre_lat_radians);
1287 ###print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
1288 my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
1289 my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
1290 ###print STDERR "### lng_east $lng_east\n";
1291 ###print STDERR "### lng_west $lng_west\n";
1292
1293 my $cos_lat = cos($centre_lat);
1294 ###print STDERR "cos $centre_lat is $cos_lat\n";
1295
1296 $self->processCoordinate($section, $lat_north, $lng_east);
1297 $self->processCoordinate($section, $lat_south, $lng_east);
1298 $self->processCoordinate($section, $lat_south, $lng_west);
1299 $self->processCoordinate($section, $lat_north, $lng_west);
1300
1301 }
1302 elsif ($type eq "marker") {
1303 ###print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
1304 ###print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
1305 $self->processCoordinate($section, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});
1306 }
1307 elsif ($type eq "polyline" || $type eq "polygon") {
1308 my $path_array = $shape->{"path"};
1309 foreach my $position (@$path_array) {
1310 $self->processCoordinate($section, $position->{"lat"}, $position->{"lng"});
1311 }
1312 }
1313 elsif ($type eq "rectangle") {
1314
1315 my $bounds = $shape->{"bounds"};
1316 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"east"});
1317 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"east"});
1318 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"west"});
1319 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"west"});
1320 }
1321
1322 } # end for on each shape in GPS.mapOverlay
1323 } # end if(buildcol and text pass)
1324 } # end GPS.mapOverlay meta
1325
1326 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1327}
1328
1329# https://en.wikipedia.org/wiki/Radian
1330sub degreesToRadians
1331{
1332 my $self = shift (@_);
1333 my ($degrees) = @_;
1334
1335 return $degrees * pi /180; # returns radians
1336}
1337
1338sub radiansToDegrees
1339{
1340 my $self = shift (@_);
1341 my ($radians) = @_;
1342
1343 return $radians * 180 / pi; # returns degrees
1344}
1345
1346# Call as:
1347# my $json = JSON->new->allow_nonref;
1348# &printAllShapes($json, $json_array);
1349sub printAllShapes {
1350 my ($json, $json_array) = @_;
1351
1352
1353 #my $pretty_print_shape = $json->pretty->encode( $json_array->[0] );
1354 foreach my $shape (@$json_array) {
1355 my $pretty_print_shape = $json->pretty->encode( $shape );
1356 print STDERR "Shape: $pretty_print_shape\n";
1357 }
1358
1359}
1360
1361# For the (lat, lng) coordinate given,
1362# attaches Coordinate and multiple CoordShort (different precision level) metadata to the doc object
1363sub processCoordinate {
1364 my $self = shift (@_);
1365 my ($section, $latitude, $longitude) = @_;
1366
1367 my $section_ptr = $self->_lookup_section($section);
1368
1369 my $lat_direction = ($latitude =~ m/^-/) ? "S" : "N";
1370 my $lng_direction = ($longitude =~ m/^-/) ? "W" : "E";
1371
1372 # have to store (lat, lng) in pairs, when there are so many coords to store
1373 #push (@{$section_ptr->{'metadata'}}, ["Latitude", $latitude]);
1374 #push (@{$section_ptr->{'metadata'}}, ["Longitude", $longitude]);
1375
1376 push (@{$section_ptr->{'metadata'}}, ["Coordinate", "$latitude $longitude"]); # "$latitude$lat_direction $longitude$lng_direction"
1377
1378 my ($latBeforeDec, $latAfterDec);
1379 my ($lngBeforeDec, $lngAfterDec);
1380
1381 if($latitude !~ m/\./) {
1382 $latBeforeDec = $latitude;
1383 $latAfterDec = "";
1384 } else {
1385 ($latBeforeDec, $latAfterDec) = ($latitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1386 }
1387 if($longitude !~ m/\./) {
1388 $lngBeforeDec = $longitude;
1389 $lngAfterDec = "";
1390 } else {
1391 ($lngBeforeDec, $lngAfterDec) = ($longitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1392 }
1393
1394
1395 my $name = "CoordShort";
1396 push (@{$section_ptr->{'metadata'}}, [$name, "$latBeforeDec$lat_direction $lngBeforeDec$lng_direction"]);
1397
1398 for(my $i = 2; $i <= 4; $i++)
1399 {
1400 my $latDecPlaces = (length($latAfterDec) >= $i) ? substr($latAfterDec, 0, $i) : "";
1401 my $lngDecPlaces = (length($lngAfterDec) >= $i) ? substr($lngAfterDec, 0, $i) : "";
1402
1403 push (@{$section_ptr->{'metadata'}}, [$name,
1404 $latBeforeDec . $lat_direction. $latDecPlaces . " " . $lngBeforeDec . $lng_direction. $lngDecPlaces]);
1405
1406 }
1407
1408 #Only add the metadata if it has not already been added
1409 #my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no apparent side-effects.)
1410}
1411
1412
1413# methods for dealing with text
1414
1415# returns the text for a section
1416sub get_text {
1417 my $self = shift (@_);
1418 my ($section) = @_;
1419
1420 my $section_ptr = $self->_lookup_section($section);
1421 if (!defined $section_ptr) {
1422 print STDERR "doc::get_text couldn't find section " .
1423 "$section\n";
1424 return "";
1425 }
1426
1427 return $section_ptr->{'text'};
1428}
1429
1430# returns the (utf-8 encoded) length of the text for a section
1431sub get_text_length {
1432 my $self = shift (@_);
1433 my ($section) = @_;
1434
1435 my $section_ptr = $self->_lookup_section($section);
1436 if (!defined $section_ptr) {
1437 print STDERR "doc::get_text_length couldn't find section " .
1438 "$section\n";
1439 return 0;
1440 }
1441
1442 return length ($section_ptr->{'text'});
1443}
1444
1445# returns the total length for all the sections
1446sub get_total_text_length {
1447 my $self = shift (@_);
1448
1449 my $section = $self->get_top_section();
1450 my $length = 0;
1451 while (defined $section) {
1452 $length += $self->get_text_length($section);
1453 $section = $self->get_next_section($section);
1454 }
1455 return $length;
1456}
1457
1458sub delete_text {
1459 my $self = shift (@_);
1460 my ($section) = @_;
1461
1462 my $section_ptr = $self->_lookup_section($section);
1463 if (!defined $section_ptr) {
1464 print STDERR "doc::delete_text couldn't find section " .
1465 "$section\n";
1466 return;
1467 }
1468
1469 $section_ptr->{'text'} = "";
1470}
1471
1472# add_text assumes the text is in (extended) ascii form. For
1473# text which has been already converted to the UTF-8 format
1474# use add_utf8_text.
1475sub add_text {
1476 my $self = shift (@_);
1477 my ($section, $text) = @_;
1478
1479 # convert the text to UTF-8 encoded unicode characters
1480 # and add the text
1481 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1482}
1483
1484
1485# add_utf8_text assumes the text to be added has already
1486# been converted to the UTF-8 encoding. For ascii text use
1487# add_text
1488# Pass by value version (internally calls pass by ref version
1489# to avoid code duplication)
1490sub add_utf8_text {
1491 my $self = shift (@_);
1492 my ($section, $text) = @_;
1493
1494 $self->add_utf8_textref($section, \$text);
1495}
1496
1497# Pass by reference version, used by GreenstoneSQLPlugin for fulltext
1498sub add_utf8_textref {
1499 my $self = shift (@_);
1500 my ($section, $text_ref) = @_;
1501
1502 my $section_ptr = $self->_lookup_section($section);
1503 if (!defined $section_ptr) {
1504 print STDERR "doc::add_utf8_textref couldn't find section " .
1505 "$section\n";
1506 return;
1507 }
1508
1509 $section_ptr->{'text'} .= $$text_ref;
1510}
1511
1512# returns the Source meta, which is the utf8 filename generated.
1513# Added a separate method here for convenience
1514sub get_source {
1515 my $self = shift (@_);
1516 return $self->get_metadata_element ($self->get_top_section(), "Source");
1517}
1518
1519# returns the SourceFile meta, which is the url reference to the URL-encoded
1520# version of Source (the utf8 filename). Added a separate method here for convenience
1521sub get_sourcefile {
1522 my $self = shift (@_);
1523 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1524}
1525
1526# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1527# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1528sub get_assocfile_from_sourcefile {
1529 my $self = shift (@_);
1530
1531 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1532 my $top_section = $self->get_top_section();
1533 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1534
1535 # get the actual filename as it exists on the filesystem which this url refers to
1536 $source_file = &unicode::url_to_filename($source_file);
1537 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1538 return $assocfilename;
1539}
1540
1541# methods for dealing with associated files
1542
1543# a file is associated with a document, NOT a section.
1544# if section is defined it is noted in the data structure
1545# only so that files associated from a particular section
1546# may be removed later (using delete_section_assoc_files)
1547sub associate_file {
1548 my $self = shift (@_);
1549 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1550 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1551
1552 # remove all associated files with the same name
1553 $self->delete_assoc_file ($assoc_filename);
1554
1555 # Too harsh a requirement
1556 # Definitely get HTML docs, for example, with some missing
1557 # support files
1558# if (!&util::fd_exists($real_filename)) {
1559# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1560# exit -1;
1561# }
1562
1563# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1564# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1565## my $utf8_filename = Encode::encode("utf8",$filename);
1566
1567 push (@{$self->{'associated_files'}},
1568 [$real_filename, $assoc_filename, $mime_type, $section]);
1569}
1570
1571# returns a list of associated files in the form
1572# [[real_filename, assoc_filename, mimetype], ...]
1573sub get_assoc_files {
1574 my $self = shift (@_);
1575
1576 return $self->{'associated_files'};
1577}
1578
1579# the following two methods used to keep track of original associated files
1580# for incremental building. eg a txt file used by an item file does not end
1581# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1582# file for incremental build
1583sub associate_source_file {
1584 my $self = shift (@_);
1585 my ($full_filename) = @_;
1586
1587 push (@{$self->{'source_assoc_files'}}, $full_filename);
1588
1589}
1590
1591sub get_source_assoc_files {
1592 my $self = shift (@_);
1593
1594 return $self->{'source_assoc_files'};
1595
1596
1597}
1598sub metadata_file {
1599 my $self = shift (@_);
1600 my ($real_filename, $filename) = @_;
1601
1602 push (@{$self->{'metadata_files'}},
1603 [$real_filename, $filename]);
1604}
1605
1606# used for writing out the archiveinf-doc info database, to list all the metadata files
1607sub get_meta_files {
1608 my $self = shift (@_);
1609
1610 return $self->{'metadata_files'};
1611}
1612
1613sub delete_section_assoc_files {
1614 my $self = shift (@_);
1615 my ($section) = @_;
1616
1617 my $i=0;
1618 while ($i < scalar (@{$self->{'associated_files'}})) {
1619 if (defined $self->{'associated_files'}->[$i]->[3] &&
1620 $self->{'associated_files'}->[$i]->[3] eq $section) {
1621 splice (@{$self->{'associated_files'}}, $i, 1);
1622 } else {
1623 $i++;
1624 }
1625 }
1626}
1627
1628sub delete_assoc_file {
1629 my $self = shift (@_);
1630 my ($assoc_filename) = @_;
1631
1632 my $i=0;
1633 while ($i < scalar (@{$self->{'associated_files'}})) {
1634 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1635 splice (@{$self->{'associated_files'}}, $i, 1);
1636 } else {
1637 $i++;
1638 }
1639 }
1640}
1641
1642sub reset_nextsection_ptr {
1643 my $self = shift (@_);
1644 my ($section) = @_;
1645
1646 my $section_ptr = $self->_lookup_section($section);
1647 $section_ptr->{'next_subsection'} = 1;
1648}
1649
16501;
Note: See TracBrowser for help on using the repository browser.