source: main/trunk/greenstone2/perllib/doc.pm

Last change on this file was 38753, checked in by kjdon, 3 months ago

field may be ex.Latitude, so use metaname (has had namespace removed) when testing if eq Latitude/Longitude. Note, haven't tested this

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 54.4 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use FileUtils;
40use ghtml;
41use File::stat;
42##use hashdoc;
43use docprint;
44use JSON;
45
46# We just need pi from the Trig lib
47# Import constants pi2, pip2, pip4 (2*pi, pi/2, pi/4).
48# But doing
49# use Math::Trig ':pi';
50# resulted in errors on 32 bit linux VM with perl v8 of "bareword pi not allowed while strict subs in use"
51# So, now doing just
52use Math::Trig;
53
54# the document type may be indexed_doc, nonindexed_doc, or
55# classification
56
57our $OIDcount = 0;
58
59
60# Declare the Package (level) Variable that will tell us whether we're in the import or buildcol phase.
61# This var will be set by import.pl, buildcol.pl and export.pl to the then appropriate value.
62# https://perlmaven.com/package-variables-and-lexical-variables-in-perl
63# But: https://stackoverflow.com/questions/7542120/perl-how-to-make-variables-from-requiring-script-available-in-required-script
64# https://stackoverflow.com/questions/33230015/how-do-i-use-a-variable-in-another-perl-script
65#$doc::cmd_line_mode = undef;
66# Decided to follow the usage of $OIDcount above to declare the necessary package-level variables 'globally' accessible
67# actually accessible only for those packages that import this one (with 'use doc')
68my $cmd_line_mode = undef;
69
70# processor_mode keeps track of which buildcol pass we're at: dummy, text (sidx/didx passes) or infodb
71my $processor_mode = undef;
72
73# rename_method can be 'url', 'none', 'base64'
74sub new {
75 my $class = shift (@_);
76 my ($source_filename, $doc_type, $rename_method) = @_;
77
78
79 my $self = bless {'associated_files'=>[],
80 'subsection_order'=>[],
81 'next_subsection'=>1,
82 'subsections'=>{},
83 'metadata'=>[],
84 'text'=>"",
85 'OIDtype'=>"hash"}, $class;
86
87 # used to set lastmodified here, but this can screw up the HASH ids, so
88 # the docsave processor now calls set_lastmodified
89
90 $self->set_source_path($source_filename);
91
92 if (defined $source_filename) {
93 $source_filename = &util::filename_within_collection($source_filename);
94 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
95 $self->set_source_filename ($source_filename, $rename_method);
96 }
97
98 $self->set_doc_type ($doc_type) if defined $doc_type;
99
100 return $self;
101}
102
103
104sub set_source_path
105{
106 my $self = shift @_;
107 my ($source_filename) = @_;
108
109 if (defined $source_filename) {
110 # On Windows the source_filename can be in terse DOS format
111 # e.g. test~1.txt
112
113 $self->{'terse_source_path'} = $source_filename;
114
115 # Use the FileUtil library methods as they are aware of more special
116 # cases such as HDFS [jmt12]
117 if (&FileUtils::fileExists($source_filename))
118 {
119 # See if we can do better for Windows with a filename
120 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
121 require Win32;
122 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
123 }
124 else {
125 # For Unix-based systems, there is no difference between the two
126 $self->{'source_path'} = $source_filename;
127 }
128
129 ## Unused.
130 # On Windows, the code above has ensured source_path is the Win long (full) path name.
131 # To help GLI associate DocXML metadata with an easily calculated and accurate representation
132 # of filenames, we now store the Win long path name, hex encoded.
133 # We're not using this field on Linux, as I can't get the hex encodings generated to match
134 # what GLI Java code generates. But for symmetry we store this field on Unix too, but we need
135 # to hex-encode source_path on Unix too, or it may not be UTF-8 and doc.xml will be invalid
136 ##my $hexencodedlongsourcepath = &unicode::debug_unicode_string($self->{'source_path'});
137 ##$self->set_utf8_metadata_element ($self->get_top_section(), "gsdlfullsourcepath", $hexencodedlongsourcepath);
138
139 }
140 else {
141 print STDERR "Warning: In doc::set_source_path(), file\n";
142 print STDERR " $source_filename\n";
143 print STDERR " does not exist\n";
144
145 # (default) Set it to whatever we were given
146 $self->{'source_path'} = $source_filename;
147 }
148 }
149 else {
150 # Previous code for setting source_path allowed for
151 # it to be undefined, so continue this practice
152 $self->{'terse_source_path'} = undef;
153 $self->{'source_path'} = undef;
154 }
155}
156
157
158sub get_source_path
159{
160 my $self = shift @_;
161
162 return $self->{'terse_source_path'};
163}
164
165# set lastmodified for OAI purposes, added by GRB, moved by kjdon
166sub set_oailastmodified {
167 my $self = shift (@_);
168
169 my $source_path = $self->{'terse_source_path'};
170
171 if (defined $source_path && (-e $source_path)) {
172 my $current_time = time;
173
174 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
175 $wday, $yday, $isdst) = localtime($current_time);
176
177 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
178
179 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
180 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
181 }
182}
183
184# no longer used for OAI purposes, since lastmodified is not what we want as the
185# Datestamp of a document. This doc metadata may be useful for general purposes.
186sub set_lastmodified {
187 my $self = shift (@_);
188
189 my $source_path = $self->{'terse_source_path'};
190
191 if (defined $source_path && (-e $source_path)) {
192
193 my $file_stat = stat($source_path);
194 my $mtime = $file_stat->mtime;
195 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
196 $wday, $yday, $isdst) = localtime($mtime);
197
198 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
199
200 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
201 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
202 }
203}
204
205# clone the $self object
206sub duplicate {
207 my $self = shift (@_);
208
209 my $newobj = {};
210
211 foreach my $k (keys %$self) {
212 $newobj->{$k} = &clone ($self->{$k});
213 }
214
215 bless $newobj, ref($self);
216 return $newobj;
217}
218
219sub clone {
220 my ($from) = @_;
221 my $type = ref ($from);
222
223 if ($type eq "HASH") {
224 my $to = {};
225 foreach my $key (keys %$from) {
226 $to->{$key} = &clone ($from->{$key});
227 }
228 return $to;
229 } elsif ($type eq "ARRAY") {
230 my $to = [];
231 foreach my $v (@$from) {
232 push (@$to, &clone ($v));
233 }
234 return $to;
235 } else {
236 return $from;
237 }
238}
239
240sub set_OIDtype {
241 my $self = shift (@_);
242 my ($type, $metadata) = @_;
243
244 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
245 $self->{'OIDtype'} = $type;
246 } else {
247 $self->{'OIDtype'} = "hash";
248 }
249
250 if ($type =~ /^assigned$/) {
251 if (defined $metadata) {
252 $self->{'OIDmetadata'} = $metadata;
253 } else {
254 $self->{'OIDmetadata'} = "dc.Identifier";
255 }
256 }
257}
258
259# rename_method can be 'url', 'none', 'base64'
260sub set_source_filename {
261 my $self = shift (@_);
262 my ($source_filename, $rename_method) = @_;
263
264 # Since the gsdlsourcefilename element goes into the doc.xml it has
265 # to be utf8. However, it should also *represent* the source filename
266 # (in the import directory) which may not be utf8 at all.
267 # For instance, if this meta element (gsdlsourcefilename) will be used
268 # by other applications that parse doc.xml in order to locate
269 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
270 # encode the real filename as this is a binary-to-text encoding meaning
271 # that the resulting string is ASCII (utf8). Decoding will give the original.
272
273# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
274## print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename\n";
275
276 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
277 $source_filename = $self->encode_filename($source_filename, $rename_method);
278# my ($srcfilename,$dirname,$suffix)
279# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
280# print STDERR "-> $srcfilename -> ";
281# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
282# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
283# print STDERR "$source_filename\n";
284
285## print STDERR "AFTER URL/base64 encoding gsdlsourcefilename: $source_filename\n";
286
287 $self->set_utf8_metadata_element ($self->get_top_section(),
288 "gsdlsourcefilename",
289 $source_filename);
290
291 # If we set the file renaming method, at least GLI can decode gsdlsourcefilename to get the
292 # original filename (relative path to file) back for GLI to notice what ex Meta is associated with it
293 $self->set_utf8_metadata_element($self->get_top_section(),
294 "gsdlsourcefilerenamemethod",
295 $rename_method);
296}
297
298sub encode_filename {
299 my $self = shift (@_);
300 my ($source_filename, $rename_method) = @_;
301
302 my ($srcfilename,$dirname,$suffix)
303 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
304# print STDERR "-> $srcfilename -> ";
305 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
306
307 # encode any subdirs of "import" also, but not import itself
308 my $sep = ($ENV{'GSDLOS'} =~ /^windows$/i) ? "\\" : "/";
309 my @dirs = ($sep eq "\\") ? split(/[\\\/]+/, $dirname) : split(/\//, $dirname);
310
311 my $dirpath = "";
312
313 # Don't encode the first folder ("import" or "tmp"): GLI's DocXMLFile.java looks for literal "import"
314 # or "tmp" before it knows what the file rename method is that is to be used decode the rest of gsdlsourcefilename.
315 if(scalar (@dirs) >= 1) {
316 $dirpath = shift(@dirs);
317 }
318 foreach my $subdir (@dirs) {
319 #print STDERR "@@@@ Found subdir: $subdir\n";
320
321 $subdir = &util::rename_file($subdir, $rename_method);
322 #print STDERR "@@@@ encoded subdir: $subdir\n";
323
324 $dirpath = &FileUtils::filenameConcatenate($dirpath, $subdir);
325 }
326 if($dirpath ne "") {
327 $dirname = $dirpath;
328 }
329
330 $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
331
332 return $source_filename;
333}
334
335sub set_converted_filename {
336 my $self = shift (@_);
337 my ($converted_filename) = @_;
338
339 # we know the converted filename is utf8
340 $self->set_utf8_metadata_element ($self->get_top_section(),
341 "gsdlconvertedfilename",
342 $converted_filename);
343}
344
345# returns the source_filename as it was provided
346sub get_unmodified_source_filename {
347 my $self = shift (@_);
348
349 return $self->{'terse_source_path'};
350}
351
352# returns the source_filename with whatever rename_method was given
353sub get_source_filename {
354 my $self = shift (@_);
355
356 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
357}
358
359
360
361# returns converted filename if available else returns source filename
362sub get_filename_for_hashing {
363 my $self = shift (@_);
364
365 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
366
367 if (!defined $filename) {
368 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
369 # if NULPlug processed file, then don't give a filename
370 if (defined $plugin_name && $plugin_name eq "NULPlug") {
371 $filename = undef;
372 } else { # returns the URL encoded source filename!
373 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
374 }
375 }
376
377 if (!&FileUtils::isFilenameAbsolute($filename)) {
378 $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
379 }
380
381 return $filename;
382}
383
384sub set_doc_type {
385 my $self = shift (@_);
386 my ($doc_type) = @_;
387
388 $self->set_metadata_element ($self->get_top_section(),
389 "gsdldoctype",
390 $doc_type);
391}
392
393# returns the gsdldoctype as it was provided
394# the default of "indexed_doc" is used if no document
395# type was provided
396sub get_doc_type {
397 my $self = shift (@_);
398
399 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
400 return $doc_type if (defined $doc_type);
401 return "indexed_doc";
402}
403
404
405# look up the reference to the a particular section
406sub _lookup_section {
407 my $self = shift (@_);
408 my ($section) = @_;
409
410 my ($num);
411 my $sectionref = $self;
412
413 while (defined $section && $section ne "") {
414
415 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
416
417 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
418
419 $section = "" unless defined $section;
420
421
422 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
423 $sectionref = $sectionref->{'subsections'}->{$num};
424 } else {
425 return undef;
426 }
427 }
428
429 return $sectionref;
430}
431
432# calculate OID by hashing the contents of the document
433sub _calc_OID {
434 my $self = shift (@_);
435 my ($filename) = @_;
436
437
438 my $osexe = &util::get_os_exe();
439
440 my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
441 $ENV{'GSDLOS'},"hashfile$osexe");
442
443 &util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
444 # The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
445
446 # A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
447 # (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
448 # E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
449
450 my $result = "NULL";
451
452
453 if (-e "$hashfile_exe") {
454# $result = `\"$hashfile_exe\" \"$filename\"`;
455# $result = `hashfile$osexe \"$filename\" 2>&1`;
456 $result = `hashfile$osexe \"$filename\"`;
457
458 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
459 } else {
460 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
461 }
462 return "HASH$result";
463}
464
465# methods dealing with OID, not groups of them.
466
467# if $OID is not provided one is calculated
468sub set_OID {
469 my $self = shift (@_);
470 my ($OID) = @_;
471
472 my $use_hash_oid = 0;
473 # if an OID wasn't provided calculate one
474 if (!defined $OID) {
475 $OID = "NULL";
476 if ($self->{'OIDtype'} =~ /^hash/) {
477 $use_hash_oid = 1;
478 } elsif ($self->{'OIDtype'} eq "incremental") {
479 $OID = "D" . $OIDcount;
480 $OIDcount ++;
481 } elsif ($self->{'OIDtype'} eq "filename") {
482 my $filename = $self->get_source_filename();
483 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
484 $OID = &util::tidy_up_oid($OID);
485 } elsif ($self->{'OIDtype'} eq "full_filename") {
486 my $source_filename = $self->get_source_filename();
487 my $dirsep = &util::get_os_dirsep();
488
489 $source_filename =~ s/^import$dirsep//;
490 $source_filename =~ s/$dirsep/-/g;
491 $source_filename =~ s/\./_/g;
492
493 $OID = $source_filename;
494 $OID = &util::tidy_up_oid($OID);
495 } elsif ($self->{'OIDtype'} eq "dirname") {
496 my $filename = $self->get_source_filename();
497 if (defined($filename)) { # && -e $filename) {
498 # get the immediate parent directory
499 $OID = &File::Basename::dirname($filename);
500 if (defined $OID) {
501 $OID = &File::Basename::basename($OID);
502 $OID = &util::tidy_up_oid($OID);
503 } else {
504 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
505 $use_hash_oid = 1;
506 }
507 } else {
508 print STDERR "Failed to find a filename, generating hash id\n";
509 $use_hash_oid = 1;
510 }
511
512 } elsif ($self->{'OIDtype'} eq "assigned") {
513 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
514 if (defined $identifier && $identifier ne "") {
515 $OID = $identifier;
516 $OID = &util::tidy_up_oid($OID);
517 } else {
518 # need a hash id
519 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
520 $use_hash_oid = 1;
521 }
522
523 } else {
524 $use_hash_oid = 1;
525 }
526
527 if ($use_hash_oid) {
528 my $hash_on_file = 1;
529 my $hash_on_ga_xml = 0;
530
531 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
532 $hash_on_file = 0;
533 $hash_on_ga_xml = 1;
534 }
535
536 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
537 $hash_on_file = 0;
538 $hash_on_ga_xml = 0;
539
540 my $source_filename = $self->get_source_filename();
541 my $dirsep = &util::get_os_dirsep();
542
543 $source_filename =~ s/^import$dirsep//;
544 $source_filename =~ s/$dirsep/-/g;
545 $source_filename =~ s/\./_/g;
546
547 # If the filename is very short then (handled naively)
548 # this can cause conjestion in the hash-values
549 # computed, leading documents sharing the same leading
550 # Hex values in the computed has.
551 #
552 # The solution taken here is to replace the name of
553 # the file name a sufficient number of times (up to
554 # the character limit defined in 'rep_limit' and
555 # make that the content that is hashed on
556
557 # *** Think twice before changing the following value
558 # as it will break backward compatability of computed
559 # document HASH values
560
561 my $rep_limit = 256;
562 my $hash_content = undef;
563
564 if (length($source_filename)<$rep_limit) {
565 my $rep_string = "$source_filename|";
566 my $rs_len = length($rep_string);
567
568 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
569
570 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
571 }
572 else {
573 $hash_content = $source_filename;
574 }
575
576 my $filename = &util::get_tmp_filename();
577 if (!open (OUTFILE, ">:utf8", $filename)) {
578 print STDERR "doc::set_OID could not write to $filename\n";
579 } else {
580 print OUTFILE $hash_content;
581 close (OUTFILE);
582 }
583 $OID = $self->_calc_OID ($filename);
584
585 &FileUtils::removeFiles ($filename);
586 }
587
588 if ($hash_on_file) {
589 # "hash" OID - feed file to hashfile.exe
590 my $filename = $self->get_filename_for_hashing();
591
592 # -z: don't want to hash on the file if it is zero size
593 if (defined($filename) && -e $filename && !-z $filename) {
594 $OID = $self->_calc_OID ($filename);
595 } else {
596 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
597 }
598 }
599
600 if ($hash_on_ga_xml) {
601 # In addition being asked to explicity calculate the has based on the GA file,
602 # can also end up coming into this block is doing 'hash_on_file' but the file
603 # itself is of zero bytes (as could be the case with 'doc.nul' file
604
605 my $filename = &util::get_tmp_filename();
606 if (!open (OUTFILE, ">:utf8", $filename)) {
607 print STDERR "doc::set_OID could not write to $filename\n";
608 } else {
609 my $doc_text = &docprint::get_section_xml($self);
610 print OUTFILE $doc_text;
611 close (OUTFILE);
612 }
613 $OID = $self->_calc_OID ($filename);
614 &FileUtils::removeFiles($filename);
615 }
616 }
617 }
618
619 ## print STDERR "**** away to set Identifier OID = $OID\n";
620
621 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
622}
623
624# this uses hashdoc (embedded c thingy) which is faster but still
625# needs a little work to be sufficiently stable
626sub ___set_OID {
627 my $self = shift (@_);
628 my ($OID) = @_;
629
630 # if an OID wasn't provided then calculate hash value based on document
631 if (!defined $OID)
632 {
633 my $hash_text = &docprint::get_section_xml($self);
634 my $hash_len = length($hash_text);
635
636 $OID = &hashdoc::buffer($hash_text,$hash_len);
637 }
638
639 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
640}
641
642# returns the OID for this document
643sub get_OID {
644 my $self = shift (@_);
645 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
646 return $OID if (defined $OID);
647 return "NULL";
648}
649
650sub delete_OID {
651 my $self = shift (@_);
652
653 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
654}
655
656
657# methods for manipulating section names
658
659# returns the name of the top-most section (the top
660# level of the document
661sub get_top_section {
662 my $self = shift (@_);
663
664 return "";
665}
666
667# returns a section
668sub get_parent_section {
669 my $self = shift (@_);
670 my ($section) = @_;
671
672 $section =~ s/(^|\.)\d+$//;
673
674 return $section;
675}
676
677# returns the first child section (or the end child
678# if there isn't any)
679sub get_begin_child {
680 my $self = shift (@_);
681 my ($section) = @_;
682
683 my $section_ptr = $self->_lookup_section($section);
684 return "" unless defined $section_ptr;
685
686 if (defined $section_ptr->{'subsection_order'}->[0]) {
687 return "$section.$section_ptr->{'subsection_order'}->[0]";
688 }
689
690 return $self->get_end_child ($section);
691}
692
693# returns the next child of a parent section
694sub get_next_child {
695 my $self = shift (@_);
696 my ($section) = @_;
697
698 my $parent_section = $self->get_parent_section($section);
699 my $parent_section_ptr = $self->_lookup_section($parent_section);
700 return undef unless defined $parent_section_ptr;
701
702 my ($section_num) = $section =~ /(\d+)$/;
703 return undef unless defined $section_num;
704
705 my $i = 0;
706 my $section_order = $parent_section_ptr->{'subsection_order'};
707 while ($i < scalar(@$section_order)) {
708 last if $section_order->[$i] eq $section_num;
709 $i++;
710 }
711
712 $i++; # the next child
713 if ($i < scalar(@$section_order)) {
714 return $section_order->[$i] if $parent_section eq "";
715 return "$parent_section.$section_order->[$i]";
716 }
717
718 # no more sections in this level
719 return undef;
720}
721
722# returns a reference to a list of children
723sub get_children {
724 my $self = shift (@_);
725 my ($section) = @_;
726
727 my $section_ptr = $self->_lookup_section($section);
728 return [] unless defined $section_ptr;
729
730 my @children = @{$section_ptr->{'subsection_order'}};
731
732 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
733 return \@children;
734}
735
736# returns the child section one past the last one (which
737# is coded as "0")
738sub get_end_child {
739 my $self = shift (@_);
740 my ($section) = @_;
741
742 return $section . ".0" unless $section eq "";
743 return "0";
744}
745
746# returns the next section in book order
747sub get_next_section {
748 my $self = shift (@_);
749 my ($section) = @_;
750
751 return undef unless defined $section;
752
753 my $section_ptr = $self->_lookup_section($section);
754 return undef unless defined $section_ptr;
755
756 # first try to find first child
757 if (defined $section_ptr->{'subsection_order'}->[0]) {
758 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
759 return "$section.$section_ptr->{'subsection_order'}->[0]";
760 }
761
762 do {
763 # try to find sibling
764 my $next_child = $self->get_next_child ($section);
765 return $next_child if (defined $next_child);
766
767 # move up one level
768 $section = $self->get_parent_section ($section);
769 } while $section =~ /\d/;
770
771 return undef;
772}
773
774sub is_leaf_section {
775 my $self = shift (@_);
776 my ($section) = @_;
777
778 my $section_ptr = $self->_lookup_section($section);
779 return 1 unless defined $section_ptr;
780
781 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
782}
783
784# methods for dealing with sections
785
786# returns the name of the inserted section
787sub insert_section {
788 my $self = shift (@_);
789 my ($before_section) = @_;
790
791 # get the child to insert before and its parent section
792 my $parent_section = "";
793 my $before_child = "0";
794 my @before_section = split (/\./, $before_section);
795 if (scalar(@before_section) > 0) {
796 $before_child = pop (@before_section);
797 $parent_section = join (".", @before_section);
798 }
799
800 my $parent_section_ptr = $self->_lookup_section($parent_section);
801 if (!defined $parent_section_ptr) {
802 print STDERR "doc::insert_section couldn't find parent section " .
803 "$parent_section\n";
804 return;
805 }
806
807 # get the next section number
808 my $section_num = $parent_section_ptr->{'next_subsection'}++;
809
810 my $i = 0;
811 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
812 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
813 $i++;
814 }
815
816 # insert the section number into the order list
817 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
818
819 # add this section to the parent section
820 my $section_ptr = {'subsection_order'=>[],
821 'next_subsection'=>1,
822 'subsections'=>{},
823 'metadata'=>[],
824 'text'=>""};
825 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
826
827 # work out the full section number
828 my $section = $parent_section;
829 $section .= "." unless $section eq "";
830 $section .= $section_num;
831
832 return $section;
833}
834
835# creates a pre-named section
836sub create_named_section {
837 my $self = shift (@_);
838 my ($mastersection) = @_;
839
840 my ($num);
841 my $section = $mastersection;
842 my $sectionref = $self;
843
844 while ($section ne "") {
845 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
846 $num =~ s/^0+(\d)/$1/; # remove leading 0s
847 $section = "" unless defined $section;
848
849 if (defined $num) {
850 if (!defined $sectionref->{'subsections'}->{$num}) {
851 push (@{$sectionref->{'subsection_order'}}, $num);
852 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
853 'next_subsection'=>1,
854 'subsections'=>{},
855 'metadata'=>[],
856 'text'=>""};
857 if ($num >= $sectionref->{'next_subsection'}) {
858 $sectionref->{'next_subsection'} = $num + 1;
859 }
860 }
861 $sectionref = $sectionref->{'subsections'}->{$num};
862
863 } else {
864 print STDERR "doc::create_named_section couldn't create section ";
865 print STDERR "$mastersection\n";
866 last;
867 }
868 }
869}
870
871# returns a reference to a list of subsections
872sub list_subsections {
873 my $self = shift (@_);
874 my ($section) = @_;
875
876 my $section_ptr = $self->_lookup_section ($section);
877 if (!defined $section_ptr) {
878 print STDERR "doc::list_subsections couldn't find section $section\n";
879 return [];
880 }
881
882 return [@{$section_ptr->{'subsection_order'}}];
883}
884
885sub delete_section {
886 my $self = shift (@_);
887 my ($section) = @_;
888
889# my $section_ptr = {'subsection_order'=>[],
890# 'next_subsection'=>1,
891# 'subsections'=>{},
892# 'metadata'=>[],
893# 'text'=>""};
894
895 # if this is the top section reset everything
896 if ($section eq "") {
897 $self->{'subsection_order'} = [];
898 $self->{'subsections'} = {};
899 $self->{'metadata'} = [];
900 $self->{'text'} = "";
901 return;
902 }
903
904 # find the parent of the section to delete
905 my $parent_section = "";
906 my $child = "0";
907 my @section = split (/\./, $section);
908 if (scalar(@section) > 0) {
909 $child = pop (@section);
910 $parent_section = join (".", @section);
911 }
912
913 my $parent_section_ptr = $self->_lookup_section($parent_section);
914 if (!defined $parent_section_ptr) {
915 print STDERR "doc::delete_section couldn't find parent section " .
916 "$parent_section\n";
917 return;
918 }
919
920 # remove this section from the subsection_order list
921 my $i = 0;
922 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
923 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
924 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
925 last;
926 }
927 $i++;
928 }
929
930 # remove this section from the subsection hash
931 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
932 undef $parent_section_ptr->{'subsections'}->{$child};
933 }
934}
935
936#--
937# methods for dealing with metadata
938
939# set_metadata_element and get_metadata_element are for metadata
940# which should only have one value. add_meta_data and get_metadata
941# are for metadata which can have more than one value.
942
943# returns the first metadata value which matches field
944
945# This version of get metadata element works much like the one above,
946# except it allows for the namespace portion of a metadata element to
947# be ignored, thus if you are searching for dc.Title, the first piece
948# of matching metadata ending with the name Title (once any namespace
949# is removed) would be returned.
950# 28-11-2003 John Thompson
951sub get_metadata_element {
952 my $self = shift (@_);
953 my ($section, $field, $ignore_namespace) = @_;
954 my ($data);
955
956 $ignore_namespace = 0 unless defined $ignore_namespace;
957
958 my $section_ptr = $self->_lookup_section($section);
959 if (!defined $section_ptr) {
960 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
961 return;
962 }
963
964 # Remove any namespace if we are being told to ignore them
965 if($ignore_namespace) {
966 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
967 }
968
969 foreach $data (@{$section_ptr->{'metadata'}}) {
970
971 my $data_name = $data->[0];
972
973 # Remove any namespace if we are being told to ignore them
974 if($ignore_namespace) {
975 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
976 }
977 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
978 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
979
980 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
981 }
982
983 return undef; # was not found
984}
985
986# returns a list of the form [value1, value2, ...]
987sub get_metadata {
988 my $self = shift (@_);
989 my ($section, $field, $ignore_namespace) = @_;
990 my ($data);
991
992 $ignore_namespace = 0 unless defined $ignore_namespace;
993
994 my $section_ptr = $self->_lookup_section($section);
995 if (!defined $section_ptr) {
996 print STDERR "doc::get_metadata couldn't find section ",
997 $section, "\n";
998 return;
999 }
1000
1001 # Remove any namespace if we are being told to ignore them
1002 if($ignore_namespace) {
1003 $field =~ s/^.*\.//;
1004 }
1005
1006 my @metadata = ();
1007 foreach $data (@{$section_ptr->{'metadata'}}) {
1008
1009 my $data_name = $data->[0];
1010
1011 # Remove any namespace if we are being told to ignore them
1012 if($ignore_namespace) {
1013 $data_name =~ s/^.*\.//;
1014 }
1015 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
1016 $data_name =~ s/^ex\.([^.]+)$/$1/;
1017
1018 push (@metadata, $data->[1]) if ($data_name eq $field);
1019 }
1020
1021 return \@metadata;
1022}
1023
1024sub get_metadata_hashmap {
1025 my $self = shift (@_);
1026 my ($section, $opt_namespace) = @_;
1027
1028 my $section_ptr = $self->_lookup_section($section);
1029 if (!defined $section_ptr) {
1030 print STDERR "doc::get_metadata couldn't find section ",
1031 $section, "\n";
1032 return;
1033 }
1034
1035 my $metadata_hashmap = {};
1036 foreach my $data (@{$section_ptr->{'metadata'}}) {
1037 my $metaname = $data->[0];
1038
1039 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
1040 if (!defined $metadata_hashmap->{$metaname}) {
1041 $metadata_hashmap->{$metaname} = [];
1042 }
1043 my $metaval_list = $metadata_hashmap->{$metaname};
1044 push(@$metaval_list, $data->[1]);
1045 }
1046 }
1047
1048 return $metadata_hashmap;
1049}
1050
1051# returns a list of the form [[field,value],[field,value],...]
1052sub get_all_metadata {
1053 my $self = shift (@_);
1054 my ($section) = @_;
1055
1056 my $section_ptr = $self->_lookup_section($section);
1057 if (!defined $section_ptr) {
1058 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
1059 return;
1060 }
1061
1062 return $section_ptr->{'metadata'};
1063}
1064
1065# Does this section contain the specified field->value pair?
1066sub contains_metadata_value {
1067 my $self = shift (@_);
1068 my ($section, $field, $value, $ignore_namespace) = @_;
1069
1070 $ignore_namespace = 0 unless defined $ignore_namespace;
1071
1072 my $section_ptr = $self->_lookup_section($section);
1073 if (!defined $section_ptr) {
1074 print STDERR "doc::get_metadata couldn't find section ",
1075 $section, "\n";
1076 return 0;
1077 }
1078
1079 # Remove any namespace if we are being told to ignore them
1080 if($ignore_namespace) {
1081 $field =~ s/^.*\.//;
1082 }
1083
1084 my @metadata = ();
1085 foreach my $data (@{$section_ptr->{'metadata'}}) {
1086
1087 my $data_name = $data->[0];
1088
1089 # Remove any namespace if we are being told to ignore them
1090 if($ignore_namespace) {
1091 $data_name =~ s/^.*\.//;
1092 }
1093 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
1094 $data_name =~ s/^ex\.([^.]+)$/$1/;
1095 if ($data_name eq $field && $data->[1] eq $value) {
1096 return 1;
1097 }
1098 }
1099 # if we get here, we haven't found it.
1100 return 0;
1101
1102}
1103
1104# $value is optional
1105sub delete_metadata {
1106 my $self = shift (@_);
1107 my ($section, $field, $value) = @_;
1108
1109 my $section_ptr = $self->_lookup_section($section);
1110 if (!defined $section_ptr) {
1111 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
1112 return;
1113 }
1114
1115 my $i = 0;
1116 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1117 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1118 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1119 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1120 } else {
1121 $i++;
1122 }
1123 }
1124}
1125
1126sub delete_all_metadata {
1127 my $self = shift (@_);
1128 my ($section) = @_;
1129
1130 my $section_ptr = $self->_lookup_section($section);
1131 if (!defined $section_ptr) {
1132 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1133 return;
1134 }
1135
1136 $section_ptr->{'metadata'} = [];
1137}
1138
1139sub set_metadata_element {
1140 my $self = shift (@_);
1141 my ($section, $field, $value) = @_;
1142
1143 $self->set_utf8_metadata_element ($section, $field,
1144 &unicode::ascii2utf8(\$value));
1145}
1146
1147# set_utf8_metadata_element assumes the text has already been
1148# converted to the UTF-8 encoding.
1149sub set_utf8_metadata_element {
1150 my $self = shift (@_);
1151 my ($section, $field, $value) = @_;
1152
1153 $self->delete_metadata ($section, $field);
1154 $self->add_utf8_metadata ($section, $field, $value);
1155}
1156
1157
1158# add_metadata assumes the text is in (extended) ascii form. For
1159# text which has already been converted to the UTF-8 format use
1160# add_utf8_metadata.
1161sub add_metadata {
1162 my $self = shift (@_);
1163 my ($section, $field, $value) = @_;
1164
1165 $self->add_utf8_metadata ($section, $field,
1166 &unicode::ascii2utf8(\$value));
1167}
1168
1169sub add_utf8_metadata {
1170 my $self = shift (@_);
1171 my ($section, $field, $value) = @_;
1172
1173 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1174 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1175 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1176
1177 my $section_ptr = $self->_lookup_section($section);
1178 if (!defined $section_ptr) {
1179 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1180 return;
1181 }
1182 if (!defined $value) {
1183 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1184 return;
1185 }
1186 if (!defined $field) {
1187 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1188 return;
1189 }
1190
1191 #print STDERR "###$field=$value\n";
1192
1193 # For now, supress this check. Given that text data read in is now
1194 # Unicode aware, then the following block of code can (ironically enough)
1195 # cause our unicode compliant string to be re-encoded (leading to
1196 # a double-encoded UTF-8 string, which we definitely don't want!).
1197
1198
1199 # double check that the value is utf-8
1200 # if (!&unicode::check_is_utf8($value)) {
1201 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1202 # &unicode::ensure_utf8(\$value);
1203 # print STDERR " Tried converting to utf8: $value\n";
1204 # }
1205
1206 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1207 if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1208 {
1209 my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1210 if (defined $mdprefix) {
1211 # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1212 push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1213 }
1214
1215 my $direction;
1216 # TODO does the following work, with eq xx.Latitude? the above code implies can be xxxxx.Latitude. should use metaname??
1217 if($value =~ m/^-/)
1218 {
1219 $direction = ($metaname eq "Latitude") ? "S" : "W";
1220 }
1221 else
1222 {
1223 $direction = ($metaname eq "Latitude") ? "N" : "E";
1224 }
1225
1226 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1227 if(defined $beforeDec && defined $afterDec)
1228 {
1229 my $name = ($metaname eq "Latitude") ? "LatShort" : "LngShort";
1230 # only do the following if we haven't already done it
1231 if (!$self->contains_metadata_value($section, $name, $beforeDec . $direction)) {
1232 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1233
1234 for(my $i = 2; $i <= 4; $i++)
1235 {
1236 if(length($afterDec) >= $i)
1237 {
1238 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1239 }
1240 }
1241 }
1242 # what is this???? completely does nothing...
1243 #Only add the metadata if it has not already been added
1244 my $metaMap = $self->get_metadata_hashmap($section); # metaMap not used and called function has no apparent side-effects,
1245 # but this line appears important for ensuring uniqueness of (Latitude, value) meta
1246 # in the section. Also for LatShort, Longitude, LngShort.
1247 }
1248
1249 # if we have both Latitude and Longitude, then add coordinate metadata (not sure why, but someone else added it)
1250 my $latitude;
1251 my $longitude;
1252 if($metaname eq "Longitude") {
1253 # if we are dealing with Longitude meta, and we have already seen Latitude, then add Coordinate metadata
1254 $latitude = $self->get_metadata_element ($section, "Latitude");
1255 if (defined $latitude) {
1256 $longitude = $value;
1257 }
1258 } else {
1259 # we are dealing with Latitude - if we have already seen Longitude, then add both as Coordinate metadata
1260 $longitude = $self->get_metadata_element ($section, "Longitude");
1261 if (defined $longitude) {
1262 $latitude = $value;
1263 }
1264 }
1265 if (defined $latitude && defined $longitude) {
1266 ##rint STDERR "adding coords for $section, $latitude, $longitude\n";
1267 $self->processCoordinate($section, $latitude, $longitude);
1268 } else {
1269 ##rint STDERR "lat or long not defined, not adding coords for $section\n";
1270 }
1271
1272 }
1273
1274 elsif($field eq "GPS.mapOverlay") { # then the $value is a JSON string
1275
1276 # In order to allow searching map-data-enriched documents by map shape descriptions,
1277 # and to run rawquery searches for other docs by proximity based on their map data,
1278 # need to store the shape descriptions and Coordinate info for shapes into the text index.
1279 # We add the description for each shape in the mapoverlay into the text index as GPSMapOverlayLabel
1280 # And we add Coordinate (CD) and CoordShort (CS) info for each shape in the mapoverlay in the format (Lat, Lng) as ("37S339 175E342")
1281 # where the digits before the N/S/E/W direction represents the whole number. And the digits after the direction are the
1282 # decimal places which can vary between 0 and 2 to 4 digits.
1283
1284 # However, we only want to process GPS.mapOverlay during buildcol and only in the text indexing passes (e.g. sidx and didx for lucene)
1285 # and certainly never during the infodb pass of buildcol. The latter can end up duplicating Coordinate/CoordShort/GPSMapOverlayLabel
1286 # when rebuilding with the online doc editor, as that runs incremental-rebuild which then calls basebuilder::reconstruct_doc_objs_metadata()
1287 # on all docs NOT being incrementally rebuilt. That call would get meta from the infodb and use it to reconstruct the doc objects of docs
1288 # NOT being incrementally built. If the Coord and Label meta were written to the infodb, they would then be loaded back in memory when the
1289 # collection is incrementally rebuilt for those docs that don't need incremental processing. Then this function would once again add the
1290 # same meta into the infodb, thus duplicating what goes into the infodb. Hence, don't do all the following if doc::processor_mode =~ "infodb",
1291 # or anything other than a text mode.
1292
1293 # Note that for incremental rebuilding, the text pass can be called textreindex for instance (and infodb pass can be incinfodb).
1294 # So don't check for exact string match
1295
1296 if($doc::cmd_line_mode eq "buildcol" && $doc::processor_mode =~ m/^text/) { # currently known text processor_modes:
1297 # text, textreindex, possibly textdelete (see ArchivesInfPlugin.pm::read() for last 2).
1298 # OR: ..."buildcol" && $doc::processor_mode !~ m/infodb$/) # if dummy pass important
1299
1300 ###print STDERR "GPS.mapOverlay before decoding, val = " . $value . "\n";
1301
1302 # TODO: html decode?
1303 $value =~ s@&#091;@[@g;
1304 $value =~ s@&#093;@]@g;
1305 $value =~ s@&quot;@"@g;
1306 ###print STDERR "GPS.mapOverlay after decoding, val = " . $value . "\n";
1307
1308 my $json_array = decode_json $value;
1309
1310 foreach my $shape (@$json_array) {
1311
1312 # Put each available shape description/label into this section's metadata with GPSMapOverlayLabel as metaname.
1313 # Just as for Coordinate meta, don't need to know which shape a label belongs too. This is just so each label
1314 # will be indexed, and therefore can be searched.
1315
1316 my $description = $shape->{"description"};
1317 if($description) {
1318 push (@{$section_ptr->{'metadata'}}, ["GPSMapOverlayLabel", $description]);
1319 ###print STDERR "@@@@############################################ Just added description meta: " . $description . "\n";
1320 }
1321
1322 my $type = $shape->{"type"};
1323 ###print STDERR "Shape type : " . $type . "\n";
1324
1325 if($type eq "circle") {
1326 ###print STDERR "Found a circle:\n" . &printShape($json, $shape);
1327
1328 # work out bounding box
1329 # want the inverse of this useful page:
1330 # https://stackoverflow.com/questions/639695/how-to-convert-latitude-or-longitude-to-meters
1331 # https://www.geodatasource.com/developers/javascript
1332
1333 # Dr Bainbridge wants us to use: https://gis.stackexchange.com/questions/5821/calculating-latitude-longitude-x-miles-from-point
1334 # Since for N,E,S,W the following works out the same as above link, we're using the rule of thumb in the following:
1335 # https://gis.stackexchange.com/questions/2951/algorithm-for-offsetting-a-latitude-longitude-by-some-amount-of-meters
1336 # which states
1337 # "If your displacements aren't too great (less than a few kilometers) and you're not right at the poles,
1338 # use the quick and dirty estimate that 111,111 meters (111.111 km) in the y direction is 1 degree
1339 # (of latitude) and 111,111 * cos(latitude) meters in the x direction is 1 degree (of longitude)."
1340 my $centre_lat = $shape->{"center"}->{"lat"};
1341 my $centre_lng = $shape->{"center"}->{"lng"};
1342 my $radius = $shape->{"radius"}; # in metres!
1343
1344 ###print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
1345
1346 my $lat_north = $centre_lat + ($radius/111111);
1347 my $lat_south = $centre_lat - ($radius/111111);
1348
1349 ###print STDERR "### lat_north: $lat_north\n";
1350 ###print STDERR "### lat_south: $lat_south\n";
1351
1352 # our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
1353 # all expect their angle to be in radians. So need to convert from degree to radians before we can take the cos of it.
1354 my $centre_lat_radians = $self->degreesToRadians($centre_lat);
1355 my $cos_in_radians = cos($centre_lat_radians);
1356 ###print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
1357 my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
1358 my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
1359 ###print STDERR "### lng_east $lng_east\n";
1360 ###print STDERR "### lng_west $lng_west\n";
1361
1362 my $cos_lat = cos($centre_lat);
1363 ###print STDERR "cos $centre_lat is $cos_lat\n";
1364
1365 $self->processCoordinate($section, $lat_north, $lng_east);
1366 $self->processCoordinate($section, $lat_south, $lng_east);
1367 $self->processCoordinate($section, $lat_south, $lng_west);
1368 $self->processCoordinate($section, $lat_north, $lng_west);
1369
1370 }
1371 elsif ($type eq "marker") {
1372 ###print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
1373 ###print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
1374 $self->processCoordinate($section, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});
1375 }
1376 elsif ($type eq "polyline" || $type eq "polygon") {
1377 my $path_array = $shape->{"path"};
1378 foreach my $position (@$path_array) {
1379 $self->processCoordinate($section, $position->{"lat"}, $position->{"lng"});
1380 }
1381 }
1382 elsif ($type eq "rectangle") {
1383
1384 my $bounds = $shape->{"bounds"};
1385 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"east"});
1386 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"east"});
1387 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"west"});
1388 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"west"});
1389 }
1390
1391 } # end for on each shape in GPS.mapOverlay
1392 } # end if(buildcol and text pass)
1393 } # end GPS.mapOverlay meta
1394
1395 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1396}
1397
1398# https://en.wikipedia.org/wiki/Radian
1399sub degreesToRadians
1400{
1401 my $self = shift (@_);
1402 my ($degrees) = @_;
1403
1404 return $degrees * pi /180; # returns radians
1405}
1406
1407sub radiansToDegrees
1408{
1409 my $self = shift (@_);
1410 my ($radians) = @_;
1411
1412 return $radians * 180 / pi; # returns degrees
1413}
1414
1415# Call as:
1416# my $json = JSON->new->allow_nonref;
1417# &printAllShapes($json, $json_array);
1418sub printAllShapes {
1419 my ($json, $json_array) = @_;
1420
1421
1422 #my $pretty_print_shape = $json->pretty->encode( $json_array->[0] );
1423 foreach my $shape (@$json_array) {
1424 my $pretty_print_shape = $json->pretty->encode( $shape );
1425 print STDERR "Shape: $pretty_print_shape\n";
1426 }
1427
1428}
1429
1430# For the (lat, lng) coordinate given,
1431# attaches Coordinate and multiple CoordShort (different precision level) metadata to the doc object
1432sub processCoordinate {
1433 my $self = shift (@_);
1434 my ($section, $latitude, $longitude) = @_;
1435
1436 my $section_ptr = $self->_lookup_section($section);
1437
1438 my $lat_direction = ($latitude =~ m/^-/) ? "S" : "N";
1439 my $lng_direction = ($longitude =~ m/^-/) ? "W" : "E";
1440
1441 # have to store (lat, lng) in pairs, when there are so many coords to store
1442 #push (@{$section_ptr->{'metadata'}}, ["Latitude", $latitude]);
1443 #push (@{$section_ptr->{'metadata'}}, ["Longitude", $longitude]);
1444
1445 if ($self->contains_metadata_value($section, "Coordinate", "$latitude $longitude")) {
1446 # it looks like we have already processed these values
1447 return;
1448 }
1449 push (@{$section_ptr->{'metadata'}}, ["Coordinate", "$latitude $longitude"]); # "$latitude$lat_direction $longitude$lng_direction"
1450
1451 my ($latBeforeDec, $latAfterDec);
1452 my ($lngBeforeDec, $lngAfterDec);
1453
1454 if($latitude !~ m/\./) {
1455 $latBeforeDec = $latitude;
1456 $latAfterDec = "";
1457 } else {
1458 ($latBeforeDec, $latAfterDec) = ($latitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1459 }
1460 if($longitude !~ m/\./) {
1461 $lngBeforeDec = $longitude;
1462 $lngAfterDec = "";
1463 } else {
1464 ($lngBeforeDec, $lngAfterDec) = ($longitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1465 }
1466
1467
1468 my $name = "CoordShort";
1469 push (@{$section_ptr->{'metadata'}}, [$name, "$latBeforeDec$lat_direction $lngBeforeDec$lng_direction"]);
1470
1471 for(my $i = 2; $i <= 4; $i++)
1472 {
1473 my $latDecPlaces = (length($latAfterDec) >= $i) ? substr($latAfterDec, 0, $i) : "";
1474 my $lngDecPlaces = (length($lngAfterDec) >= $i) ? substr($lngAfterDec, 0, $i) : "";
1475
1476 push (@{$section_ptr->{'metadata'}}, [$name,
1477 $latBeforeDec . $lat_direction. $latDecPlaces . " " . $lngBeforeDec . $lng_direction. $lngDecPlaces]);
1478
1479 }
1480
1481 #Only add the metadata if it has not already been added
1482 #my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no apparent side-effects.)
1483}
1484
1485
1486# methods for dealing with text
1487
1488# returns the text for a section
1489sub get_text {
1490 my $self = shift (@_);
1491 my ($section) = @_;
1492
1493 my $section_ptr = $self->_lookup_section($section);
1494 if (!defined $section_ptr) {
1495 print STDERR "doc::get_text couldn't find section " .
1496 "$section\n";
1497 return "";
1498 }
1499
1500 return $section_ptr->{'text'};
1501}
1502
1503# returns the (utf-8 encoded) length of the text for a section
1504sub get_text_length {
1505 my $self = shift (@_);
1506 my ($section) = @_;
1507
1508 my $section_ptr = $self->_lookup_section($section);
1509 if (!defined $section_ptr) {
1510 print STDERR "doc::get_text_length couldn't find section " .
1511 "$section\n";
1512 return 0;
1513 }
1514
1515 return length ($section_ptr->{'text'});
1516}
1517
1518# returns the total length for all the sections
1519sub get_total_text_length {
1520 my $self = shift (@_);
1521
1522 my $section = $self->get_top_section();
1523 my $length = 0;
1524 while (defined $section) {
1525 $length += $self->get_text_length($section);
1526 $section = $self->get_next_section($section);
1527 }
1528 return $length;
1529}
1530
1531sub delete_text {
1532 my $self = shift (@_);
1533 my ($section) = @_;
1534
1535 my $section_ptr = $self->_lookup_section($section);
1536 if (!defined $section_ptr) {
1537 print STDERR "doc::delete_text couldn't find section " .
1538 "$section\n";
1539 return;
1540 }
1541
1542 $section_ptr->{'text'} = "";
1543}
1544
1545# add_text assumes the text is in (extended) ascii form. For
1546# text which has been already converted to the UTF-8 format
1547# use add_utf8_text.
1548sub add_text {
1549 my $self = shift (@_);
1550 my ($section, $text) = @_;
1551
1552 # convert the text to UTF-8 encoded unicode characters
1553 # and add the text
1554 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1555}
1556
1557
1558# add_utf8_text assumes the text to be added has already
1559# been converted to the UTF-8 encoding. For ascii text use
1560# add_text
1561# Pass by value version (internally calls pass by ref version
1562# to avoid code duplication)
1563sub add_utf8_text {
1564 my $self = shift (@_);
1565 my ($section, $text) = @_;
1566
1567 $self->add_utf8_textref($section, \$text);
1568}
1569
1570# Pass by reference version, used by GreenstoneSQLPlugin for fulltext
1571sub add_utf8_textref {
1572 my $self = shift (@_);
1573 my ($section, $text_ref) = @_;
1574
1575 my $section_ptr = $self->_lookup_section($section);
1576 if (!defined $section_ptr) {
1577 print STDERR "doc::add_utf8_textref couldn't find section " .
1578 "$section\n";
1579 return;
1580 }
1581
1582 $section_ptr->{'text'} .= $$text_ref;
1583}
1584
1585# returns the Source meta, which is the utf8 filename generated.
1586# Added a separate method here for convenience
1587sub get_source {
1588 my $self = shift (@_);
1589 return $self->get_metadata_element ($self->get_top_section(), "Source");
1590}
1591
1592# returns the SourceFile meta, which is the url reference to the URL-encoded
1593# version of Source (the utf8 filename). Added a separate method here for convenience
1594sub get_sourcefile {
1595 my $self = shift (@_);
1596 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1597}
1598
1599# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1600# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1601sub get_assocfile_from_sourcefile {
1602 my $self = shift (@_);
1603
1604 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1605 my $top_section = $self->get_top_section();
1606 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1607
1608 # get the actual filename as it exists on the filesystem which this url refers to
1609 $source_file = &unicode::url_to_filename($source_file);
1610 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1611 return $assocfilename;
1612}
1613
1614# methods for dealing with associated files
1615
1616# a file is associated with a document, NOT a section.
1617# if section is defined it is noted in the data structure
1618# only so that files associated from a particular section
1619# may be removed later (using delete_section_assoc_files)
1620sub associate_file {
1621 my $self = shift (@_);
1622 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1623 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1624
1625 # remove all associated files with the same name
1626 $self->delete_assoc_file ($assoc_filename);
1627
1628 # Too harsh a requirement
1629 # Definitely get HTML docs, for example, with some missing
1630 # support files
1631# if (!&util::fd_exists($real_filename)) {
1632# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1633# exit -1;
1634# }
1635
1636# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1637# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1638## my $utf8_filename = Encode::encode("utf8",$filename);
1639
1640 push (@{$self->{'associated_files'}},
1641 [$real_filename, $assoc_filename, $mime_type, $section]);
1642}
1643
1644# returns a list of associated files in the form
1645# [[real_filename, assoc_filename, mimetype], ...]
1646sub get_assoc_files {
1647 my $self = shift (@_);
1648
1649 return $self->{'associated_files'};
1650}
1651
1652# the following two methods used to keep track of original associated files
1653# for incremental building. eg a txt file used by an item file does not end
1654# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1655# file for incremental build
1656sub associate_source_file {
1657 my $self = shift (@_);
1658 my ($full_filename) = @_;
1659
1660 push (@{$self->{'source_assoc_files'}}, $full_filename);
1661
1662}
1663
1664sub get_source_assoc_files {
1665 my $self = shift (@_);
1666
1667 return $self->{'source_assoc_files'};
1668
1669
1670}
1671sub metadata_file {
1672 my $self = shift (@_);
1673 my ($real_filename, $filename) = @_;
1674
1675 push (@{$self->{'metadata_files'}},
1676 [$real_filename, $filename]);
1677}
1678
1679# used for writing out the archiveinf-doc info database, to list all the metadata files
1680sub get_meta_files {
1681 my $self = shift (@_);
1682
1683 return $self->{'metadata_files'};
1684}
1685
1686sub delete_section_assoc_files {
1687 my $self = shift (@_);
1688 my ($section) = @_;
1689
1690 my $i=0;
1691 while ($i < scalar (@{$self->{'associated_files'}})) {
1692 if (defined $self->{'associated_files'}->[$i]->[3] &&
1693 $self->{'associated_files'}->[$i]->[3] eq $section) {
1694 splice (@{$self->{'associated_files'}}, $i, 1);
1695 } else {
1696 $i++;
1697 }
1698 }
1699}
1700
1701sub delete_assoc_file {
1702 my $self = shift (@_);
1703 my ($assoc_filename) = @_;
1704
1705 my $i=0;
1706 while ($i < scalar (@{$self->{'associated_files'}})) {
1707 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1708 splice (@{$self->{'associated_files'}}, $i, 1);
1709 } else {
1710 $i++;
1711 }
1712 }
1713}
1714
1715sub reset_nextsection_ptr {
1716 my $self = shift (@_);
1717 my ($section) = @_;
1718
1719 my $section_ptr = $self->_lookup_section($section);
1720 $section_ptr->{'next_subsection'} = 1;
1721}
1722
17231;
Note: See TracBrowser for help on using the repository browser.