source: main/trunk/greenstone2/perllib/doc.pm@ 34276

Last change on this file since 34276 was 34276, checked in by kjdon, 4 years ago

modified the code that added Coordinate metadata when we have got both Latitude and Longitude. The previous code assumed you were alsways going to see these in the order Latitude then Longitude. Why? paradice gardens coll has them in the reverse order. so instead of assuming we have latitude when we are processing longitude, now for each one, if we have got the other, then add coords.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 52.1 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use FileUtils;
40use ghtml;
41use File::stat;
42##use hashdoc;
43use docprint;
44use JSON;
45
46# We just need pi from the Trig lib
47# Import constants pi2, pip2, pip4 (2*pi, pi/2, pi/4).
48# But doing
49# use Math::Trig ':pi';
50# resulted in errors on 32 bit linux VM with perl v8 of "bareword pi not allowed while strict subs in use"
51# So, now doing just
52use Math::Trig;
53
54# the document type may be indexed_doc, nonindexed_doc, or
55# classification
56
57our $OIDcount = 0;
58
59
60# Declare the Package (level) Variable that will tell us whether we're in the import or buildcol phase.
61# This var will be set by import.pl, buildcol.pl and export.pl to the then appropriate value.
62# https://perlmaven.com/package-variables-and-lexical-variables-in-perl
63# But: https://stackoverflow.com/questions/7542120/perl-how-to-make-variables-from-requiring-script-available-in-required-script
64# https://stackoverflow.com/questions/33230015/how-do-i-use-a-variable-in-another-perl-script
65#$doc::cmd_line_mode = undef;
66# Decided to follow the usage of $OIDcount above to declare the necessary package-level variables 'globally' accessible
67# actually accessible only for those packages that import this one (with 'use doc')
68my $cmd_line_mode = undef;
69
70# processor_mode keeps track of which buildcol pass we're at: dummy, text (sidx/didx passes) or infodb
71my $processor_mode = undef;
72
73# rename_method can be 'url', 'none', 'base64'
74sub new {
75 my $class = shift (@_);
76 my ($source_filename, $doc_type, $rename_method) = @_;
77
78
79 my $self = bless {'associated_files'=>[],
80 'subsection_order'=>[],
81 'next_subsection'=>1,
82 'subsections'=>{},
83 'metadata'=>[],
84 'text'=>"",
85 'OIDtype'=>"hash"}, $class;
86
87 # used to set lastmodified here, but this can screw up the HASH ids, so
88 # the docsave processor now calls set_lastmodified
89
90 $self->set_source_path($source_filename);
91
92 if (defined $source_filename) {
93 $source_filename = &util::filename_within_collection($source_filename);
94 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
95 $self->set_source_filename ($source_filename, $rename_method);
96 }
97
98 $self->set_doc_type ($doc_type) if defined $doc_type;
99
100 return $self;
101}
102
103
104sub set_source_path
105{
106 my $self = shift @_;
107 my ($source_filename) = @_;
108
109 if (defined $source_filename) {
110 # On Windows the source_filename can be in terse DOS format
111 # e.g. test~1.txt
112
113 $self->{'terse_source_path'} = $source_filename;
114
115 # Use the FileUtil library methods as they are aware of more special
116 # cases such as HDFS [jmt12]
117 if (&FileUtils::fileExists($source_filename))
118 {
119 # See if we can do better for Windows with a filename
120 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
121 require Win32;
122 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
123 }
124 else {
125 # For Unix-based systems, there is no difference between the two
126 $self->{'source_path'} = $source_filename;
127 }
128 }
129 else {
130 print STDERR "Warning: In doc::set_source_path(), file\n";
131 print STDERR " $source_filename\n";
132 print STDERR " does not exist\n";
133
134 # (default) Set it to whatever we were given
135 $self->{'source_path'} = $source_filename;
136 }
137 }
138 else {
139 # Previous code for setting source_path allowed for
140 # it to be undefined, so continue this practice
141 $self->{'terse_source_path'} = undef;
142 $self->{'source_path'} = undef;
143 }
144}
145
146
147sub get_source_path
148{
149 my $self = shift @_;
150
151 return $self->{'terse_source_path'};
152}
153
154# set lastmodified for OAI purposes, added by GRB, moved by kjdon
155sub set_oailastmodified {
156 my $self = shift (@_);
157
158 my $source_path = $self->{'terse_source_path'};
159
160 if (defined $source_path && (-e $source_path)) {
161 my $current_time = time;
162
163 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
164 $wday, $yday, $isdst) = localtime($current_time);
165
166 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
167
168 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
169 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
170 }
171}
172
173# no longer used for OAI purposes, since lastmodified is not what we want as the
174# Datestamp of a document. This doc metadata may be useful for general purposes.
175sub set_lastmodified {
176 my $self = shift (@_);
177
178 my $source_path = $self->{'terse_source_path'};
179
180 if (defined $source_path && (-e $source_path)) {
181
182 my $file_stat = stat($source_path);
183 my $mtime = $file_stat->mtime;
184 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
185 $wday, $yday, $isdst) = localtime($mtime);
186
187 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
188
189 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
190 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
191 }
192}
193
194# clone the $self object
195sub duplicate {
196 my $self = shift (@_);
197
198 my $newobj = {};
199
200 foreach my $k (keys %$self) {
201 $newobj->{$k} = &clone ($self->{$k});
202 }
203
204 bless $newobj, ref($self);
205 return $newobj;
206}
207
208sub clone {
209 my ($from) = @_;
210 my $type = ref ($from);
211
212 if ($type eq "HASH") {
213 my $to = {};
214 foreach my $key (keys %$from) {
215 $to->{$key} = &clone ($from->{$key});
216 }
217 return $to;
218 } elsif ($type eq "ARRAY") {
219 my $to = [];
220 foreach my $v (@$from) {
221 push (@$to, &clone ($v));
222 }
223 return $to;
224 } else {
225 return $from;
226 }
227}
228
229sub set_OIDtype {
230 my $self = shift (@_);
231 my ($type, $metadata) = @_;
232
233 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
234 $self->{'OIDtype'} = $type;
235 } else {
236 $self->{'OIDtype'} = "hash";
237 }
238
239 if ($type =~ /^assigned$/) {
240 if (defined $metadata) {
241 $self->{'OIDmetadata'} = $metadata;
242 } else {
243 $self->{'OIDmetadata'} = "dc.Identifier";
244 }
245 }
246}
247
248# rename_method can be 'url', 'none', 'base64'
249sub set_source_filename {
250 my $self = shift (@_);
251 my ($source_filename, $rename_method) = @_;
252
253 # Since the gsdlsourcefilename element goes into the doc.xml it has
254 # to be utf8. However, it should also *represent* the source filename
255 # (in the import directory) which may not be utf8 at all.
256 # For instance, if this meta element (gsdlsourcefilename) will be used
257 # by other applications that parse doc.xml in order to locate
258 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
259 # encode the real filename as this is a binary-to-text encoding meaning
260 # that the resulting string is ASCII (utf8). Decoding will give the original.
261
262# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
263## print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename\n";
264
265 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
266 $source_filename = $self->encode_filename($source_filename, $rename_method);
267# my ($srcfilename,$dirname,$suffix)
268# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
269# print STDERR "-> $srcfilename -> ";
270# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
271# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
272# print STDERR "$source_filename\n";
273
274## print STDERR "AFTER URL/base64 encoding gsdlsourcefilename: $source_filename\n";
275
276 $self->set_utf8_metadata_element ($self->get_top_section(),
277 "gsdlsourcefilename",
278 $source_filename);
279
280 # If we set the file renaming method, at least GLI can decode gsdlsourcefilename to get the
281 # original filename (relative path to file) back for GLI to notice what ex Meta is associated with it
282 $self->set_utf8_metadata_element($self->get_top_section(),
283 "gsdlsourcefilerenamemethod",
284 $rename_method);
285}
286
287sub encode_filename {
288 my $self = shift (@_);
289 my ($source_filename, $rename_method) = @_;
290
291 my ($srcfilename,$dirname,$suffix)
292 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
293# print STDERR "-> $srcfilename -> ";
294 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
295
296 # encode any subdirs of "import" also, but not import itself
297 my $sep = ($ENV{'GSDLOS'} =~ /^windows$/i) ? "\\" : "/";
298 my @dirs = ($sep eq "\\") ? split(/[\\\/]+/, $dirname) : split(/\//, $dirname);
299
300 my $dirpath = "";
301
302 # Don't encode the first folder ("import" or "tmp"): GLI's DocXMLFile.java looks for literal "import"
303 # or "tmp" before it knows what the file rename method is that is to be used decode the rest of gsdlsourcefilename.
304 if(scalar (@dirs) >= 1) {
305 $dirpath = shift(@dirs);
306 }
307 foreach my $subdir (@dirs) {
308 #print STDERR "@@@@ Found subdir: $subdir\n";
309
310 $subdir = &util::rename_file($subdir, $rename_method);
311 #print STDERR "@@@@ encoded subdir: $subdir\n";
312
313 $dirpath = &FileUtils::filenameConcatenate($dirpath, $subdir);
314 }
315 if($dirpath ne "") {
316 $dirname = $dirpath;
317 }
318
319 $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
320
321 return $source_filename;
322}
323
324sub set_converted_filename {
325 my $self = shift (@_);
326 my ($converted_filename) = @_;
327
328 # we know the converted filename is utf8
329 $self->set_utf8_metadata_element ($self->get_top_section(),
330 "gsdlconvertedfilename",
331 $converted_filename);
332}
333
334# returns the source_filename as it was provided
335sub get_unmodified_source_filename {
336 my $self = shift (@_);
337
338 return $self->{'terse_source_path'};
339}
340
341# returns the source_filename with whatever rename_method was given
342sub get_source_filename {
343 my $self = shift (@_);
344
345 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
346}
347
348
349
350# returns converted filename if available else returns source filename
351sub get_filename_for_hashing {
352 my $self = shift (@_);
353
354 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
355
356 if (!defined $filename) {
357 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
358 # if NULPlug processed file, then don't give a filename
359 if (defined $plugin_name && $plugin_name eq "NULPlug") {
360 $filename = undef;
361 } else { # returns the URL encoded source filename!
362 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
363 }
364 }
365
366 if (!&FileUtils::isFilenameAbsolute($filename)) {
367 $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
368 }
369
370 return $filename;
371}
372
373sub set_doc_type {
374 my $self = shift (@_);
375 my ($doc_type) = @_;
376
377 $self->set_metadata_element ($self->get_top_section(),
378 "gsdldoctype",
379 $doc_type);
380}
381
382# returns the gsdldoctype as it was provided
383# the default of "indexed_doc" is used if no document
384# type was provided
385sub get_doc_type {
386 my $self = shift (@_);
387
388 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
389 return $doc_type if (defined $doc_type);
390 return "indexed_doc";
391}
392
393
394# look up the reference to the a particular section
395sub _lookup_section {
396 my $self = shift (@_);
397 my ($section) = @_;
398
399 my ($num);
400 my $sectionref = $self;
401
402 while (defined $section && $section ne "") {
403
404 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
405
406 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
407
408 $section = "" unless defined $section;
409
410
411 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
412 $sectionref = $sectionref->{'subsections'}->{$num};
413 } else {
414 return undef;
415 }
416 }
417
418 return $sectionref;
419}
420
421# calculate OID by hashing the contents of the document
422sub _calc_OID {
423 my $self = shift (@_);
424 my ($filename) = @_;
425
426
427 my $osexe = &util::get_os_exe();
428
429 my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
430 $ENV{'GSDLOS'},"hashfile$osexe");
431
432 &util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
433 # The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
434
435 # A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
436 # (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
437 # E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
438
439 my $result = "NULL";
440
441
442 if (-e "$hashfile_exe") {
443# $result = `\"$hashfile_exe\" \"$filename\"`;
444# $result = `hashfile$osexe \"$filename\" 2>&1`;
445 $result = `hashfile$osexe \"$filename\"`;
446
447 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
448 } else {
449 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
450 }
451 return "HASH$result";
452}
453
454# methods dealing with OID, not groups of them.
455
456# if $OID is not provided one is calculated
457sub set_OID {
458 my $self = shift (@_);
459 my ($OID) = @_;
460
461 my $use_hash_oid = 0;
462 # if an OID wasn't provided calculate one
463 if (!defined $OID) {
464 $OID = "NULL";
465 if ($self->{'OIDtype'} =~ /^hash/) {
466 $use_hash_oid = 1;
467 } elsif ($self->{'OIDtype'} eq "incremental") {
468 $OID = "D" . $OIDcount;
469 $OIDcount ++;
470 } elsif ($self->{'OIDtype'} eq "filename") {
471 my $filename = $self->get_source_filename();
472 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
473 $OID = &util::tidy_up_oid($OID);
474 } elsif ($self->{'OIDtype'} eq "full_filename") {
475 my $source_filename = $self->get_source_filename();
476 my $dirsep = &util::get_os_dirsep();
477
478 $source_filename =~ s/^import$dirsep//;
479 $source_filename =~ s/$dirsep/-/g;
480 $source_filename =~ s/\./_/g;
481
482 $OID = $source_filename;
483 $OID = &util::tidy_up_oid($OID);
484 } elsif ($self->{'OIDtype'} eq "dirname") {
485 my $filename = $self->get_source_filename();
486 if (defined($filename)) { # && -e $filename) {
487 # get the immediate parent directory
488 $OID = &File::Basename::dirname($filename);
489 if (defined $OID) {
490 $OID = &File::Basename::basename($OID);
491 $OID = &util::tidy_up_oid($OID);
492 } else {
493 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
494 $use_hash_oid = 1;
495 }
496 } else {
497 print STDERR "Failed to find a filename, generating hash id\n";
498 $use_hash_oid = 1;
499 }
500
501 } elsif ($self->{'OIDtype'} eq "assigned") {
502 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
503 if (defined $identifier && $identifier ne "") {
504 $OID = $identifier;
505 $OID = &util::tidy_up_oid($OID);
506 } else {
507 # need a hash id
508 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
509 $use_hash_oid = 1;
510 }
511
512 } else {
513 $use_hash_oid = 1;
514 }
515
516 if ($use_hash_oid) {
517 my $hash_on_file = 1;
518 my $hash_on_ga_xml = 0;
519
520 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
521 $hash_on_file = 0;
522 $hash_on_ga_xml = 1;
523 }
524
525 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
526 $hash_on_file = 0;
527 $hash_on_ga_xml = 0;
528
529 my $source_filename = $self->get_source_filename();
530 my $dirsep = &util::get_os_dirsep();
531
532 $source_filename =~ s/^import$dirsep//;
533 $source_filename =~ s/$dirsep/-/g;
534 $source_filename =~ s/\./_/g;
535
536 # If the filename is very short then (handled naively)
537 # this can cause conjestion in the hash-values
538 # computed, leading documents sharing the same leading
539 # Hex values in the computed has.
540 #
541 # The solution taken here is to replace the name of
542 # the file name a sufficient number of times (up to
543 # the character limit defined in 'rep_limit' and
544 # make that the content that is hashed on
545
546 # *** Think twice before changing the following value
547 # as it will break backward compatability of computed
548 # document HASH values
549
550 my $rep_limit = 256;
551 my $hash_content = undef;
552
553 if (length($source_filename)<$rep_limit) {
554 my $rep_string = "$source_filename|";
555 my $rs_len = length($rep_string);
556
557 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
558
559 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
560 }
561 else {
562 $hash_content = $source_filename;
563 }
564
565 my $filename = &util::get_tmp_filename();
566 if (!open (OUTFILE, ">:utf8", $filename)) {
567 print STDERR "doc::set_OID could not write to $filename\n";
568 } else {
569 print OUTFILE $hash_content;
570 close (OUTFILE);
571 }
572 $OID = $self->_calc_OID ($filename);
573
574 &FileUtils::removeFiles ($filename);
575 }
576
577 if ($hash_on_file) {
578 # "hash" OID - feed file to hashfile.exe
579 my $filename = $self->get_filename_for_hashing();
580
581 # -z: don't want to hash on the file if it is zero size
582 if (defined($filename) && -e $filename && !-z $filename) {
583 $OID = $self->_calc_OID ($filename);
584 } else {
585 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
586 }
587 }
588
589 if ($hash_on_ga_xml) {
590 # In addition being asked to explicity calculate the has based on the GA file,
591 # can also end up coming into this block is doing 'hash_on_file' but the file
592 # itself is of zero bytes (as could be the case with 'doc.nul' file
593
594 my $filename = &util::get_tmp_filename();
595 if (!open (OUTFILE, ">:utf8", $filename)) {
596 print STDERR "doc::set_OID could not write to $filename\n";
597 } else {
598 my $doc_text = &docprint::get_section_xml($self);
599 print OUTFILE $doc_text;
600 close (OUTFILE);
601 }
602 $OID = $self->_calc_OID ($filename);
603 &FileUtils::removeFiles($filename);
604 }
605 }
606 }
607 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
608}
609
610# this uses hashdoc (embedded c thingy) which is faster but still
611# needs a little work to be sufficiently stable
612sub ___set_OID {
613 my $self = shift (@_);
614 my ($OID) = @_;
615
616 # if an OID wasn't provided then calculate hash value based on document
617 if (!defined $OID)
618 {
619 my $hash_text = &docprint::get_section_xml($self);
620 my $hash_len = length($hash_text);
621
622 $OID = &hashdoc::buffer($hash_text,$hash_len);
623 }
624
625 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
626}
627
628# returns the OID for this document
629sub get_OID {
630 my $self = shift (@_);
631 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
632 return $OID if (defined $OID);
633 return "NULL";
634}
635
636sub delete_OID {
637 my $self = shift (@_);
638
639 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
640}
641
642
643# methods for manipulating section names
644
645# returns the name of the top-most section (the top
646# level of the document
647sub get_top_section {
648 my $self = shift (@_);
649
650 return "";
651}
652
653# returns a section
654sub get_parent_section {
655 my $self = shift (@_);
656 my ($section) = @_;
657
658 $section =~ s/(^|\.)\d+$//;
659
660 return $section;
661}
662
663# returns the first child section (or the end child
664# if there isn't any)
665sub get_begin_child {
666 my $self = shift (@_);
667 my ($section) = @_;
668
669 my $section_ptr = $self->_lookup_section($section);
670 return "" unless defined $section_ptr;
671
672 if (defined $section_ptr->{'subsection_order'}->[0]) {
673 return "$section.$section_ptr->{'subsection_order'}->[0]";
674 }
675
676 return $self->get_end_child ($section);
677}
678
679# returns the next child of a parent section
680sub get_next_child {
681 my $self = shift (@_);
682 my ($section) = @_;
683
684 my $parent_section = $self->get_parent_section($section);
685 my $parent_section_ptr = $self->_lookup_section($parent_section);
686 return undef unless defined $parent_section_ptr;
687
688 my ($section_num) = $section =~ /(\d+)$/;
689 return undef unless defined $section_num;
690
691 my $i = 0;
692 my $section_order = $parent_section_ptr->{'subsection_order'};
693 while ($i < scalar(@$section_order)) {
694 last if $section_order->[$i] eq $section_num;
695 $i++;
696 }
697
698 $i++; # the next child
699 if ($i < scalar(@$section_order)) {
700 return $section_order->[$i] if $parent_section eq "";
701 return "$parent_section.$section_order->[$i]";
702 }
703
704 # no more sections in this level
705 return undef;
706}
707
708# returns a reference to a list of children
709sub get_children {
710 my $self = shift (@_);
711 my ($section) = @_;
712
713 my $section_ptr = $self->_lookup_section($section);
714 return [] unless defined $section_ptr;
715
716 my @children = @{$section_ptr->{'subsection_order'}};
717
718 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
719 return \@children;
720}
721
722# returns the child section one past the last one (which
723# is coded as "0")
724sub get_end_child {
725 my $self = shift (@_);
726 my ($section) = @_;
727
728 return $section . ".0" unless $section eq "";
729 return "0";
730}
731
732# returns the next section in book order
733sub get_next_section {
734 my $self = shift (@_);
735 my ($section) = @_;
736
737 return undef unless defined $section;
738
739 my $section_ptr = $self->_lookup_section($section);
740 return undef unless defined $section_ptr;
741
742 # first try to find first child
743 if (defined $section_ptr->{'subsection_order'}->[0]) {
744 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
745 return "$section.$section_ptr->{'subsection_order'}->[0]";
746 }
747
748 do {
749 # try to find sibling
750 my $next_child = $self->get_next_child ($section);
751 return $next_child if (defined $next_child);
752
753 # move up one level
754 $section = $self->get_parent_section ($section);
755 } while $section =~ /\d/;
756
757 return undef;
758}
759
760sub is_leaf_section {
761 my $self = shift (@_);
762 my ($section) = @_;
763
764 my $section_ptr = $self->_lookup_section($section);
765 return 1 unless defined $section_ptr;
766
767 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
768}
769
770# methods for dealing with sections
771
772# returns the name of the inserted section
773sub insert_section {
774 my $self = shift (@_);
775 my ($before_section) = @_;
776
777 # get the child to insert before and its parent section
778 my $parent_section = "";
779 my $before_child = "0";
780 my @before_section = split (/\./, $before_section);
781 if (scalar(@before_section) > 0) {
782 $before_child = pop (@before_section);
783 $parent_section = join (".", @before_section);
784 }
785
786 my $parent_section_ptr = $self->_lookup_section($parent_section);
787 if (!defined $parent_section_ptr) {
788 print STDERR "doc::insert_section couldn't find parent section " .
789 "$parent_section\n";
790 return;
791 }
792
793 # get the next section number
794 my $section_num = $parent_section_ptr->{'next_subsection'}++;
795
796 my $i = 0;
797 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
798 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
799 $i++;
800 }
801
802 # insert the section number into the order list
803 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
804
805 # add this section to the parent section
806 my $section_ptr = {'subsection_order'=>[],
807 'next_subsection'=>1,
808 'subsections'=>{},
809 'metadata'=>[],
810 'text'=>""};
811 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
812
813 # work out the full section number
814 my $section = $parent_section;
815 $section .= "." unless $section eq "";
816 $section .= $section_num;
817
818 return $section;
819}
820
821# creates a pre-named section
822sub create_named_section {
823 my $self = shift (@_);
824 my ($mastersection) = @_;
825
826 my ($num);
827 my $section = $mastersection;
828 my $sectionref = $self;
829
830 while ($section ne "") {
831 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
832 $num =~ s/^0+(\d)/$1/; # remove leading 0s
833 $section = "" unless defined $section;
834
835 if (defined $num) {
836 if (!defined $sectionref->{'subsections'}->{$num}) {
837 push (@{$sectionref->{'subsection_order'}}, $num);
838 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
839 'next_subsection'=>1,
840 'subsections'=>{},
841 'metadata'=>[],
842 'text'=>""};
843 if ($num >= $sectionref->{'next_subsection'}) {
844 $sectionref->{'next_subsection'} = $num + 1;
845 }
846 }
847 $sectionref = $sectionref->{'subsections'}->{$num};
848
849 } else {
850 print STDERR "doc::create_named_section couldn't create section ";
851 print STDERR "$mastersection\n";
852 last;
853 }
854 }
855}
856
857# returns a reference to a list of subsections
858sub list_subsections {
859 my $self = shift (@_);
860 my ($section) = @_;
861
862 my $section_ptr = $self->_lookup_section ($section);
863 if (!defined $section_ptr) {
864 print STDERR "doc::list_subsections couldn't find section $section\n";
865 return [];
866 }
867
868 return [@{$section_ptr->{'subsection_order'}}];
869}
870
871sub delete_section {
872 my $self = shift (@_);
873 my ($section) = @_;
874
875# my $section_ptr = {'subsection_order'=>[],
876# 'next_subsection'=>1,
877# 'subsections'=>{},
878# 'metadata'=>[],
879# 'text'=>""};
880
881 # if this is the top section reset everything
882 if ($section eq "") {
883 $self->{'subsection_order'} = [];
884 $self->{'subsections'} = {};
885 $self->{'metadata'} = [];
886 $self->{'text'} = "";
887 return;
888 }
889
890 # find the parent of the section to delete
891 my $parent_section = "";
892 my $child = "0";
893 my @section = split (/\./, $section);
894 if (scalar(@section) > 0) {
895 $child = pop (@section);
896 $parent_section = join (".", @section);
897 }
898
899 my $parent_section_ptr = $self->_lookup_section($parent_section);
900 if (!defined $parent_section_ptr) {
901 print STDERR "doc::delete_section couldn't find parent section " .
902 "$parent_section\n";
903 return;
904 }
905
906 # remove this section from the subsection_order list
907 my $i = 0;
908 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
909 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
910 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
911 last;
912 }
913 $i++;
914 }
915
916 # remove this section from the subsection hash
917 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
918 undef $parent_section_ptr->{'subsections'}->{$child};
919 }
920}
921
922#--
923# methods for dealing with metadata
924
925# set_metadata_element and get_metadata_element are for metadata
926# which should only have one value. add_meta_data and get_metadata
927# are for metadata which can have more than one value.
928
929# returns the first metadata value which matches field
930
931# This version of get metadata element works much like the one above,
932# except it allows for the namespace portion of a metadata element to
933# be ignored, thus if you are searching for dc.Title, the first piece
934# of matching metadata ending with the name Title (once any namespace
935# is removed) would be returned.
936# 28-11-2003 John Thompson
937sub get_metadata_element {
938 my $self = shift (@_);
939 my ($section, $field, $ignore_namespace) = @_;
940 my ($data);
941
942 $ignore_namespace = 0 unless defined $ignore_namespace;
943
944 my $section_ptr = $self->_lookup_section($section);
945 if (!defined $section_ptr) {
946 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
947 return;
948 }
949
950 # Remove any namespace if we are being told to ignore them
951 if($ignore_namespace) {
952 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
953 }
954
955 foreach $data (@{$section_ptr->{'metadata'}}) {
956
957 my $data_name = $data->[0];
958
959 # Remove any namespace if we are being told to ignore them
960 if($ignore_namespace) {
961 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
962 }
963 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
964 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
965
966 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
967 }
968
969 return undef; # was not found
970}
971
972# returns a list of the form [value1, value2, ...]
973sub get_metadata {
974 my $self = shift (@_);
975 my ($section, $field, $ignore_namespace) = @_;
976 my ($data);
977
978 $ignore_namespace = 0 unless defined $ignore_namespace;
979
980 my $section_ptr = $self->_lookup_section($section);
981 if (!defined $section_ptr) {
982 print STDERR "doc::get_metadata couldn't find section ",
983 $section, "\n";
984 return;
985 }
986
987 # Remove any namespace if we are being told to ignore them
988 if($ignore_namespace) {
989 $field =~ s/^.*\.//;
990 }
991
992 my @metadata = ();
993 foreach $data (@{$section_ptr->{'metadata'}}) {
994
995 my $data_name = $data->[0];
996
997 # Remove any namespace if we are being told to ignore them
998 if($ignore_namespace) {
999 $data_name =~ s/^.*\.//;
1000 }
1001 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
1002 $data_name =~ s/^ex\.([^.]+)$/$1/;
1003
1004 push (@metadata, $data->[1]) if ($data_name eq $field);
1005 }
1006
1007 return \@metadata;
1008}
1009
1010sub get_metadata_hashmap {
1011 my $self = shift (@_);
1012 my ($section, $opt_namespace) = @_;
1013
1014 my $section_ptr = $self->_lookup_section($section);
1015 if (!defined $section_ptr) {
1016 print STDERR "doc::get_metadata couldn't find section ",
1017 $section, "\n";
1018 return;
1019 }
1020
1021 my $metadata_hashmap = {};
1022 foreach my $data (@{$section_ptr->{'metadata'}}) {
1023 my $metaname = $data->[0];
1024
1025 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
1026 if (!defined $metadata_hashmap->{$metaname}) {
1027 $metadata_hashmap->{$metaname} = [];
1028 }
1029 my $metaval_list = $metadata_hashmap->{$metaname};
1030 push(@$metaval_list, $data->[1]);
1031 }
1032 }
1033
1034 return $metadata_hashmap;
1035}
1036
1037# returns a list of the form [[field,value],[field,value],...]
1038sub get_all_metadata {
1039 my $self = shift (@_);
1040 my ($section) = @_;
1041
1042 my $section_ptr = $self->_lookup_section($section);
1043 if (!defined $section_ptr) {
1044 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
1045 return;
1046 }
1047
1048 return $section_ptr->{'metadata'};
1049}
1050
1051# $value is optional
1052sub delete_metadata {
1053 my $self = shift (@_);
1054 my ($section, $field, $value) = @_;
1055
1056 my $section_ptr = $self->_lookup_section($section);
1057 if (!defined $section_ptr) {
1058 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
1059 return;
1060 }
1061
1062 my $i = 0;
1063 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1064 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1065 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1066 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1067 } else {
1068 $i++;
1069 }
1070 }
1071}
1072
1073sub delete_all_metadata {
1074 my $self = shift (@_);
1075 my ($section) = @_;
1076
1077 my $section_ptr = $self->_lookup_section($section);
1078 if (!defined $section_ptr) {
1079 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1080 return;
1081 }
1082
1083 $section_ptr->{'metadata'} = [];
1084}
1085
1086sub set_metadata_element {
1087 my $self = shift (@_);
1088 my ($section, $field, $value) = @_;
1089
1090 $self->set_utf8_metadata_element ($section, $field,
1091 &unicode::ascii2utf8(\$value));
1092}
1093
1094# set_utf8_metadata_element assumes the text has already been
1095# converted to the UTF-8 encoding.
1096sub set_utf8_metadata_element {
1097 my $self = shift (@_);
1098 my ($section, $field, $value) = @_;
1099
1100 $self->delete_metadata ($section, $field);
1101 $self->add_utf8_metadata ($section, $field, $value);
1102}
1103
1104
1105# add_metadata assumes the text is in (extended) ascii form. For
1106# text which has already been converted to the UTF-8 format use
1107# add_utf8_metadata.
1108sub add_metadata {
1109 my $self = shift (@_);
1110 my ($section, $field, $value) = @_;
1111
1112 $self->add_utf8_metadata ($section, $field,
1113 &unicode::ascii2utf8(\$value));
1114}
1115
1116sub add_utf8_metadata {
1117 my $self = shift (@_);
1118 my ($section, $field, $value) = @_;
1119
1120 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1121 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1122 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1123
1124 my $section_ptr = $self->_lookup_section($section);
1125 if (!defined $section_ptr) {
1126 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1127 return;
1128 }
1129 if (!defined $value) {
1130 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1131 return;
1132 }
1133 if (!defined $field) {
1134 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1135 return;
1136 }
1137
1138 #print STDERR "###$field=$value\n";
1139
1140 # For now, supress this check. Given that text data read in is now
1141 # Unicode aware, then the following block of code can (ironically enough)
1142 # cause our unicode compliant string to be re-encoded (leading to
1143 # a double-encoded UTF-8 string, which we definitely don't want!).
1144
1145
1146 # double check that the value is utf-8
1147 # if (!&unicode::check_is_utf8($value)) {
1148 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1149 # &unicode::ensure_utf8(\$value);
1150 # print STDERR " Tried converting to utf8: $value\n";
1151 # }
1152
1153 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1154 if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1155 {
1156 my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1157 if (defined $mdprefix) {
1158 # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1159 push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1160 }
1161
1162 my $direction;
1163 # TODO does the following work, with eq latitude? the above code implies can be xxxxx.Latitude. should use metaname??
1164 if($value =~ m/^-/)
1165 {
1166 $direction = ($field eq "Latitude") ? "S" : "W";
1167 }
1168 else
1169 {
1170 $direction = ($field eq "Latitude") ? "N" : "E";
1171 }
1172
1173 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1174 if(defined $beforeDec && defined $afterDec)
1175 {
1176 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1177 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1178
1179 for(my $i = 2; $i <= 4; $i++)
1180 {
1181 if(length($afterDec) >= $i)
1182 {
1183 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1184 }
1185 }
1186
1187 # what is this???? completely does nothing...
1188 #Only add the metadata if it has not already been added
1189 my $metaMap = $self->get_metadata_hashmap($section); # metaMap not used and called function has no apparent side-effects,
1190 # but this line appears important for ensuring uniqueness of (Latitude, value) meta
1191 # in the section. Also for LatShort, Longitude, LngShort.
1192 }
1193
1194 # if we have both Latitude and Longitude, then add coordinate metadata (not sure why, but someone else added it)
1195 my $latitude;
1196 my $longitude;
1197 if($field =~ m/^(.+\.)?Longitude$/) {
1198 # if we are dealing with Longitude meta, and we have already seen Latitude, then add Coordinate metadata
1199 $latitude = $self->get_metadata_element ($section, "Latitude");
1200 if (defined $latitude) {
1201 $longitude = $value;
1202 }
1203 } else {
1204 # we are dealing with Latitude - if we have already seen Longitude, then add both as Coordinate metadata
1205 $longitude = $self->get_metadata_element ($section, "Longitude");
1206 if (defined $longitude) {
1207 $latitude = $value;
1208 }
1209 }
1210 if (defined $latitude && defined $longitude) {
1211 ##rint STDERR "adding coords for $section, $latitude, $longitude\n";
1212 $self->processCoordinate($section, $latitude, $longitude);
1213 } else {
1214 ##rint STDERR "lat or long not defined, not adding coords for $section\n";
1215 }
1216
1217 }
1218
1219 elsif($field eq "GPS.mapOverlay") { # then the $value is a JSON string
1220
1221 # In order to allow searching map-data-enriched documents by map shape descriptions,
1222 # and to run rawquery searches for other docs by proximity based on their map data,
1223 # need to store the shape descriptions and Coordinate info for shapes into the text index.
1224 # We add the description for each shape in the mapoverlay into the text index as GPSMapOverlayLabel
1225 # And we add Coordinate (CD) and CoordShort (CS) info for each shape in the mapoverlay in the format (Lat, Lng) as ("37S339 175E342")
1226 # where the digits before the N/S/E/W direction represents the whole number. And the digits after the direction are the
1227 # decimal places which can vary between 0 and 2 to 4 digits.
1228
1229 # However, we only want to process GPS.mapOverlay during buildcol and only in the text indexing passes (e.g. sidx and didx for lucene)
1230 # and certainly never during the infodb pass of buildcol. The latter can end up duplicating Coordinate/CoordShort/GPSMapOverlayLabel
1231 # when rebuilding with the online doc editor, as that runs incremental-rebuild which then calls basebuilder::reconstruct_doc_objs_metadata()
1232 # on all docs NOT being incrementally rebuilt. That call would get meta from the infodb and use it to reconstruct the doc objects of docs
1233 # NOT being incrementally built. If the Coord and Label meta were written to the infodb, they would then be loaded back in memory when the
1234 # collection is incrementally rebuilt for those docs that don't need incremental processing. Then this function would once again add the
1235 # same meta into the infodb, thus duplicating what goes into the infodb. Hence, don't do all the following if doc::processor_mode =~ "infodb",
1236 # or anything other than a text mode.
1237
1238 # Note that for incremental rebuilding, the text pass can be called textreindex for instance (and infodb pass can be incinfodb).
1239 # So don't check for exact string match
1240
1241 if($doc::cmd_line_mode eq "buildcol" && $doc::processor_mode =~ m/^text/) { # currently known text processor_modes:
1242 # text, textreindex, possibly textdelete (see ArchivesInfPlugin.pm::read() for last 2).
1243 # OR: ..."buildcol" && $doc::processor_mode !~ m/infodb$/) # if dummy pass important
1244
1245 ###print STDERR "GPS.mapOverlay before decoding, val = " . $value . "\n";
1246
1247 # TODO: html decode?
1248 $value =~ s@&#091;@[@g;
1249 $value =~ s@&#093;@]@g;
1250 $value =~ s@&quot;@"@g;
1251 ###print STDERR "GPS.mapOverlay after decoding, val = " . $value . "\n";
1252
1253 my $json_array = decode_json $value;
1254
1255 foreach my $shape (@$json_array) {
1256
1257 # Put each available shape description/label into this section's metadata with GPSMapOverlayLabel as metaname.
1258 # Just as for Coordinate meta, don't need to know which shape a label belongs too. This is just so each label
1259 # will be indexed, and therefore can be searched.
1260
1261 my $description = $shape->{"description"};
1262 if($description) {
1263 push (@{$section_ptr->{'metadata'}}, ["GPSMapOverlayLabel", $description]);
1264 ###print STDERR "@@@@############################################ Just added description meta: " . $description . "\n";
1265 }
1266
1267 my $type = $shape->{"type"};
1268 ###print STDERR "Shape type : " . $type . "\n";
1269
1270 if($type eq "circle") {
1271 ###print STDERR "Found a circle:\n" . &printShape($json, $shape);
1272
1273 # work out bounding box
1274 # want the inverse of this useful page:
1275 # https://stackoverflow.com/questions/639695/how-to-convert-latitude-or-longitude-to-meters
1276 # https://www.geodatasource.com/developers/javascript
1277
1278 # Dr Bainbridge wants us to use: https://gis.stackexchange.com/questions/5821/calculating-latitude-longitude-x-miles-from-point
1279 # Since for N,E,S,W the following works out the same as above link, we're using the rule of thumb in the following:
1280 # https://gis.stackexchange.com/questions/2951/algorithm-for-offsetting-a-latitude-longitude-by-some-amount-of-meters
1281 # which states
1282 # "If your displacements aren't too great (less than a few kilometers) and you're not right at the poles,
1283 # use the quick and dirty estimate that 111,111 meters (111.111 km) in the y direction is 1 degree
1284 # (of latitude) and 111,111 * cos(latitude) meters in the x direction is 1 degree (of longitude)."
1285 my $centre_lat = $shape->{"center"}->{"lat"};
1286 my $centre_lng = $shape->{"center"}->{"lng"};
1287 my $radius = $shape->{"radius"}; # in metres!
1288
1289 ###print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
1290
1291 my $lat_north = $centre_lat + ($radius/111111);
1292 my $lat_south = $centre_lat - ($radius/111111);
1293
1294 ###print STDERR "### lat_north: $lat_north\n";
1295 ###print STDERR "### lat_south: $lat_south\n";
1296
1297 # our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
1298 # all expect their angle to be in radians. So need to convert from degree to radians before we can take the cos of it.
1299 my $centre_lat_radians = $self->degreesToRadians($centre_lat);
1300 my $cos_in_radians = cos($centre_lat_radians);
1301 ###print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
1302 my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
1303 my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
1304 ###print STDERR "### lng_east $lng_east\n";
1305 ###print STDERR "### lng_west $lng_west\n";
1306
1307 my $cos_lat = cos($centre_lat);
1308 ###print STDERR "cos $centre_lat is $cos_lat\n";
1309
1310 $self->processCoordinate($section, $lat_north, $lng_east);
1311 $self->processCoordinate($section, $lat_south, $lng_east);
1312 $self->processCoordinate($section, $lat_south, $lng_west);
1313 $self->processCoordinate($section, $lat_north, $lng_west);
1314
1315 }
1316 elsif ($type eq "marker") {
1317 ###print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
1318 ###print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
1319 $self->processCoordinate($section, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});
1320 }
1321 elsif ($type eq "polyline" || $type eq "polygon") {
1322 my $path_array = $shape->{"path"};
1323 foreach my $position (@$path_array) {
1324 $self->processCoordinate($section, $position->{"lat"}, $position->{"lng"});
1325 }
1326 }
1327 elsif ($type eq "rectangle") {
1328
1329 my $bounds = $shape->{"bounds"};
1330 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"east"});
1331 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"east"});
1332 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"west"});
1333 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"west"});
1334 }
1335
1336 } # end for on each shape in GPS.mapOverlay
1337 } # end if(buildcol and text pass)
1338 } # end GPS.mapOverlay meta
1339
1340 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1341}
1342
1343# https://en.wikipedia.org/wiki/Radian
1344sub degreesToRadians
1345{
1346 my $self = shift (@_);
1347 my ($degrees) = @_;
1348
1349 return $degrees * pi /180; # returns radians
1350}
1351
1352sub radiansToDegrees
1353{
1354 my $self = shift (@_);
1355 my ($radians) = @_;
1356
1357 return $radians * 180 / pi; # returns degrees
1358}
1359
1360# Call as:
1361# my $json = JSON->new->allow_nonref;
1362# &printAllShapes($json, $json_array);
1363sub printAllShapes {
1364 my ($json, $json_array) = @_;
1365
1366
1367 #my $pretty_print_shape = $json->pretty->encode( $json_array->[0] );
1368 foreach my $shape (@$json_array) {
1369 my $pretty_print_shape = $json->pretty->encode( $shape );
1370 print STDERR "Shape: $pretty_print_shape\n";
1371 }
1372
1373}
1374
1375# For the (lat, lng) coordinate given,
1376# attaches Coordinate and multiple CoordShort (different precision level) metadata to the doc object
1377sub processCoordinate {
1378 my $self = shift (@_);
1379 my ($section, $latitude, $longitude) = @_;
1380
1381 my $section_ptr = $self->_lookup_section($section);
1382
1383 my $lat_direction = ($latitude =~ m/^-/) ? "S" : "N";
1384 my $lng_direction = ($longitude =~ m/^-/) ? "W" : "E";
1385
1386 # have to store (lat, lng) in pairs, when there are so many coords to store
1387 #push (@{$section_ptr->{'metadata'}}, ["Latitude", $latitude]);
1388 #push (@{$section_ptr->{'metadata'}}, ["Longitude", $longitude]);
1389
1390 push (@{$section_ptr->{'metadata'}}, ["Coordinate", "$latitude $longitude"]); # "$latitude$lat_direction $longitude$lng_direction"
1391
1392 my ($latBeforeDec, $latAfterDec);
1393 my ($lngBeforeDec, $lngAfterDec);
1394
1395 if($latitude !~ m/\./) {
1396 $latBeforeDec = $latitude;
1397 $latAfterDec = "";
1398 } else {
1399 ($latBeforeDec, $latAfterDec) = ($latitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1400 }
1401 if($longitude !~ m/\./) {
1402 $lngBeforeDec = $longitude;
1403 $lngAfterDec = "";
1404 } else {
1405 ($lngBeforeDec, $lngAfterDec) = ($longitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1406 }
1407
1408
1409 my $name = "CoordShort";
1410 push (@{$section_ptr->{'metadata'}}, [$name, "$latBeforeDec$lat_direction $lngBeforeDec$lng_direction"]);
1411
1412 for(my $i = 2; $i <= 4; $i++)
1413 {
1414 my $latDecPlaces = (length($latAfterDec) >= $i) ? substr($latAfterDec, 0, $i) : "";
1415 my $lngDecPlaces = (length($lngAfterDec) >= $i) ? substr($lngAfterDec, 0, $i) : "";
1416
1417 push (@{$section_ptr->{'metadata'}}, [$name,
1418 $latBeforeDec . $lat_direction. $latDecPlaces . " " . $lngBeforeDec . $lng_direction. $lngDecPlaces]);
1419
1420 }
1421
1422 #Only add the metadata if it has not already been added
1423 #my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no apparent side-effects.)
1424}
1425
1426
1427# methods for dealing with text
1428
1429# returns the text for a section
1430sub get_text {
1431 my $self = shift (@_);
1432 my ($section) = @_;
1433
1434 my $section_ptr = $self->_lookup_section($section);
1435 if (!defined $section_ptr) {
1436 print STDERR "doc::get_text couldn't find section " .
1437 "$section\n";
1438 return "";
1439 }
1440
1441 return $section_ptr->{'text'};
1442}
1443
1444# returns the (utf-8 encoded) length of the text for a section
1445sub get_text_length {
1446 my $self = shift (@_);
1447 my ($section) = @_;
1448
1449 my $section_ptr = $self->_lookup_section($section);
1450 if (!defined $section_ptr) {
1451 print STDERR "doc::get_text_length couldn't find section " .
1452 "$section\n";
1453 return 0;
1454 }
1455
1456 return length ($section_ptr->{'text'});
1457}
1458
1459# returns the total length for all the sections
1460sub get_total_text_length {
1461 my $self = shift (@_);
1462
1463 my $section = $self->get_top_section();
1464 my $length = 0;
1465 while (defined $section) {
1466 $length += $self->get_text_length($section);
1467 $section = $self->get_next_section($section);
1468 }
1469 return $length;
1470}
1471
1472sub delete_text {
1473 my $self = shift (@_);
1474 my ($section) = @_;
1475
1476 my $section_ptr = $self->_lookup_section($section);
1477 if (!defined $section_ptr) {
1478 print STDERR "doc::delete_text couldn't find section " .
1479 "$section\n";
1480 return;
1481 }
1482
1483 $section_ptr->{'text'} = "";
1484}
1485
1486# add_text assumes the text is in (extended) ascii form. For
1487# text which has been already converted to the UTF-8 format
1488# use add_utf8_text.
1489sub add_text {
1490 my $self = shift (@_);
1491 my ($section, $text) = @_;
1492
1493 # convert the text to UTF-8 encoded unicode characters
1494 # and add the text
1495 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1496}
1497
1498
1499# add_utf8_text assumes the text to be added has already
1500# been converted to the UTF-8 encoding. For ascii text use
1501# add_text
1502# Pass by value version (internally calls pass by ref version
1503# to avoid code duplication)
1504sub add_utf8_text {
1505 my $self = shift (@_);
1506 my ($section, $text) = @_;
1507
1508 $self->add_utf8_textref($section, \$text);
1509}
1510
1511# Pass by reference version, used by GreenstoneSQLPlugin for fulltext
1512sub add_utf8_textref {
1513 my $self = shift (@_);
1514 my ($section, $text_ref) = @_;
1515
1516 my $section_ptr = $self->_lookup_section($section);
1517 if (!defined $section_ptr) {
1518 print STDERR "doc::add_utf8_textref couldn't find section " .
1519 "$section\n";
1520 return;
1521 }
1522
1523 $section_ptr->{'text'} .= $$text_ref;
1524}
1525
1526# returns the Source meta, which is the utf8 filename generated.
1527# Added a separate method here for convenience
1528sub get_source {
1529 my $self = shift (@_);
1530 return $self->get_metadata_element ($self->get_top_section(), "Source");
1531}
1532
1533# returns the SourceFile meta, which is the url reference to the URL-encoded
1534# version of Source (the utf8 filename). Added a separate method here for convenience
1535sub get_sourcefile {
1536 my $self = shift (@_);
1537 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1538}
1539
1540# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1541# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1542sub get_assocfile_from_sourcefile {
1543 my $self = shift (@_);
1544
1545 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1546 my $top_section = $self->get_top_section();
1547 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1548
1549 # get the actual filename as it exists on the filesystem which this url refers to
1550 $source_file = &unicode::url_to_filename($source_file);
1551 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1552 return $assocfilename;
1553}
1554
1555# methods for dealing with associated files
1556
1557# a file is associated with a document, NOT a section.
1558# if section is defined it is noted in the data structure
1559# only so that files associated from a particular section
1560# may be removed later (using delete_section_assoc_files)
1561sub associate_file {
1562 my $self = shift (@_);
1563 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1564 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1565
1566 # remove all associated files with the same name
1567 $self->delete_assoc_file ($assoc_filename);
1568
1569 # Too harsh a requirement
1570 # Definitely get HTML docs, for example, with some missing
1571 # support files
1572# if (!&util::fd_exists($real_filename)) {
1573# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1574# exit -1;
1575# }
1576
1577# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1578# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1579## my $utf8_filename = Encode::encode("utf8",$filename);
1580
1581 push (@{$self->{'associated_files'}},
1582 [$real_filename, $assoc_filename, $mime_type, $section]);
1583}
1584
1585# returns a list of associated files in the form
1586# [[real_filename, assoc_filename, mimetype], ...]
1587sub get_assoc_files {
1588 my $self = shift (@_);
1589
1590 return $self->{'associated_files'};
1591}
1592
1593# the following two methods used to keep track of original associated files
1594# for incremental building. eg a txt file used by an item file does not end
1595# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1596# file for incremental build
1597sub associate_source_file {
1598 my $self = shift (@_);
1599 my ($full_filename) = @_;
1600
1601 push (@{$self->{'source_assoc_files'}}, $full_filename);
1602
1603}
1604
1605sub get_source_assoc_files {
1606 my $self = shift (@_);
1607
1608 return $self->{'source_assoc_files'};
1609
1610
1611}
1612sub metadata_file {
1613 my $self = shift (@_);
1614 my ($real_filename, $filename) = @_;
1615
1616 push (@{$self->{'metadata_files'}},
1617 [$real_filename, $filename]);
1618}
1619
1620# used for writing out the archiveinf-doc info database, to list all the metadata files
1621sub get_meta_files {
1622 my $self = shift (@_);
1623
1624 return $self->{'metadata_files'};
1625}
1626
1627sub delete_section_assoc_files {
1628 my $self = shift (@_);
1629 my ($section) = @_;
1630
1631 my $i=0;
1632 while ($i < scalar (@{$self->{'associated_files'}})) {
1633 if (defined $self->{'associated_files'}->[$i]->[3] &&
1634 $self->{'associated_files'}->[$i]->[3] eq $section) {
1635 splice (@{$self->{'associated_files'}}, $i, 1);
1636 } else {
1637 $i++;
1638 }
1639 }
1640}
1641
1642sub delete_assoc_file {
1643 my $self = shift (@_);
1644 my ($assoc_filename) = @_;
1645
1646 my $i=0;
1647 while ($i < scalar (@{$self->{'associated_files'}})) {
1648 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1649 splice (@{$self->{'associated_files'}}, $i, 1);
1650 } else {
1651 $i++;
1652 }
1653 }
1654}
1655
1656sub reset_nextsection_ptr {
1657 my $self = shift (@_);
1658 my ($section) = @_;
1659
1660 my $section_ptr = $self->_lookup_section($section);
1661 $section_ptr->{'next_subsection'} = 1;
1662}
1663
16641;
Note: See TracBrowser for help on using the repository browser.