source: main/trunk/greenstone2/perllib/doc.pm@ 34394

Last change on this file since 34394 was 34394, checked in by ak19, 4 years ago

Bugfix 1 for GLI metadata slowdown: selecting multiple Gathererd files in GLI became very slow. Kathy and Dr Bainbridge had tracked this down to code I had added to support non basic ASCII filenames in GLI, which was making an expensive win operating system function call on Windows for each selected file, launching a Java Process for each. The speed of selecting multiple files is now back to being almost as fast as in 3.09. Tested on Windows and linux. Had to treat windows as a special case because I can't get the code modifications to work on Linux: the perl code stores a hex-encoded string for the filename that GLI now uses when OS is Windows and compares against the hex encoded name of a file selected. But on linux the hex encoded value generated by perl is not the same as that which java generates and after trying repeatedly, I'e not been able to succeed to get it to work. So the code behaves as before for Linux.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 52.8 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use FileUtils;
40use ghtml;
41use File::stat;
42##use hashdoc;
43use docprint;
44use JSON;
45
46# We just need pi from the Trig lib
47# Import constants pi2, pip2, pip4 (2*pi, pi/2, pi/4).
48# But doing
49# use Math::Trig ':pi';
50# resulted in errors on 32 bit linux VM with perl v8 of "bareword pi not allowed while strict subs in use"
51# So, now doing just
52use Math::Trig;
53
54# the document type may be indexed_doc, nonindexed_doc, or
55# classification
56
57our $OIDcount = 0;
58
59
60# Declare the Package (level) Variable that will tell us whether we're in the import or buildcol phase.
61# This var will be set by import.pl, buildcol.pl and export.pl to the then appropriate value.
62# https://perlmaven.com/package-variables-and-lexical-variables-in-perl
63# But: https://stackoverflow.com/questions/7542120/perl-how-to-make-variables-from-requiring-script-available-in-required-script
64# https://stackoverflow.com/questions/33230015/how-do-i-use-a-variable-in-another-perl-script
65#$doc::cmd_line_mode = undef;
66# Decided to follow the usage of $OIDcount above to declare the necessary package-level variables 'globally' accessible
67# actually accessible only for those packages that import this one (with 'use doc')
68my $cmd_line_mode = undef;
69
70# processor_mode keeps track of which buildcol pass we're at: dummy, text (sidx/didx passes) or infodb
71my $processor_mode = undef;
72
73# rename_method can be 'url', 'none', 'base64'
74sub new {
75 my $class = shift (@_);
76 my ($source_filename, $doc_type, $rename_method) = @_;
77
78
79 my $self = bless {'associated_files'=>[],
80 'subsection_order'=>[],
81 'next_subsection'=>1,
82 'subsections'=>{},
83 'metadata'=>[],
84 'text'=>"",
85 'OIDtype'=>"hash"}, $class;
86
87 # used to set lastmodified here, but this can screw up the HASH ids, so
88 # the docsave processor now calls set_lastmodified
89
90 $self->set_source_path($source_filename);
91
92 if (defined $source_filename) {
93 $source_filename = &util::filename_within_collection($source_filename);
94 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
95 $self->set_source_filename ($source_filename, $rename_method);
96 }
97
98 $self->set_doc_type ($doc_type) if defined $doc_type;
99
100 return $self;
101}
102
103
104sub set_source_path
105{
106 my $self = shift @_;
107 my ($source_filename) = @_;
108
109 if (defined $source_filename) {
110 # On Windows the source_filename can be in terse DOS format
111 # e.g. test~1.txt
112
113 $self->{'terse_source_path'} = $source_filename;
114
115 # Use the FileUtil library methods as they are aware of more special
116 # cases such as HDFS [jmt12]
117 if (&FileUtils::fileExists($source_filename))
118 {
119 # See if we can do better for Windows with a filename
120 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
121 require Win32;
122 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
123 }
124 else {
125 # For Unix-based systems, there is no difference between the two
126 $self->{'source_path'} = $source_filename;
127 }
128
129 # On Windows, the code above has ensured source_path is the Win long (full) path name.
130 # To help GLI associate metadata with an easily calculated and accurate representation of
131 # filenames, we now store the Win long path name, hex encoded.
132 # We're not using this field on Linux, as I can't get the hex encodings generated to match
133 # what GLI Java code generates. But for symmetry we store this field on Unix too, but we need
134 # to hex-encode source_path on Unix too, or it may not be UTF-8 and doc.xml will be invalid
135 my $hexencodedlongsourcepath = &unicode::debug_unicode_string($self->{'source_path'});
136 $self->set_utf8_metadata_element ($self->get_top_section(), "gsdlfullsourcepath", $hexencodedlongsourcepath);
137
138 }
139 else {
140 print STDERR "Warning: In doc::set_source_path(), file\n";
141 print STDERR " $source_filename\n";
142 print STDERR " does not exist\n";
143
144 # (default) Set it to whatever we were given
145 $self->{'source_path'} = $source_filename;
146 }
147 }
148 else {
149 # Previous code for setting source_path allowed for
150 # it to be undefined, so continue this practice
151 $self->{'terse_source_path'} = undef;
152 $self->{'source_path'} = undef;
153 }
154}
155
156
157sub get_source_path
158{
159 my $self = shift @_;
160
161 return $self->{'terse_source_path'};
162}
163
164# set lastmodified for OAI purposes, added by GRB, moved by kjdon
165sub set_oailastmodified {
166 my $self = shift (@_);
167
168 my $source_path = $self->{'terse_source_path'};
169
170 if (defined $source_path && (-e $source_path)) {
171 my $current_time = time;
172
173 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
174 $wday, $yday, $isdst) = localtime($current_time);
175
176 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
177
178 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
179 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
180 }
181}
182
183# no longer used for OAI purposes, since lastmodified is not what we want as the
184# Datestamp of a document. This doc metadata may be useful for general purposes.
185sub set_lastmodified {
186 my $self = shift (@_);
187
188 my $source_path = $self->{'terse_source_path'};
189
190 if (defined $source_path && (-e $source_path)) {
191
192 my $file_stat = stat($source_path);
193 my $mtime = $file_stat->mtime;
194 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
195 $wday, $yday, $isdst) = localtime($mtime);
196
197 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
198
199 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
200 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
201 }
202}
203
204# clone the $self object
205sub duplicate {
206 my $self = shift (@_);
207
208 my $newobj = {};
209
210 foreach my $k (keys %$self) {
211 $newobj->{$k} = &clone ($self->{$k});
212 }
213
214 bless $newobj, ref($self);
215 return $newobj;
216}
217
218sub clone {
219 my ($from) = @_;
220 my $type = ref ($from);
221
222 if ($type eq "HASH") {
223 my $to = {};
224 foreach my $key (keys %$from) {
225 $to->{$key} = &clone ($from->{$key});
226 }
227 return $to;
228 } elsif ($type eq "ARRAY") {
229 my $to = [];
230 foreach my $v (@$from) {
231 push (@$to, &clone ($v));
232 }
233 return $to;
234 } else {
235 return $from;
236 }
237}
238
239sub set_OIDtype {
240 my $self = shift (@_);
241 my ($type, $metadata) = @_;
242
243 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
244 $self->{'OIDtype'} = $type;
245 } else {
246 $self->{'OIDtype'} = "hash";
247 }
248
249 if ($type =~ /^assigned$/) {
250 if (defined $metadata) {
251 $self->{'OIDmetadata'} = $metadata;
252 } else {
253 $self->{'OIDmetadata'} = "dc.Identifier";
254 }
255 }
256}
257
258# rename_method can be 'url', 'none', 'base64'
259sub set_source_filename {
260 my $self = shift (@_);
261 my ($source_filename, $rename_method) = @_;
262
263 # Since the gsdlsourcefilename element goes into the doc.xml it has
264 # to be utf8. However, it should also *represent* the source filename
265 # (in the import directory) which may not be utf8 at all.
266 # For instance, if this meta element (gsdlsourcefilename) will be used
267 # by other applications that parse doc.xml in order to locate
268 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
269 # encode the real filename as this is a binary-to-text encoding meaning
270 # that the resulting string is ASCII (utf8). Decoding will give the original.
271
272# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
273## print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename\n";
274
275 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
276 $source_filename = $self->encode_filename($source_filename, $rename_method);
277# my ($srcfilename,$dirname,$suffix)
278# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
279# print STDERR "-> $srcfilename -> ";
280# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
281# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
282# print STDERR "$source_filename\n";
283
284## print STDERR "AFTER URL/base64 encoding gsdlsourcefilename: $source_filename\n";
285
286 $self->set_utf8_metadata_element ($self->get_top_section(),
287 "gsdlsourcefilename",
288 $source_filename);
289
290 # If we set the file renaming method, at least GLI can decode gsdlsourcefilename to get the
291 # original filename (relative path to file) back for GLI to notice what ex Meta is associated with it
292 $self->set_utf8_metadata_element($self->get_top_section(),
293 "gsdlsourcefilerenamemethod",
294 $rename_method);
295}
296
297sub encode_filename {
298 my $self = shift (@_);
299 my ($source_filename, $rename_method) = @_;
300
301 my ($srcfilename,$dirname,$suffix)
302 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
303# print STDERR "-> $srcfilename -> ";
304 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
305
306 # encode any subdirs of "import" also, but not import itself
307 my $sep = ($ENV{'GSDLOS'} =~ /^windows$/i) ? "\\" : "/";
308 my @dirs = ($sep eq "\\") ? split(/[\\\/]+/, $dirname) : split(/\//, $dirname);
309
310 my $dirpath = "";
311
312 # Don't encode the first folder ("import" or "tmp"): GLI's DocXMLFile.java looks for literal "import"
313 # or "tmp" before it knows what the file rename method is that is to be used decode the rest of gsdlsourcefilename.
314 if(scalar (@dirs) >= 1) {
315 $dirpath = shift(@dirs);
316 }
317 foreach my $subdir (@dirs) {
318 #print STDERR "@@@@ Found subdir: $subdir\n";
319
320 $subdir = &util::rename_file($subdir, $rename_method);
321 #print STDERR "@@@@ encoded subdir: $subdir\n";
322
323 $dirpath = &FileUtils::filenameConcatenate($dirpath, $subdir);
324 }
325 if($dirpath ne "") {
326 $dirname = $dirpath;
327 }
328
329 $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
330
331 return $source_filename;
332}
333
334sub set_converted_filename {
335 my $self = shift (@_);
336 my ($converted_filename) = @_;
337
338 # we know the converted filename is utf8
339 $self->set_utf8_metadata_element ($self->get_top_section(),
340 "gsdlconvertedfilename",
341 $converted_filename);
342}
343
344# returns the source_filename as it was provided
345sub get_unmodified_source_filename {
346 my $self = shift (@_);
347
348 return $self->{'terse_source_path'};
349}
350
351# returns the source_filename with whatever rename_method was given
352sub get_source_filename {
353 my $self = shift (@_);
354
355 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
356}
357
358
359
360# returns converted filename if available else returns source filename
361sub get_filename_for_hashing {
362 my $self = shift (@_);
363
364 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
365
366 if (!defined $filename) {
367 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
368 # if NULPlug processed file, then don't give a filename
369 if (defined $plugin_name && $plugin_name eq "NULPlug") {
370 $filename = undef;
371 } else { # returns the URL encoded source filename!
372 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
373 }
374 }
375
376 if (!&FileUtils::isFilenameAbsolute($filename)) {
377 $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
378 }
379
380 return $filename;
381}
382
383sub set_doc_type {
384 my $self = shift (@_);
385 my ($doc_type) = @_;
386
387 $self->set_metadata_element ($self->get_top_section(),
388 "gsdldoctype",
389 $doc_type);
390}
391
392# returns the gsdldoctype as it was provided
393# the default of "indexed_doc" is used if no document
394# type was provided
395sub get_doc_type {
396 my $self = shift (@_);
397
398 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
399 return $doc_type if (defined $doc_type);
400 return "indexed_doc";
401}
402
403
404# look up the reference to the a particular section
405sub _lookup_section {
406 my $self = shift (@_);
407 my ($section) = @_;
408
409 my ($num);
410 my $sectionref = $self;
411
412 while (defined $section && $section ne "") {
413
414 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
415
416 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
417
418 $section = "" unless defined $section;
419
420
421 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
422 $sectionref = $sectionref->{'subsections'}->{$num};
423 } else {
424 return undef;
425 }
426 }
427
428 return $sectionref;
429}
430
431# calculate OID by hashing the contents of the document
432sub _calc_OID {
433 my $self = shift (@_);
434 my ($filename) = @_;
435
436
437 my $osexe = &util::get_os_exe();
438
439 my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
440 $ENV{'GSDLOS'},"hashfile$osexe");
441
442 &util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
443 # The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
444
445 # A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
446 # (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
447 # E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
448
449 my $result = "NULL";
450
451
452 if (-e "$hashfile_exe") {
453# $result = `\"$hashfile_exe\" \"$filename\"`;
454# $result = `hashfile$osexe \"$filename\" 2>&1`;
455 $result = `hashfile$osexe \"$filename\"`;
456
457 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
458 } else {
459 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
460 }
461 return "HASH$result";
462}
463
464# methods dealing with OID, not groups of them.
465
466# if $OID is not provided one is calculated
467sub set_OID {
468 my $self = shift (@_);
469 my ($OID) = @_;
470
471 my $use_hash_oid = 0;
472 # if an OID wasn't provided calculate one
473 if (!defined $OID) {
474 $OID = "NULL";
475 if ($self->{'OIDtype'} =~ /^hash/) {
476 $use_hash_oid = 1;
477 } elsif ($self->{'OIDtype'} eq "incremental") {
478 $OID = "D" . $OIDcount;
479 $OIDcount ++;
480 } elsif ($self->{'OIDtype'} eq "filename") {
481 my $filename = $self->get_source_filename();
482 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
483 $OID = &util::tidy_up_oid($OID);
484 } elsif ($self->{'OIDtype'} eq "full_filename") {
485 my $source_filename = $self->get_source_filename();
486 my $dirsep = &util::get_os_dirsep();
487
488 $source_filename =~ s/^import$dirsep//;
489 $source_filename =~ s/$dirsep/-/g;
490 $source_filename =~ s/\./_/g;
491
492 $OID = $source_filename;
493 $OID = &util::tidy_up_oid($OID);
494 } elsif ($self->{'OIDtype'} eq "dirname") {
495 my $filename = $self->get_source_filename();
496 if (defined($filename)) { # && -e $filename) {
497 # get the immediate parent directory
498 $OID = &File::Basename::dirname($filename);
499 if (defined $OID) {
500 $OID = &File::Basename::basename($OID);
501 $OID = &util::tidy_up_oid($OID);
502 } else {
503 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
504 $use_hash_oid = 1;
505 }
506 } else {
507 print STDERR "Failed to find a filename, generating hash id\n";
508 $use_hash_oid = 1;
509 }
510
511 } elsif ($self->{'OIDtype'} eq "assigned") {
512 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
513 if (defined $identifier && $identifier ne "") {
514 $OID = $identifier;
515 $OID = &util::tidy_up_oid($OID);
516 } else {
517 # need a hash id
518 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
519 $use_hash_oid = 1;
520 }
521
522 } else {
523 $use_hash_oid = 1;
524 }
525
526 if ($use_hash_oid) {
527 my $hash_on_file = 1;
528 my $hash_on_ga_xml = 0;
529
530 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
531 $hash_on_file = 0;
532 $hash_on_ga_xml = 1;
533 }
534
535 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
536 $hash_on_file = 0;
537 $hash_on_ga_xml = 0;
538
539 my $source_filename = $self->get_source_filename();
540 my $dirsep = &util::get_os_dirsep();
541
542 $source_filename =~ s/^import$dirsep//;
543 $source_filename =~ s/$dirsep/-/g;
544 $source_filename =~ s/\./_/g;
545
546 # If the filename is very short then (handled naively)
547 # this can cause conjestion in the hash-values
548 # computed, leading documents sharing the same leading
549 # Hex values in the computed has.
550 #
551 # The solution taken here is to replace the name of
552 # the file name a sufficient number of times (up to
553 # the character limit defined in 'rep_limit' and
554 # make that the content that is hashed on
555
556 # *** Think twice before changing the following value
557 # as it will break backward compatability of computed
558 # document HASH values
559
560 my $rep_limit = 256;
561 my $hash_content = undef;
562
563 if (length($source_filename)<$rep_limit) {
564 my $rep_string = "$source_filename|";
565 my $rs_len = length($rep_string);
566
567 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
568
569 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
570 }
571 else {
572 $hash_content = $source_filename;
573 }
574
575 my $filename = &util::get_tmp_filename();
576 if (!open (OUTFILE, ">:utf8", $filename)) {
577 print STDERR "doc::set_OID could not write to $filename\n";
578 } else {
579 print OUTFILE $hash_content;
580 close (OUTFILE);
581 }
582 $OID = $self->_calc_OID ($filename);
583
584 &FileUtils::removeFiles ($filename);
585 }
586
587 if ($hash_on_file) {
588 # "hash" OID - feed file to hashfile.exe
589 my $filename = $self->get_filename_for_hashing();
590
591 # -z: don't want to hash on the file if it is zero size
592 if (defined($filename) && -e $filename && !-z $filename) {
593 $OID = $self->_calc_OID ($filename);
594 } else {
595 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
596 }
597 }
598
599 if ($hash_on_ga_xml) {
600 # In addition being asked to explicity calculate the has based on the GA file,
601 # can also end up coming into this block is doing 'hash_on_file' but the file
602 # itself is of zero bytes (as could be the case with 'doc.nul' file
603
604 my $filename = &util::get_tmp_filename();
605 if (!open (OUTFILE, ">:utf8", $filename)) {
606 print STDERR "doc::set_OID could not write to $filename\n";
607 } else {
608 my $doc_text = &docprint::get_section_xml($self);
609 print OUTFILE $doc_text;
610 close (OUTFILE);
611 }
612 $OID = $self->_calc_OID ($filename);
613 &FileUtils::removeFiles($filename);
614 }
615 }
616 }
617 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
618}
619
620# this uses hashdoc (embedded c thingy) which is faster but still
621# needs a little work to be sufficiently stable
622sub ___set_OID {
623 my $self = shift (@_);
624 my ($OID) = @_;
625
626 # if an OID wasn't provided then calculate hash value based on document
627 if (!defined $OID)
628 {
629 my $hash_text = &docprint::get_section_xml($self);
630 my $hash_len = length($hash_text);
631
632 $OID = &hashdoc::buffer($hash_text,$hash_len);
633 }
634
635 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
636}
637
638# returns the OID for this document
639sub get_OID {
640 my $self = shift (@_);
641 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
642 return $OID if (defined $OID);
643 return "NULL";
644}
645
646sub delete_OID {
647 my $self = shift (@_);
648
649 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
650}
651
652
653# methods for manipulating section names
654
655# returns the name of the top-most section (the top
656# level of the document
657sub get_top_section {
658 my $self = shift (@_);
659
660 return "";
661}
662
663# returns a section
664sub get_parent_section {
665 my $self = shift (@_);
666 my ($section) = @_;
667
668 $section =~ s/(^|\.)\d+$//;
669
670 return $section;
671}
672
673# returns the first child section (or the end child
674# if there isn't any)
675sub get_begin_child {
676 my $self = shift (@_);
677 my ($section) = @_;
678
679 my $section_ptr = $self->_lookup_section($section);
680 return "" unless defined $section_ptr;
681
682 if (defined $section_ptr->{'subsection_order'}->[0]) {
683 return "$section.$section_ptr->{'subsection_order'}->[0]";
684 }
685
686 return $self->get_end_child ($section);
687}
688
689# returns the next child of a parent section
690sub get_next_child {
691 my $self = shift (@_);
692 my ($section) = @_;
693
694 my $parent_section = $self->get_parent_section($section);
695 my $parent_section_ptr = $self->_lookup_section($parent_section);
696 return undef unless defined $parent_section_ptr;
697
698 my ($section_num) = $section =~ /(\d+)$/;
699 return undef unless defined $section_num;
700
701 my $i = 0;
702 my $section_order = $parent_section_ptr->{'subsection_order'};
703 while ($i < scalar(@$section_order)) {
704 last if $section_order->[$i] eq $section_num;
705 $i++;
706 }
707
708 $i++; # the next child
709 if ($i < scalar(@$section_order)) {
710 return $section_order->[$i] if $parent_section eq "";
711 return "$parent_section.$section_order->[$i]";
712 }
713
714 # no more sections in this level
715 return undef;
716}
717
718# returns a reference to a list of children
719sub get_children {
720 my $self = shift (@_);
721 my ($section) = @_;
722
723 my $section_ptr = $self->_lookup_section($section);
724 return [] unless defined $section_ptr;
725
726 my @children = @{$section_ptr->{'subsection_order'}};
727
728 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
729 return \@children;
730}
731
732# returns the child section one past the last one (which
733# is coded as "0")
734sub get_end_child {
735 my $self = shift (@_);
736 my ($section) = @_;
737
738 return $section . ".0" unless $section eq "";
739 return "0";
740}
741
742# returns the next section in book order
743sub get_next_section {
744 my $self = shift (@_);
745 my ($section) = @_;
746
747 return undef unless defined $section;
748
749 my $section_ptr = $self->_lookup_section($section);
750 return undef unless defined $section_ptr;
751
752 # first try to find first child
753 if (defined $section_ptr->{'subsection_order'}->[0]) {
754 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
755 return "$section.$section_ptr->{'subsection_order'}->[0]";
756 }
757
758 do {
759 # try to find sibling
760 my $next_child = $self->get_next_child ($section);
761 return $next_child if (defined $next_child);
762
763 # move up one level
764 $section = $self->get_parent_section ($section);
765 } while $section =~ /\d/;
766
767 return undef;
768}
769
770sub is_leaf_section {
771 my $self = shift (@_);
772 my ($section) = @_;
773
774 my $section_ptr = $self->_lookup_section($section);
775 return 1 unless defined $section_ptr;
776
777 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
778}
779
780# methods for dealing with sections
781
782# returns the name of the inserted section
783sub insert_section {
784 my $self = shift (@_);
785 my ($before_section) = @_;
786
787 # get the child to insert before and its parent section
788 my $parent_section = "";
789 my $before_child = "0";
790 my @before_section = split (/\./, $before_section);
791 if (scalar(@before_section) > 0) {
792 $before_child = pop (@before_section);
793 $parent_section = join (".", @before_section);
794 }
795
796 my $parent_section_ptr = $self->_lookup_section($parent_section);
797 if (!defined $parent_section_ptr) {
798 print STDERR "doc::insert_section couldn't find parent section " .
799 "$parent_section\n";
800 return;
801 }
802
803 # get the next section number
804 my $section_num = $parent_section_ptr->{'next_subsection'}++;
805
806 my $i = 0;
807 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
808 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
809 $i++;
810 }
811
812 # insert the section number into the order list
813 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
814
815 # add this section to the parent section
816 my $section_ptr = {'subsection_order'=>[],
817 'next_subsection'=>1,
818 'subsections'=>{},
819 'metadata'=>[],
820 'text'=>""};
821 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
822
823 # work out the full section number
824 my $section = $parent_section;
825 $section .= "." unless $section eq "";
826 $section .= $section_num;
827
828 return $section;
829}
830
831# creates a pre-named section
832sub create_named_section {
833 my $self = shift (@_);
834 my ($mastersection) = @_;
835
836 my ($num);
837 my $section = $mastersection;
838 my $sectionref = $self;
839
840 while ($section ne "") {
841 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
842 $num =~ s/^0+(\d)/$1/; # remove leading 0s
843 $section = "" unless defined $section;
844
845 if (defined $num) {
846 if (!defined $sectionref->{'subsections'}->{$num}) {
847 push (@{$sectionref->{'subsection_order'}}, $num);
848 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
849 'next_subsection'=>1,
850 'subsections'=>{},
851 'metadata'=>[],
852 'text'=>""};
853 if ($num >= $sectionref->{'next_subsection'}) {
854 $sectionref->{'next_subsection'} = $num + 1;
855 }
856 }
857 $sectionref = $sectionref->{'subsections'}->{$num};
858
859 } else {
860 print STDERR "doc::create_named_section couldn't create section ";
861 print STDERR "$mastersection\n";
862 last;
863 }
864 }
865}
866
867# returns a reference to a list of subsections
868sub list_subsections {
869 my $self = shift (@_);
870 my ($section) = @_;
871
872 my $section_ptr = $self->_lookup_section ($section);
873 if (!defined $section_ptr) {
874 print STDERR "doc::list_subsections couldn't find section $section\n";
875 return [];
876 }
877
878 return [@{$section_ptr->{'subsection_order'}}];
879}
880
881sub delete_section {
882 my $self = shift (@_);
883 my ($section) = @_;
884
885# my $section_ptr = {'subsection_order'=>[],
886# 'next_subsection'=>1,
887# 'subsections'=>{},
888# 'metadata'=>[],
889# 'text'=>""};
890
891 # if this is the top section reset everything
892 if ($section eq "") {
893 $self->{'subsection_order'} = [];
894 $self->{'subsections'} = {};
895 $self->{'metadata'} = [];
896 $self->{'text'} = "";
897 return;
898 }
899
900 # find the parent of the section to delete
901 my $parent_section = "";
902 my $child = "0";
903 my @section = split (/\./, $section);
904 if (scalar(@section) > 0) {
905 $child = pop (@section);
906 $parent_section = join (".", @section);
907 }
908
909 my $parent_section_ptr = $self->_lookup_section($parent_section);
910 if (!defined $parent_section_ptr) {
911 print STDERR "doc::delete_section couldn't find parent section " .
912 "$parent_section\n";
913 return;
914 }
915
916 # remove this section from the subsection_order list
917 my $i = 0;
918 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
919 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
920 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
921 last;
922 }
923 $i++;
924 }
925
926 # remove this section from the subsection hash
927 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
928 undef $parent_section_ptr->{'subsections'}->{$child};
929 }
930}
931
932#--
933# methods for dealing with metadata
934
935# set_metadata_element and get_metadata_element are for metadata
936# which should only have one value. add_meta_data and get_metadata
937# are for metadata which can have more than one value.
938
939# returns the first metadata value which matches field
940
941# This version of get metadata element works much like the one above,
942# except it allows for the namespace portion of a metadata element to
943# be ignored, thus if you are searching for dc.Title, the first piece
944# of matching metadata ending with the name Title (once any namespace
945# is removed) would be returned.
946# 28-11-2003 John Thompson
947sub get_metadata_element {
948 my $self = shift (@_);
949 my ($section, $field, $ignore_namespace) = @_;
950 my ($data);
951
952 $ignore_namespace = 0 unless defined $ignore_namespace;
953
954 my $section_ptr = $self->_lookup_section($section);
955 if (!defined $section_ptr) {
956 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
957 return;
958 }
959
960 # Remove any namespace if we are being told to ignore them
961 if($ignore_namespace) {
962 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
963 }
964
965 foreach $data (@{$section_ptr->{'metadata'}}) {
966
967 my $data_name = $data->[0];
968
969 # Remove any namespace if we are being told to ignore them
970 if($ignore_namespace) {
971 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
972 }
973 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
974 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
975
976 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
977 }
978
979 return undef; # was not found
980}
981
982# returns a list of the form [value1, value2, ...]
983sub get_metadata {
984 my $self = shift (@_);
985 my ($section, $field, $ignore_namespace) = @_;
986 my ($data);
987
988 $ignore_namespace = 0 unless defined $ignore_namespace;
989
990 my $section_ptr = $self->_lookup_section($section);
991 if (!defined $section_ptr) {
992 print STDERR "doc::get_metadata couldn't find section ",
993 $section, "\n";
994 return;
995 }
996
997 # Remove any namespace if we are being told to ignore them
998 if($ignore_namespace) {
999 $field =~ s/^.*\.//;
1000 }
1001
1002 my @metadata = ();
1003 foreach $data (@{$section_ptr->{'metadata'}}) {
1004
1005 my $data_name = $data->[0];
1006
1007 # Remove any namespace if we are being told to ignore them
1008 if($ignore_namespace) {
1009 $data_name =~ s/^.*\.//;
1010 }
1011 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
1012 $data_name =~ s/^ex\.([^.]+)$/$1/;
1013
1014 push (@metadata, $data->[1]) if ($data_name eq $field);
1015 }
1016
1017 return \@metadata;
1018}
1019
1020sub get_metadata_hashmap {
1021 my $self = shift (@_);
1022 my ($section, $opt_namespace) = @_;
1023
1024 my $section_ptr = $self->_lookup_section($section);
1025 if (!defined $section_ptr) {
1026 print STDERR "doc::get_metadata couldn't find section ",
1027 $section, "\n";
1028 return;
1029 }
1030
1031 my $metadata_hashmap = {};
1032 foreach my $data (@{$section_ptr->{'metadata'}}) {
1033 my $metaname = $data->[0];
1034
1035 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
1036 if (!defined $metadata_hashmap->{$metaname}) {
1037 $metadata_hashmap->{$metaname} = [];
1038 }
1039 my $metaval_list = $metadata_hashmap->{$metaname};
1040 push(@$metaval_list, $data->[1]);
1041 }
1042 }
1043
1044 return $metadata_hashmap;
1045}
1046
1047# returns a list of the form [[field,value],[field,value],...]
1048sub get_all_metadata {
1049 my $self = shift (@_);
1050 my ($section) = @_;
1051
1052 my $section_ptr = $self->_lookup_section($section);
1053 if (!defined $section_ptr) {
1054 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
1055 return;
1056 }
1057
1058 return $section_ptr->{'metadata'};
1059}
1060
1061# $value is optional
1062sub delete_metadata {
1063 my $self = shift (@_);
1064 my ($section, $field, $value) = @_;
1065
1066 my $section_ptr = $self->_lookup_section($section);
1067 if (!defined $section_ptr) {
1068 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
1069 return;
1070 }
1071
1072 my $i = 0;
1073 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1074 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1075 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1076 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1077 } else {
1078 $i++;
1079 }
1080 }
1081}
1082
1083sub delete_all_metadata {
1084 my $self = shift (@_);
1085 my ($section) = @_;
1086
1087 my $section_ptr = $self->_lookup_section($section);
1088 if (!defined $section_ptr) {
1089 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1090 return;
1091 }
1092
1093 $section_ptr->{'metadata'} = [];
1094}
1095
1096sub set_metadata_element {
1097 my $self = shift (@_);
1098 my ($section, $field, $value) = @_;
1099
1100 $self->set_utf8_metadata_element ($section, $field,
1101 &unicode::ascii2utf8(\$value));
1102}
1103
1104# set_utf8_metadata_element assumes the text has already been
1105# converted to the UTF-8 encoding.
1106sub set_utf8_metadata_element {
1107 my $self = shift (@_);
1108 my ($section, $field, $value) = @_;
1109
1110 $self->delete_metadata ($section, $field);
1111 $self->add_utf8_metadata ($section, $field, $value);
1112}
1113
1114
1115# add_metadata assumes the text is in (extended) ascii form. For
1116# text which has already been converted to the UTF-8 format use
1117# add_utf8_metadata.
1118sub add_metadata {
1119 my $self = shift (@_);
1120 my ($section, $field, $value) = @_;
1121
1122 $self->add_utf8_metadata ($section, $field,
1123 &unicode::ascii2utf8(\$value));
1124}
1125
1126sub add_utf8_metadata {
1127 my $self = shift (@_);
1128 my ($section, $field, $value) = @_;
1129
1130 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1131 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1132 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1133
1134 my $section_ptr = $self->_lookup_section($section);
1135 if (!defined $section_ptr) {
1136 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1137 return;
1138 }
1139 if (!defined $value) {
1140 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1141 return;
1142 }
1143 if (!defined $field) {
1144 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1145 return;
1146 }
1147
1148 #print STDERR "###$field=$value\n";
1149
1150 # For now, supress this check. Given that text data read in is now
1151 # Unicode aware, then the following block of code can (ironically enough)
1152 # cause our unicode compliant string to be re-encoded (leading to
1153 # a double-encoded UTF-8 string, which we definitely don't want!).
1154
1155
1156 # double check that the value is utf-8
1157 # if (!&unicode::check_is_utf8($value)) {
1158 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1159 # &unicode::ensure_utf8(\$value);
1160 # print STDERR " Tried converting to utf8: $value\n";
1161 # }
1162
1163 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1164 if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1165 {
1166 my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1167 if (defined $mdprefix) {
1168 # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1169 push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1170 }
1171
1172 my $direction;
1173 # TODO does the following work, with eq latitude? the above code implies can be xxxxx.Latitude. should use metaname??
1174 if($value =~ m/^-/)
1175 {
1176 $direction = ($field eq "Latitude") ? "S" : "W";
1177 }
1178 else
1179 {
1180 $direction = ($field eq "Latitude") ? "N" : "E";
1181 }
1182
1183 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1184 if(defined $beforeDec && defined $afterDec)
1185 {
1186 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1187 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1188
1189 for(my $i = 2; $i <= 4; $i++)
1190 {
1191 if(length($afterDec) >= $i)
1192 {
1193 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1194 }
1195 }
1196
1197 # what is this???? completely does nothing...
1198 #Only add the metadata if it has not already been added
1199 my $metaMap = $self->get_metadata_hashmap($section); # metaMap not used and called function has no apparent side-effects,
1200 # but this line appears important for ensuring uniqueness of (Latitude, value) meta
1201 # in the section. Also for LatShort, Longitude, LngShort.
1202 }
1203
1204 # if we have both Latitude and Longitude, then add coordinate metadata (not sure why, but someone else added it)
1205 my $latitude;
1206 my $longitude;
1207 if($field =~ m/^(.+\.)?Longitude$/) {
1208 # if we are dealing with Longitude meta, and we have already seen Latitude, then add Coordinate metadata
1209 $latitude = $self->get_metadata_element ($section, "Latitude");
1210 if (defined $latitude) {
1211 $longitude = $value;
1212 }
1213 } else {
1214 # we are dealing with Latitude - if we have already seen Longitude, then add both as Coordinate metadata
1215 $longitude = $self->get_metadata_element ($section, "Longitude");
1216 if (defined $longitude) {
1217 $latitude = $value;
1218 }
1219 }
1220 if (defined $latitude && defined $longitude) {
1221 ##rint STDERR "adding coords for $section, $latitude, $longitude\n";
1222 $self->processCoordinate($section, $latitude, $longitude);
1223 } else {
1224 ##rint STDERR "lat or long not defined, not adding coords for $section\n";
1225 }
1226
1227 }
1228
1229 elsif($field eq "GPS.mapOverlay") { # then the $value is a JSON string
1230
1231 # In order to allow searching map-data-enriched documents by map shape descriptions,
1232 # and to run rawquery searches for other docs by proximity based on their map data,
1233 # need to store the shape descriptions and Coordinate info for shapes into the text index.
1234 # We add the description for each shape in the mapoverlay into the text index as GPSMapOverlayLabel
1235 # And we add Coordinate (CD) and CoordShort (CS) info for each shape in the mapoverlay in the format (Lat, Lng) as ("37S339 175E342")
1236 # where the digits before the N/S/E/W direction represents the whole number. And the digits after the direction are the
1237 # decimal places which can vary between 0 and 2 to 4 digits.
1238
1239 # However, we only want to process GPS.mapOverlay during buildcol and only in the text indexing passes (e.g. sidx and didx for lucene)
1240 # and certainly never during the infodb pass of buildcol. The latter can end up duplicating Coordinate/CoordShort/GPSMapOverlayLabel
1241 # when rebuilding with the online doc editor, as that runs incremental-rebuild which then calls basebuilder::reconstruct_doc_objs_metadata()
1242 # on all docs NOT being incrementally rebuilt. That call would get meta from the infodb and use it to reconstruct the doc objects of docs
1243 # NOT being incrementally built. If the Coord and Label meta were written to the infodb, they would then be loaded back in memory when the
1244 # collection is incrementally rebuilt for those docs that don't need incremental processing. Then this function would once again add the
1245 # same meta into the infodb, thus duplicating what goes into the infodb. Hence, don't do all the following if doc::processor_mode =~ "infodb",
1246 # or anything other than a text mode.
1247
1248 # Note that for incremental rebuilding, the text pass can be called textreindex for instance (and infodb pass can be incinfodb).
1249 # So don't check for exact string match
1250
1251 if($doc::cmd_line_mode eq "buildcol" && $doc::processor_mode =~ m/^text/) { # currently known text processor_modes:
1252 # text, textreindex, possibly textdelete (see ArchivesInfPlugin.pm::read() for last 2).
1253 # OR: ..."buildcol" && $doc::processor_mode !~ m/infodb$/) # if dummy pass important
1254
1255 ###print STDERR "GPS.mapOverlay before decoding, val = " . $value . "\n";
1256
1257 # TODO: html decode?
1258 $value =~ s@&#091;@[@g;
1259 $value =~ s@&#093;@]@g;
1260 $value =~ s@&quot;@"@g;
1261 ###print STDERR "GPS.mapOverlay after decoding, val = " . $value . "\n";
1262
1263 my $json_array = decode_json $value;
1264
1265 foreach my $shape (@$json_array) {
1266
1267 # Put each available shape description/label into this section's metadata with GPSMapOverlayLabel as metaname.
1268 # Just as for Coordinate meta, don't need to know which shape a label belongs too. This is just so each label
1269 # will be indexed, and therefore can be searched.
1270
1271 my $description = $shape->{"description"};
1272 if($description) {
1273 push (@{$section_ptr->{'metadata'}}, ["GPSMapOverlayLabel", $description]);
1274 ###print STDERR "@@@@############################################ Just added description meta: " . $description . "\n";
1275 }
1276
1277 my $type = $shape->{"type"};
1278 ###print STDERR "Shape type : " . $type . "\n";
1279
1280 if($type eq "circle") {
1281 ###print STDERR "Found a circle:\n" . &printShape($json, $shape);
1282
1283 # work out bounding box
1284 # want the inverse of this useful page:
1285 # https://stackoverflow.com/questions/639695/how-to-convert-latitude-or-longitude-to-meters
1286 # https://www.geodatasource.com/developers/javascript
1287
1288 # Dr Bainbridge wants us to use: https://gis.stackexchange.com/questions/5821/calculating-latitude-longitude-x-miles-from-point
1289 # Since for N,E,S,W the following works out the same as above link, we're using the rule of thumb in the following:
1290 # https://gis.stackexchange.com/questions/2951/algorithm-for-offsetting-a-latitude-longitude-by-some-amount-of-meters
1291 # which states
1292 # "If your displacements aren't too great (less than a few kilometers) and you're not right at the poles,
1293 # use the quick and dirty estimate that 111,111 meters (111.111 km) in the y direction is 1 degree
1294 # (of latitude) and 111,111 * cos(latitude) meters in the x direction is 1 degree (of longitude)."
1295 my $centre_lat = $shape->{"center"}->{"lat"};
1296 my $centre_lng = $shape->{"center"}->{"lng"};
1297 my $radius = $shape->{"radius"}; # in metres!
1298
1299 ###print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
1300
1301 my $lat_north = $centre_lat + ($radius/111111);
1302 my $lat_south = $centre_lat - ($radius/111111);
1303
1304 ###print STDERR "### lat_north: $lat_north\n";
1305 ###print STDERR "### lat_south: $lat_south\n";
1306
1307 # our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
1308 # all expect their angle to be in radians. So need to convert from degree to radians before we can take the cos of it.
1309 my $centre_lat_radians = $self->degreesToRadians($centre_lat);
1310 my $cos_in_radians = cos($centre_lat_radians);
1311 ###print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
1312 my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
1313 my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
1314 ###print STDERR "### lng_east $lng_east\n";
1315 ###print STDERR "### lng_west $lng_west\n";
1316
1317 my $cos_lat = cos($centre_lat);
1318 ###print STDERR "cos $centre_lat is $cos_lat\n";
1319
1320 $self->processCoordinate($section, $lat_north, $lng_east);
1321 $self->processCoordinate($section, $lat_south, $lng_east);
1322 $self->processCoordinate($section, $lat_south, $lng_west);
1323 $self->processCoordinate($section, $lat_north, $lng_west);
1324
1325 }
1326 elsif ($type eq "marker") {
1327 ###print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
1328 ###print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
1329 $self->processCoordinate($section, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});
1330 }
1331 elsif ($type eq "polyline" || $type eq "polygon") {
1332 my $path_array = $shape->{"path"};
1333 foreach my $position (@$path_array) {
1334 $self->processCoordinate($section, $position->{"lat"}, $position->{"lng"});
1335 }
1336 }
1337 elsif ($type eq "rectangle") {
1338
1339 my $bounds = $shape->{"bounds"};
1340 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"east"});
1341 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"east"});
1342 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"west"});
1343 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"west"});
1344 }
1345
1346 } # end for on each shape in GPS.mapOverlay
1347 } # end if(buildcol and text pass)
1348 } # end GPS.mapOverlay meta
1349
1350 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1351}
1352
1353# https://en.wikipedia.org/wiki/Radian
1354sub degreesToRadians
1355{
1356 my $self = shift (@_);
1357 my ($degrees) = @_;
1358
1359 return $degrees * pi /180; # returns radians
1360}
1361
1362sub radiansToDegrees
1363{
1364 my $self = shift (@_);
1365 my ($radians) = @_;
1366
1367 return $radians * 180 / pi; # returns degrees
1368}
1369
1370# Call as:
1371# my $json = JSON->new->allow_nonref;
1372# &printAllShapes($json, $json_array);
1373sub printAllShapes {
1374 my ($json, $json_array) = @_;
1375
1376
1377 #my $pretty_print_shape = $json->pretty->encode( $json_array->[0] );
1378 foreach my $shape (@$json_array) {
1379 my $pretty_print_shape = $json->pretty->encode( $shape );
1380 print STDERR "Shape: $pretty_print_shape\n";
1381 }
1382
1383}
1384
1385# For the (lat, lng) coordinate given,
1386# attaches Coordinate and multiple CoordShort (different precision level) metadata to the doc object
1387sub processCoordinate {
1388 my $self = shift (@_);
1389 my ($section, $latitude, $longitude) = @_;
1390
1391 my $section_ptr = $self->_lookup_section($section);
1392
1393 my $lat_direction = ($latitude =~ m/^-/) ? "S" : "N";
1394 my $lng_direction = ($longitude =~ m/^-/) ? "W" : "E";
1395
1396 # have to store (lat, lng) in pairs, when there are so many coords to store
1397 #push (@{$section_ptr->{'metadata'}}, ["Latitude", $latitude]);
1398 #push (@{$section_ptr->{'metadata'}}, ["Longitude", $longitude]);
1399
1400 push (@{$section_ptr->{'metadata'}}, ["Coordinate", "$latitude $longitude"]); # "$latitude$lat_direction $longitude$lng_direction"
1401
1402 my ($latBeforeDec, $latAfterDec);
1403 my ($lngBeforeDec, $lngAfterDec);
1404
1405 if($latitude !~ m/\./) {
1406 $latBeforeDec = $latitude;
1407 $latAfterDec = "";
1408 } else {
1409 ($latBeforeDec, $latAfterDec) = ($latitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1410 }
1411 if($longitude !~ m/\./) {
1412 $lngBeforeDec = $longitude;
1413 $lngAfterDec = "";
1414 } else {
1415 ($lngBeforeDec, $lngAfterDec) = ($longitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1416 }
1417
1418
1419 my $name = "CoordShort";
1420 push (@{$section_ptr->{'metadata'}}, [$name, "$latBeforeDec$lat_direction $lngBeforeDec$lng_direction"]);
1421
1422 for(my $i = 2; $i <= 4; $i++)
1423 {
1424 my $latDecPlaces = (length($latAfterDec) >= $i) ? substr($latAfterDec, 0, $i) : "";
1425 my $lngDecPlaces = (length($lngAfterDec) >= $i) ? substr($lngAfterDec, 0, $i) : "";
1426
1427 push (@{$section_ptr->{'metadata'}}, [$name,
1428 $latBeforeDec . $lat_direction. $latDecPlaces . " " . $lngBeforeDec . $lng_direction. $lngDecPlaces]);
1429
1430 }
1431
1432 #Only add the metadata if it has not already been added
1433 #my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no apparent side-effects.)
1434}
1435
1436
1437# methods for dealing with text
1438
1439# returns the text for a section
1440sub get_text {
1441 my $self = shift (@_);
1442 my ($section) = @_;
1443
1444 my $section_ptr = $self->_lookup_section($section);
1445 if (!defined $section_ptr) {
1446 print STDERR "doc::get_text couldn't find section " .
1447 "$section\n";
1448 return "";
1449 }
1450
1451 return $section_ptr->{'text'};
1452}
1453
1454# returns the (utf-8 encoded) length of the text for a section
1455sub get_text_length {
1456 my $self = shift (@_);
1457 my ($section) = @_;
1458
1459 my $section_ptr = $self->_lookup_section($section);
1460 if (!defined $section_ptr) {
1461 print STDERR "doc::get_text_length couldn't find section " .
1462 "$section\n";
1463 return 0;
1464 }
1465
1466 return length ($section_ptr->{'text'});
1467}
1468
1469# returns the total length for all the sections
1470sub get_total_text_length {
1471 my $self = shift (@_);
1472
1473 my $section = $self->get_top_section();
1474 my $length = 0;
1475 while (defined $section) {
1476 $length += $self->get_text_length($section);
1477 $section = $self->get_next_section($section);
1478 }
1479 return $length;
1480}
1481
1482sub delete_text {
1483 my $self = shift (@_);
1484 my ($section) = @_;
1485
1486 my $section_ptr = $self->_lookup_section($section);
1487 if (!defined $section_ptr) {
1488 print STDERR "doc::delete_text couldn't find section " .
1489 "$section\n";
1490 return;
1491 }
1492
1493 $section_ptr->{'text'} = "";
1494}
1495
1496# add_text assumes the text is in (extended) ascii form. For
1497# text which has been already converted to the UTF-8 format
1498# use add_utf8_text.
1499sub add_text {
1500 my $self = shift (@_);
1501 my ($section, $text) = @_;
1502
1503 # convert the text to UTF-8 encoded unicode characters
1504 # and add the text
1505 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1506}
1507
1508
1509# add_utf8_text assumes the text to be added has already
1510# been converted to the UTF-8 encoding. For ascii text use
1511# add_text
1512# Pass by value version (internally calls pass by ref version
1513# to avoid code duplication)
1514sub add_utf8_text {
1515 my $self = shift (@_);
1516 my ($section, $text) = @_;
1517
1518 $self->add_utf8_textref($section, \$text);
1519}
1520
1521# Pass by reference version, used by GreenstoneSQLPlugin for fulltext
1522sub add_utf8_textref {
1523 my $self = shift (@_);
1524 my ($section, $text_ref) = @_;
1525
1526 my $section_ptr = $self->_lookup_section($section);
1527 if (!defined $section_ptr) {
1528 print STDERR "doc::add_utf8_textref couldn't find section " .
1529 "$section\n";
1530 return;
1531 }
1532
1533 $section_ptr->{'text'} .= $$text_ref;
1534}
1535
1536# returns the Source meta, which is the utf8 filename generated.
1537# Added a separate method here for convenience
1538sub get_source {
1539 my $self = shift (@_);
1540 return $self->get_metadata_element ($self->get_top_section(), "Source");
1541}
1542
1543# returns the SourceFile meta, which is the url reference to the URL-encoded
1544# version of Source (the utf8 filename). Added a separate method here for convenience
1545sub get_sourcefile {
1546 my $self = shift (@_);
1547 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1548}
1549
1550# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1551# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1552sub get_assocfile_from_sourcefile {
1553 my $self = shift (@_);
1554
1555 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1556 my $top_section = $self->get_top_section();
1557 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1558
1559 # get the actual filename as it exists on the filesystem which this url refers to
1560 $source_file = &unicode::url_to_filename($source_file);
1561 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1562 return $assocfilename;
1563}
1564
1565# methods for dealing with associated files
1566
1567# a file is associated with a document, NOT a section.
1568# if section is defined it is noted in the data structure
1569# only so that files associated from a particular section
1570# may be removed later (using delete_section_assoc_files)
1571sub associate_file {
1572 my $self = shift (@_);
1573 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1574 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1575
1576 # remove all associated files with the same name
1577 $self->delete_assoc_file ($assoc_filename);
1578
1579 # Too harsh a requirement
1580 # Definitely get HTML docs, for example, with some missing
1581 # support files
1582# if (!&util::fd_exists($real_filename)) {
1583# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1584# exit -1;
1585# }
1586
1587# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1588# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1589## my $utf8_filename = Encode::encode("utf8",$filename);
1590
1591 push (@{$self->{'associated_files'}},
1592 [$real_filename, $assoc_filename, $mime_type, $section]);
1593}
1594
1595# returns a list of associated files in the form
1596# [[real_filename, assoc_filename, mimetype], ...]
1597sub get_assoc_files {
1598 my $self = shift (@_);
1599
1600 return $self->{'associated_files'};
1601}
1602
1603# the following two methods used to keep track of original associated files
1604# for incremental building. eg a txt file used by an item file does not end
1605# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1606# file for incremental build
1607sub associate_source_file {
1608 my $self = shift (@_);
1609 my ($full_filename) = @_;
1610
1611 push (@{$self->{'source_assoc_files'}}, $full_filename);
1612
1613}
1614
1615sub get_source_assoc_files {
1616 my $self = shift (@_);
1617
1618 return $self->{'source_assoc_files'};
1619
1620
1621}
1622sub metadata_file {
1623 my $self = shift (@_);
1624 my ($real_filename, $filename) = @_;
1625
1626 push (@{$self->{'metadata_files'}},
1627 [$real_filename, $filename]);
1628}
1629
1630# used for writing out the archiveinf-doc info database, to list all the metadata files
1631sub get_meta_files {
1632 my $self = shift (@_);
1633
1634 return $self->{'metadata_files'};
1635}
1636
1637sub delete_section_assoc_files {
1638 my $self = shift (@_);
1639 my ($section) = @_;
1640
1641 my $i=0;
1642 while ($i < scalar (@{$self->{'associated_files'}})) {
1643 if (defined $self->{'associated_files'}->[$i]->[3] &&
1644 $self->{'associated_files'}->[$i]->[3] eq $section) {
1645 splice (@{$self->{'associated_files'}}, $i, 1);
1646 } else {
1647 $i++;
1648 }
1649 }
1650}
1651
1652sub delete_assoc_file {
1653 my $self = shift (@_);
1654 my ($assoc_filename) = @_;
1655
1656 my $i=0;
1657 while ($i < scalar (@{$self->{'associated_files'}})) {
1658 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1659 splice (@{$self->{'associated_files'}}, $i, 1);
1660 } else {
1661 $i++;
1662 }
1663 }
1664}
1665
1666sub reset_nextsection_ptr {
1667 my $self = shift (@_);
1668 my ($section) = @_;
1669
1670 my $section_ptr = $self->_lookup_section($section);
1671 $section_ptr->{'next_subsection'} = 1;
1672}
1673
16741;
Note: See TracBrowser for help on using the repository browser.