source: main/trunk/greenstone2/perllib/doc.pm@ 35401

Last change on this file since 35401 was 34506, checked in by ak19, 4 years ago

Redoing work of commit revision 34394: Redoing Bugfix 1 for GLI doc.xml metadata slowdown resulting from earlier bugfix to help GLI cope with filenames and assigned meta that have non-ASCII chars in them. The slowdown happened when gathered files got selected in GLI and was fixed in commit 34394, but the fix was not ideal for 2 reasons. 1. A new form of filename encoding (hexed unicode) going into doc.xml, instead of existing encodings like URL and base64, though those existing encodings weren't the right ones for my first solution. 2. The solution was specific to Windows to cope with special chars in filenames and relied on a new meta field gsdlfullsourcepath being written out to doc.xml by doc.pm. So a built collection moved from Linux to Windows won't show up doc.xml meta in GLI, as it won't have the new doc.xml meta field that Windows is expecting. Have a better solution for 1 that doesn't require the new field. But still can't fix all of point 2, as the existing gsdlsourcefilename meta field in doc.xml can contain Windows Short filenames when the coll is built on Windows and this won't be backwards compatible on Linux anyway. This problem existed before too, except I didn't realise it until now. But the new solution fixes more issues. First step: undoing doc.pm adding new metadata field gsdlfullsourcepath.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 52.8 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use FileUtils;
40use ghtml;
41use File::stat;
42##use hashdoc;
43use docprint;
44use JSON;
45
46# We just need pi from the Trig lib
47# Import constants pi2, pip2, pip4 (2*pi, pi/2, pi/4).
48# But doing
49# use Math::Trig ':pi';
50# resulted in errors on 32 bit linux VM with perl v8 of "bareword pi not allowed while strict subs in use"
51# So, now doing just
52use Math::Trig;
53
54# the document type may be indexed_doc, nonindexed_doc, or
55# classification
56
57our $OIDcount = 0;
58
59
60# Declare the Package (level) Variable that will tell us whether we're in the import or buildcol phase.
61# This var will be set by import.pl, buildcol.pl and export.pl to the then appropriate value.
62# https://perlmaven.com/package-variables-and-lexical-variables-in-perl
63# But: https://stackoverflow.com/questions/7542120/perl-how-to-make-variables-from-requiring-script-available-in-required-script
64# https://stackoverflow.com/questions/33230015/how-do-i-use-a-variable-in-another-perl-script
65#$doc::cmd_line_mode = undef;
66# Decided to follow the usage of $OIDcount above to declare the necessary package-level variables 'globally' accessible
67# actually accessible only for those packages that import this one (with 'use doc')
68my $cmd_line_mode = undef;
69
70# processor_mode keeps track of which buildcol pass we're at: dummy, text (sidx/didx passes) or infodb
71my $processor_mode = undef;
72
73# rename_method can be 'url', 'none', 'base64'
74sub new {
75 my $class = shift (@_);
76 my ($source_filename, $doc_type, $rename_method) = @_;
77
78
79 my $self = bless {'associated_files'=>[],
80 'subsection_order'=>[],
81 'next_subsection'=>1,
82 'subsections'=>{},
83 'metadata'=>[],
84 'text'=>"",
85 'OIDtype'=>"hash"}, $class;
86
87 # used to set lastmodified here, but this can screw up the HASH ids, so
88 # the docsave processor now calls set_lastmodified
89
90 $self->set_source_path($source_filename);
91
92 if (defined $source_filename) {
93 $source_filename = &util::filename_within_collection($source_filename);
94 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
95 $self->set_source_filename ($source_filename, $rename_method);
96 }
97
98 $self->set_doc_type ($doc_type) if defined $doc_type;
99
100 return $self;
101}
102
103
104sub set_source_path
105{
106 my $self = shift @_;
107 my ($source_filename) = @_;
108
109 if (defined $source_filename) {
110 # On Windows the source_filename can be in terse DOS format
111 # e.g. test~1.txt
112
113 $self->{'terse_source_path'} = $source_filename;
114
115 # Use the FileUtil library methods as they are aware of more special
116 # cases such as HDFS [jmt12]
117 if (&FileUtils::fileExists($source_filename))
118 {
119 # See if we can do better for Windows with a filename
120 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
121 require Win32;
122 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
123 }
124 else {
125 # For Unix-based systems, there is no difference between the two
126 $self->{'source_path'} = $source_filename;
127 }
128
129 ## Unused.
130 # On Windows, the code above has ensured source_path is the Win long (full) path name.
131 # To help GLI associate DocXML metadata with an easily calculated and accurate representation
132 # of filenames, we now store the Win long path name, hex encoded.
133 # We're not using this field on Linux, as I can't get the hex encodings generated to match
134 # what GLI Java code generates. But for symmetry we store this field on Unix too, but we need
135 # to hex-encode source_path on Unix too, or it may not be UTF-8 and doc.xml will be invalid
136 ##my $hexencodedlongsourcepath = &unicode::debug_unicode_string($self->{'source_path'});
137 ##$self->set_utf8_metadata_element ($self->get_top_section(), "gsdlfullsourcepath", $hexencodedlongsourcepath);
138
139 }
140 else {
141 print STDERR "Warning: In doc::set_source_path(), file\n";
142 print STDERR " $source_filename\n";
143 print STDERR " does not exist\n";
144
145 # (default) Set it to whatever we were given
146 $self->{'source_path'} = $source_filename;
147 }
148 }
149 else {
150 # Previous code for setting source_path allowed for
151 # it to be undefined, so continue this practice
152 $self->{'terse_source_path'} = undef;
153 $self->{'source_path'} = undef;
154 }
155}
156
157
158sub get_source_path
159{
160 my $self = shift @_;
161
162 return $self->{'terse_source_path'};
163}
164
165# set lastmodified for OAI purposes, added by GRB, moved by kjdon
166sub set_oailastmodified {
167 my $self = shift (@_);
168
169 my $source_path = $self->{'terse_source_path'};
170
171 if (defined $source_path && (-e $source_path)) {
172 my $current_time = time;
173
174 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
175 $wday, $yday, $isdst) = localtime($current_time);
176
177 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
178
179 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
180 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
181 }
182}
183
184# no longer used for OAI purposes, since lastmodified is not what we want as the
185# Datestamp of a document. This doc metadata may be useful for general purposes.
186sub set_lastmodified {
187 my $self = shift (@_);
188
189 my $source_path = $self->{'terse_source_path'};
190
191 if (defined $source_path && (-e $source_path)) {
192
193 my $file_stat = stat($source_path);
194 my $mtime = $file_stat->mtime;
195 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
196 $wday, $yday, $isdst) = localtime($mtime);
197
198 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
199
200 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
201 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
202 }
203}
204
205# clone the $self object
206sub duplicate {
207 my $self = shift (@_);
208
209 my $newobj = {};
210
211 foreach my $k (keys %$self) {
212 $newobj->{$k} = &clone ($self->{$k});
213 }
214
215 bless $newobj, ref($self);
216 return $newobj;
217}
218
219sub clone {
220 my ($from) = @_;
221 my $type = ref ($from);
222
223 if ($type eq "HASH") {
224 my $to = {};
225 foreach my $key (keys %$from) {
226 $to->{$key} = &clone ($from->{$key});
227 }
228 return $to;
229 } elsif ($type eq "ARRAY") {
230 my $to = [];
231 foreach my $v (@$from) {
232 push (@$to, &clone ($v));
233 }
234 return $to;
235 } else {
236 return $from;
237 }
238}
239
240sub set_OIDtype {
241 my $self = shift (@_);
242 my ($type, $metadata) = @_;
243
244 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
245 $self->{'OIDtype'} = $type;
246 } else {
247 $self->{'OIDtype'} = "hash";
248 }
249
250 if ($type =~ /^assigned$/) {
251 if (defined $metadata) {
252 $self->{'OIDmetadata'} = $metadata;
253 } else {
254 $self->{'OIDmetadata'} = "dc.Identifier";
255 }
256 }
257}
258
259# rename_method can be 'url', 'none', 'base64'
260sub set_source_filename {
261 my $self = shift (@_);
262 my ($source_filename, $rename_method) = @_;
263
264 # Since the gsdlsourcefilename element goes into the doc.xml it has
265 # to be utf8. However, it should also *represent* the source filename
266 # (in the import directory) which may not be utf8 at all.
267 # For instance, if this meta element (gsdlsourcefilename) will be used
268 # by other applications that parse doc.xml in order to locate
269 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
270 # encode the real filename as this is a binary-to-text encoding meaning
271 # that the resulting string is ASCII (utf8). Decoding will give the original.
272
273# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
274## print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename\n";
275
276 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
277 $source_filename = $self->encode_filename($source_filename, $rename_method);
278# my ($srcfilename,$dirname,$suffix)
279# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
280# print STDERR "-> $srcfilename -> ";
281# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
282# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
283# print STDERR "$source_filename\n";
284
285## print STDERR "AFTER URL/base64 encoding gsdlsourcefilename: $source_filename\n";
286
287 $self->set_utf8_metadata_element ($self->get_top_section(),
288 "gsdlsourcefilename",
289 $source_filename);
290
291 # If we set the file renaming method, at least GLI can decode gsdlsourcefilename to get the
292 # original filename (relative path to file) back for GLI to notice what ex Meta is associated with it
293 $self->set_utf8_metadata_element($self->get_top_section(),
294 "gsdlsourcefilerenamemethod",
295 $rename_method);
296}
297
298sub encode_filename {
299 my $self = shift (@_);
300 my ($source_filename, $rename_method) = @_;
301
302 my ($srcfilename,$dirname,$suffix)
303 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
304# print STDERR "-> $srcfilename -> ";
305 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
306
307 # encode any subdirs of "import" also, but not import itself
308 my $sep = ($ENV{'GSDLOS'} =~ /^windows$/i) ? "\\" : "/";
309 my @dirs = ($sep eq "\\") ? split(/[\\\/]+/, $dirname) : split(/\//, $dirname);
310
311 my $dirpath = "";
312
313 # Don't encode the first folder ("import" or "tmp"): GLI's DocXMLFile.java looks for literal "import"
314 # or "tmp" before it knows what the file rename method is that is to be used decode the rest of gsdlsourcefilename.
315 if(scalar (@dirs) >= 1) {
316 $dirpath = shift(@dirs);
317 }
318 foreach my $subdir (@dirs) {
319 #print STDERR "@@@@ Found subdir: $subdir\n";
320
321 $subdir = &util::rename_file($subdir, $rename_method);
322 #print STDERR "@@@@ encoded subdir: $subdir\n";
323
324 $dirpath = &FileUtils::filenameConcatenate($dirpath, $subdir);
325 }
326 if($dirpath ne "") {
327 $dirname = $dirpath;
328 }
329
330 $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
331
332 return $source_filename;
333}
334
335sub set_converted_filename {
336 my $self = shift (@_);
337 my ($converted_filename) = @_;
338
339 # we know the converted filename is utf8
340 $self->set_utf8_metadata_element ($self->get_top_section(),
341 "gsdlconvertedfilename",
342 $converted_filename);
343}
344
345# returns the source_filename as it was provided
346sub get_unmodified_source_filename {
347 my $self = shift (@_);
348
349 return $self->{'terse_source_path'};
350}
351
352# returns the source_filename with whatever rename_method was given
353sub get_source_filename {
354 my $self = shift (@_);
355
356 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
357}
358
359
360
361# returns converted filename if available else returns source filename
362sub get_filename_for_hashing {
363 my $self = shift (@_);
364
365 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
366
367 if (!defined $filename) {
368 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
369 # if NULPlug processed file, then don't give a filename
370 if (defined $plugin_name && $plugin_name eq "NULPlug") {
371 $filename = undef;
372 } else { # returns the URL encoded source filename!
373 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
374 }
375 }
376
377 if (!&FileUtils::isFilenameAbsolute($filename)) {
378 $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
379 }
380
381 return $filename;
382}
383
384sub set_doc_type {
385 my $self = shift (@_);
386 my ($doc_type) = @_;
387
388 $self->set_metadata_element ($self->get_top_section(),
389 "gsdldoctype",
390 $doc_type);
391}
392
393# returns the gsdldoctype as it was provided
394# the default of "indexed_doc" is used if no document
395# type was provided
396sub get_doc_type {
397 my $self = shift (@_);
398
399 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
400 return $doc_type if (defined $doc_type);
401 return "indexed_doc";
402}
403
404
405# look up the reference to the a particular section
406sub _lookup_section {
407 my $self = shift (@_);
408 my ($section) = @_;
409
410 my ($num);
411 my $sectionref = $self;
412
413 while (defined $section && $section ne "") {
414
415 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
416
417 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
418
419 $section = "" unless defined $section;
420
421
422 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
423 $sectionref = $sectionref->{'subsections'}->{$num};
424 } else {
425 return undef;
426 }
427 }
428
429 return $sectionref;
430}
431
432# calculate OID by hashing the contents of the document
433sub _calc_OID {
434 my $self = shift (@_);
435 my ($filename) = @_;
436
437
438 my $osexe = &util::get_os_exe();
439
440 my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
441 $ENV{'GSDLOS'},"hashfile$osexe");
442
443 &util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
444 # The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
445
446 # A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
447 # (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
448 # E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
449
450 my $result = "NULL";
451
452
453 if (-e "$hashfile_exe") {
454# $result = `\"$hashfile_exe\" \"$filename\"`;
455# $result = `hashfile$osexe \"$filename\" 2>&1`;
456 $result = `hashfile$osexe \"$filename\"`;
457
458 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
459 } else {
460 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
461 }
462 return "HASH$result";
463}
464
465# methods dealing with OID, not groups of them.
466
467# if $OID is not provided one is calculated
468sub set_OID {
469 my $self = shift (@_);
470 my ($OID) = @_;
471
472 my $use_hash_oid = 0;
473 # if an OID wasn't provided calculate one
474 if (!defined $OID) {
475 $OID = "NULL";
476 if ($self->{'OIDtype'} =~ /^hash/) {
477 $use_hash_oid = 1;
478 } elsif ($self->{'OIDtype'} eq "incremental") {
479 $OID = "D" . $OIDcount;
480 $OIDcount ++;
481 } elsif ($self->{'OIDtype'} eq "filename") {
482 my $filename = $self->get_source_filename();
483 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
484 $OID = &util::tidy_up_oid($OID);
485 } elsif ($self->{'OIDtype'} eq "full_filename") {
486 my $source_filename = $self->get_source_filename();
487 my $dirsep = &util::get_os_dirsep();
488
489 $source_filename =~ s/^import$dirsep//;
490 $source_filename =~ s/$dirsep/-/g;
491 $source_filename =~ s/\./_/g;
492
493 $OID = $source_filename;
494 $OID = &util::tidy_up_oid($OID);
495 } elsif ($self->{'OIDtype'} eq "dirname") {
496 my $filename = $self->get_source_filename();
497 if (defined($filename)) { # && -e $filename) {
498 # get the immediate parent directory
499 $OID = &File::Basename::dirname($filename);
500 if (defined $OID) {
501 $OID = &File::Basename::basename($OID);
502 $OID = &util::tidy_up_oid($OID);
503 } else {
504 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
505 $use_hash_oid = 1;
506 }
507 } else {
508 print STDERR "Failed to find a filename, generating hash id\n";
509 $use_hash_oid = 1;
510 }
511
512 } elsif ($self->{'OIDtype'} eq "assigned") {
513 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
514 if (defined $identifier && $identifier ne "") {
515 $OID = $identifier;
516 $OID = &util::tidy_up_oid($OID);
517 } else {
518 # need a hash id
519 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
520 $use_hash_oid = 1;
521 }
522
523 } else {
524 $use_hash_oid = 1;
525 }
526
527 if ($use_hash_oid) {
528 my $hash_on_file = 1;
529 my $hash_on_ga_xml = 0;
530
531 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
532 $hash_on_file = 0;
533 $hash_on_ga_xml = 1;
534 }
535
536 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
537 $hash_on_file = 0;
538 $hash_on_ga_xml = 0;
539
540 my $source_filename = $self->get_source_filename();
541 my $dirsep = &util::get_os_dirsep();
542
543 $source_filename =~ s/^import$dirsep//;
544 $source_filename =~ s/$dirsep/-/g;
545 $source_filename =~ s/\./_/g;
546
547 # If the filename is very short then (handled naively)
548 # this can cause conjestion in the hash-values
549 # computed, leading documents sharing the same leading
550 # Hex values in the computed has.
551 #
552 # The solution taken here is to replace the name of
553 # the file name a sufficient number of times (up to
554 # the character limit defined in 'rep_limit' and
555 # make that the content that is hashed on
556
557 # *** Think twice before changing the following value
558 # as it will break backward compatability of computed
559 # document HASH values
560
561 my $rep_limit = 256;
562 my $hash_content = undef;
563
564 if (length($source_filename)<$rep_limit) {
565 my $rep_string = "$source_filename|";
566 my $rs_len = length($rep_string);
567
568 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
569
570 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
571 }
572 else {
573 $hash_content = $source_filename;
574 }
575
576 my $filename = &util::get_tmp_filename();
577 if (!open (OUTFILE, ">:utf8", $filename)) {
578 print STDERR "doc::set_OID could not write to $filename\n";
579 } else {
580 print OUTFILE $hash_content;
581 close (OUTFILE);
582 }
583 $OID = $self->_calc_OID ($filename);
584
585 &FileUtils::removeFiles ($filename);
586 }
587
588 if ($hash_on_file) {
589 # "hash" OID - feed file to hashfile.exe
590 my $filename = $self->get_filename_for_hashing();
591
592 # -z: don't want to hash on the file if it is zero size
593 if (defined($filename) && -e $filename && !-z $filename) {
594 $OID = $self->_calc_OID ($filename);
595 } else {
596 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
597 }
598 }
599
600 if ($hash_on_ga_xml) {
601 # In addition being asked to explicity calculate the has based on the GA file,
602 # can also end up coming into this block is doing 'hash_on_file' but the file
603 # itself is of zero bytes (as could be the case with 'doc.nul' file
604
605 my $filename = &util::get_tmp_filename();
606 if (!open (OUTFILE, ">:utf8", $filename)) {
607 print STDERR "doc::set_OID could not write to $filename\n";
608 } else {
609 my $doc_text = &docprint::get_section_xml($self);
610 print OUTFILE $doc_text;
611 close (OUTFILE);
612 }
613 $OID = $self->_calc_OID ($filename);
614 &FileUtils::removeFiles($filename);
615 }
616 }
617 }
618 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
619}
620
621# this uses hashdoc (embedded c thingy) which is faster but still
622# needs a little work to be sufficiently stable
623sub ___set_OID {
624 my $self = shift (@_);
625 my ($OID) = @_;
626
627 # if an OID wasn't provided then calculate hash value based on document
628 if (!defined $OID)
629 {
630 my $hash_text = &docprint::get_section_xml($self);
631 my $hash_len = length($hash_text);
632
633 $OID = &hashdoc::buffer($hash_text,$hash_len);
634 }
635
636 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
637}
638
639# returns the OID for this document
640sub get_OID {
641 my $self = shift (@_);
642 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
643 return $OID if (defined $OID);
644 return "NULL";
645}
646
647sub delete_OID {
648 my $self = shift (@_);
649
650 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
651}
652
653
654# methods for manipulating section names
655
656# returns the name of the top-most section (the top
657# level of the document
658sub get_top_section {
659 my $self = shift (@_);
660
661 return "";
662}
663
664# returns a section
665sub get_parent_section {
666 my $self = shift (@_);
667 my ($section) = @_;
668
669 $section =~ s/(^|\.)\d+$//;
670
671 return $section;
672}
673
674# returns the first child section (or the end child
675# if there isn't any)
676sub get_begin_child {
677 my $self = shift (@_);
678 my ($section) = @_;
679
680 my $section_ptr = $self->_lookup_section($section);
681 return "" unless defined $section_ptr;
682
683 if (defined $section_ptr->{'subsection_order'}->[0]) {
684 return "$section.$section_ptr->{'subsection_order'}->[0]";
685 }
686
687 return $self->get_end_child ($section);
688}
689
690# returns the next child of a parent section
691sub get_next_child {
692 my $self = shift (@_);
693 my ($section) = @_;
694
695 my $parent_section = $self->get_parent_section($section);
696 my $parent_section_ptr = $self->_lookup_section($parent_section);
697 return undef unless defined $parent_section_ptr;
698
699 my ($section_num) = $section =~ /(\d+)$/;
700 return undef unless defined $section_num;
701
702 my $i = 0;
703 my $section_order = $parent_section_ptr->{'subsection_order'};
704 while ($i < scalar(@$section_order)) {
705 last if $section_order->[$i] eq $section_num;
706 $i++;
707 }
708
709 $i++; # the next child
710 if ($i < scalar(@$section_order)) {
711 return $section_order->[$i] if $parent_section eq "";
712 return "$parent_section.$section_order->[$i]";
713 }
714
715 # no more sections in this level
716 return undef;
717}
718
719# returns a reference to a list of children
720sub get_children {
721 my $self = shift (@_);
722 my ($section) = @_;
723
724 my $section_ptr = $self->_lookup_section($section);
725 return [] unless defined $section_ptr;
726
727 my @children = @{$section_ptr->{'subsection_order'}};
728
729 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
730 return \@children;
731}
732
733# returns the child section one past the last one (which
734# is coded as "0")
735sub get_end_child {
736 my $self = shift (@_);
737 my ($section) = @_;
738
739 return $section . ".0" unless $section eq "";
740 return "0";
741}
742
743# returns the next section in book order
744sub get_next_section {
745 my $self = shift (@_);
746 my ($section) = @_;
747
748 return undef unless defined $section;
749
750 my $section_ptr = $self->_lookup_section($section);
751 return undef unless defined $section_ptr;
752
753 # first try to find first child
754 if (defined $section_ptr->{'subsection_order'}->[0]) {
755 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
756 return "$section.$section_ptr->{'subsection_order'}->[0]";
757 }
758
759 do {
760 # try to find sibling
761 my $next_child = $self->get_next_child ($section);
762 return $next_child if (defined $next_child);
763
764 # move up one level
765 $section = $self->get_parent_section ($section);
766 } while $section =~ /\d/;
767
768 return undef;
769}
770
771sub is_leaf_section {
772 my $self = shift (@_);
773 my ($section) = @_;
774
775 my $section_ptr = $self->_lookup_section($section);
776 return 1 unless defined $section_ptr;
777
778 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
779}
780
781# methods for dealing with sections
782
783# returns the name of the inserted section
784sub insert_section {
785 my $self = shift (@_);
786 my ($before_section) = @_;
787
788 # get the child to insert before and its parent section
789 my $parent_section = "";
790 my $before_child = "0";
791 my @before_section = split (/\./, $before_section);
792 if (scalar(@before_section) > 0) {
793 $before_child = pop (@before_section);
794 $parent_section = join (".", @before_section);
795 }
796
797 my $parent_section_ptr = $self->_lookup_section($parent_section);
798 if (!defined $parent_section_ptr) {
799 print STDERR "doc::insert_section couldn't find parent section " .
800 "$parent_section\n";
801 return;
802 }
803
804 # get the next section number
805 my $section_num = $parent_section_ptr->{'next_subsection'}++;
806
807 my $i = 0;
808 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
809 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
810 $i++;
811 }
812
813 # insert the section number into the order list
814 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
815
816 # add this section to the parent section
817 my $section_ptr = {'subsection_order'=>[],
818 'next_subsection'=>1,
819 'subsections'=>{},
820 'metadata'=>[],
821 'text'=>""};
822 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
823
824 # work out the full section number
825 my $section = $parent_section;
826 $section .= "." unless $section eq "";
827 $section .= $section_num;
828
829 return $section;
830}
831
832# creates a pre-named section
833sub create_named_section {
834 my $self = shift (@_);
835 my ($mastersection) = @_;
836
837 my ($num);
838 my $section = $mastersection;
839 my $sectionref = $self;
840
841 while ($section ne "") {
842 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
843 $num =~ s/^0+(\d)/$1/; # remove leading 0s
844 $section = "" unless defined $section;
845
846 if (defined $num) {
847 if (!defined $sectionref->{'subsections'}->{$num}) {
848 push (@{$sectionref->{'subsection_order'}}, $num);
849 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
850 'next_subsection'=>1,
851 'subsections'=>{},
852 'metadata'=>[],
853 'text'=>""};
854 if ($num >= $sectionref->{'next_subsection'}) {
855 $sectionref->{'next_subsection'} = $num + 1;
856 }
857 }
858 $sectionref = $sectionref->{'subsections'}->{$num};
859
860 } else {
861 print STDERR "doc::create_named_section couldn't create section ";
862 print STDERR "$mastersection\n";
863 last;
864 }
865 }
866}
867
868# returns a reference to a list of subsections
869sub list_subsections {
870 my $self = shift (@_);
871 my ($section) = @_;
872
873 my $section_ptr = $self->_lookup_section ($section);
874 if (!defined $section_ptr) {
875 print STDERR "doc::list_subsections couldn't find section $section\n";
876 return [];
877 }
878
879 return [@{$section_ptr->{'subsection_order'}}];
880}
881
882sub delete_section {
883 my $self = shift (@_);
884 my ($section) = @_;
885
886# my $section_ptr = {'subsection_order'=>[],
887# 'next_subsection'=>1,
888# 'subsections'=>{},
889# 'metadata'=>[],
890# 'text'=>""};
891
892 # if this is the top section reset everything
893 if ($section eq "") {
894 $self->{'subsection_order'} = [];
895 $self->{'subsections'} = {};
896 $self->{'metadata'} = [];
897 $self->{'text'} = "";
898 return;
899 }
900
901 # find the parent of the section to delete
902 my $parent_section = "";
903 my $child = "0";
904 my @section = split (/\./, $section);
905 if (scalar(@section) > 0) {
906 $child = pop (@section);
907 $parent_section = join (".", @section);
908 }
909
910 my $parent_section_ptr = $self->_lookup_section($parent_section);
911 if (!defined $parent_section_ptr) {
912 print STDERR "doc::delete_section couldn't find parent section " .
913 "$parent_section\n";
914 return;
915 }
916
917 # remove this section from the subsection_order list
918 my $i = 0;
919 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
920 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
921 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
922 last;
923 }
924 $i++;
925 }
926
927 # remove this section from the subsection hash
928 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
929 undef $parent_section_ptr->{'subsections'}->{$child};
930 }
931}
932
933#--
934# methods for dealing with metadata
935
936# set_metadata_element and get_metadata_element are for metadata
937# which should only have one value. add_meta_data and get_metadata
938# are for metadata which can have more than one value.
939
940# returns the first metadata value which matches field
941
942# This version of get metadata element works much like the one above,
943# except it allows for the namespace portion of a metadata element to
944# be ignored, thus if you are searching for dc.Title, the first piece
945# of matching metadata ending with the name Title (once any namespace
946# is removed) would be returned.
947# 28-11-2003 John Thompson
948sub get_metadata_element {
949 my $self = shift (@_);
950 my ($section, $field, $ignore_namespace) = @_;
951 my ($data);
952
953 $ignore_namespace = 0 unless defined $ignore_namespace;
954
955 my $section_ptr = $self->_lookup_section($section);
956 if (!defined $section_ptr) {
957 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
958 return;
959 }
960
961 # Remove any namespace if we are being told to ignore them
962 if($ignore_namespace) {
963 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
964 }
965
966 foreach $data (@{$section_ptr->{'metadata'}}) {
967
968 my $data_name = $data->[0];
969
970 # Remove any namespace if we are being told to ignore them
971 if($ignore_namespace) {
972 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
973 }
974 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
975 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
976
977 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
978 }
979
980 return undef; # was not found
981}
982
983# returns a list of the form [value1, value2, ...]
984sub get_metadata {
985 my $self = shift (@_);
986 my ($section, $field, $ignore_namespace) = @_;
987 my ($data);
988
989 $ignore_namespace = 0 unless defined $ignore_namespace;
990
991 my $section_ptr = $self->_lookup_section($section);
992 if (!defined $section_ptr) {
993 print STDERR "doc::get_metadata couldn't find section ",
994 $section, "\n";
995 return;
996 }
997
998 # Remove any namespace if we are being told to ignore them
999 if($ignore_namespace) {
1000 $field =~ s/^.*\.//;
1001 }
1002
1003 my @metadata = ();
1004 foreach $data (@{$section_ptr->{'metadata'}}) {
1005
1006 my $data_name = $data->[0];
1007
1008 # Remove any namespace if we are being told to ignore them
1009 if($ignore_namespace) {
1010 $data_name =~ s/^.*\.//;
1011 }
1012 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
1013 $data_name =~ s/^ex\.([^.]+)$/$1/;
1014
1015 push (@metadata, $data->[1]) if ($data_name eq $field);
1016 }
1017
1018 return \@metadata;
1019}
1020
1021sub get_metadata_hashmap {
1022 my $self = shift (@_);
1023 my ($section, $opt_namespace) = @_;
1024
1025 my $section_ptr = $self->_lookup_section($section);
1026 if (!defined $section_ptr) {
1027 print STDERR "doc::get_metadata couldn't find section ",
1028 $section, "\n";
1029 return;
1030 }
1031
1032 my $metadata_hashmap = {};
1033 foreach my $data (@{$section_ptr->{'metadata'}}) {
1034 my $metaname = $data->[0];
1035
1036 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
1037 if (!defined $metadata_hashmap->{$metaname}) {
1038 $metadata_hashmap->{$metaname} = [];
1039 }
1040 my $metaval_list = $metadata_hashmap->{$metaname};
1041 push(@$metaval_list, $data->[1]);
1042 }
1043 }
1044
1045 return $metadata_hashmap;
1046}
1047
1048# returns a list of the form [[field,value],[field,value],...]
1049sub get_all_metadata {
1050 my $self = shift (@_);
1051 my ($section) = @_;
1052
1053 my $section_ptr = $self->_lookup_section($section);
1054 if (!defined $section_ptr) {
1055 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
1056 return;
1057 }
1058
1059 return $section_ptr->{'metadata'};
1060}
1061
1062# $value is optional
1063sub delete_metadata {
1064 my $self = shift (@_);
1065 my ($section, $field, $value) = @_;
1066
1067 my $section_ptr = $self->_lookup_section($section);
1068 if (!defined $section_ptr) {
1069 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
1070 return;
1071 }
1072
1073 my $i = 0;
1074 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1075 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1076 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1077 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1078 } else {
1079 $i++;
1080 }
1081 }
1082}
1083
1084sub delete_all_metadata {
1085 my $self = shift (@_);
1086 my ($section) = @_;
1087
1088 my $section_ptr = $self->_lookup_section($section);
1089 if (!defined $section_ptr) {
1090 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1091 return;
1092 }
1093
1094 $section_ptr->{'metadata'} = [];
1095}
1096
1097sub set_metadata_element {
1098 my $self = shift (@_);
1099 my ($section, $field, $value) = @_;
1100
1101 $self->set_utf8_metadata_element ($section, $field,
1102 &unicode::ascii2utf8(\$value));
1103}
1104
1105# set_utf8_metadata_element assumes the text has already been
1106# converted to the UTF-8 encoding.
1107sub set_utf8_metadata_element {
1108 my $self = shift (@_);
1109 my ($section, $field, $value) = @_;
1110
1111 $self->delete_metadata ($section, $field);
1112 $self->add_utf8_metadata ($section, $field, $value);
1113}
1114
1115
1116# add_metadata assumes the text is in (extended) ascii form. For
1117# text which has already been converted to the UTF-8 format use
1118# add_utf8_metadata.
1119sub add_metadata {
1120 my $self = shift (@_);
1121 my ($section, $field, $value) = @_;
1122
1123 $self->add_utf8_metadata ($section, $field,
1124 &unicode::ascii2utf8(\$value));
1125}
1126
1127sub add_utf8_metadata {
1128 my $self = shift (@_);
1129 my ($section, $field, $value) = @_;
1130
1131 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1132 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1133 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1134
1135 my $section_ptr = $self->_lookup_section($section);
1136 if (!defined $section_ptr) {
1137 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1138 return;
1139 }
1140 if (!defined $value) {
1141 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1142 return;
1143 }
1144 if (!defined $field) {
1145 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1146 return;
1147 }
1148
1149 #print STDERR "###$field=$value\n";
1150
1151 # For now, supress this check. Given that text data read in is now
1152 # Unicode aware, then the following block of code can (ironically enough)
1153 # cause our unicode compliant string to be re-encoded (leading to
1154 # a double-encoded UTF-8 string, which we definitely don't want!).
1155
1156
1157 # double check that the value is utf-8
1158 # if (!&unicode::check_is_utf8($value)) {
1159 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1160 # &unicode::ensure_utf8(\$value);
1161 # print STDERR " Tried converting to utf8: $value\n";
1162 # }
1163
1164 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1165 if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1166 {
1167 my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1168 if (defined $mdprefix) {
1169 # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1170 push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1171 }
1172
1173 my $direction;
1174 # TODO does the following work, with eq latitude? the above code implies can be xxxxx.Latitude. should use metaname??
1175 if($value =~ m/^-/)
1176 {
1177 $direction = ($field eq "Latitude") ? "S" : "W";
1178 }
1179 else
1180 {
1181 $direction = ($field eq "Latitude") ? "N" : "E";
1182 }
1183
1184 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1185 if(defined $beforeDec && defined $afterDec)
1186 {
1187 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1188 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1189
1190 for(my $i = 2; $i <= 4; $i++)
1191 {
1192 if(length($afterDec) >= $i)
1193 {
1194 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1195 }
1196 }
1197
1198 # what is this???? completely does nothing...
1199 #Only add the metadata if it has not already been added
1200 my $metaMap = $self->get_metadata_hashmap($section); # metaMap not used and called function has no apparent side-effects,
1201 # but this line appears important for ensuring uniqueness of (Latitude, value) meta
1202 # in the section. Also for LatShort, Longitude, LngShort.
1203 }
1204
1205 # if we have both Latitude and Longitude, then add coordinate metadata (not sure why, but someone else added it)
1206 my $latitude;
1207 my $longitude;
1208 if($field =~ m/^(.+\.)?Longitude$/) {
1209 # if we are dealing with Longitude meta, and we have already seen Latitude, then add Coordinate metadata
1210 $latitude = $self->get_metadata_element ($section, "Latitude");
1211 if (defined $latitude) {
1212 $longitude = $value;
1213 }
1214 } else {
1215 # we are dealing with Latitude - if we have already seen Longitude, then add both as Coordinate metadata
1216 $longitude = $self->get_metadata_element ($section, "Longitude");
1217 if (defined $longitude) {
1218 $latitude = $value;
1219 }
1220 }
1221 if (defined $latitude && defined $longitude) {
1222 ##rint STDERR "adding coords for $section, $latitude, $longitude\n";
1223 $self->processCoordinate($section, $latitude, $longitude);
1224 } else {
1225 ##rint STDERR "lat or long not defined, not adding coords for $section\n";
1226 }
1227
1228 }
1229
1230 elsif($field eq "GPS.mapOverlay") { # then the $value is a JSON string
1231
1232 # In order to allow searching map-data-enriched documents by map shape descriptions,
1233 # and to run rawquery searches for other docs by proximity based on their map data,
1234 # need to store the shape descriptions and Coordinate info for shapes into the text index.
1235 # We add the description for each shape in the mapoverlay into the text index as GPSMapOverlayLabel
1236 # And we add Coordinate (CD) and CoordShort (CS) info for each shape in the mapoverlay in the format (Lat, Lng) as ("37S339 175E342")
1237 # where the digits before the N/S/E/W direction represents the whole number. And the digits after the direction are the
1238 # decimal places which can vary between 0 and 2 to 4 digits.
1239
1240 # However, we only want to process GPS.mapOverlay during buildcol and only in the text indexing passes (e.g. sidx and didx for lucene)
1241 # and certainly never during the infodb pass of buildcol. The latter can end up duplicating Coordinate/CoordShort/GPSMapOverlayLabel
1242 # when rebuilding with the online doc editor, as that runs incremental-rebuild which then calls basebuilder::reconstruct_doc_objs_metadata()
1243 # on all docs NOT being incrementally rebuilt. That call would get meta from the infodb and use it to reconstruct the doc objects of docs
1244 # NOT being incrementally built. If the Coord and Label meta were written to the infodb, they would then be loaded back in memory when the
1245 # collection is incrementally rebuilt for those docs that don't need incremental processing. Then this function would once again add the
1246 # same meta into the infodb, thus duplicating what goes into the infodb. Hence, don't do all the following if doc::processor_mode =~ "infodb",
1247 # or anything other than a text mode.
1248
1249 # Note that for incremental rebuilding, the text pass can be called textreindex for instance (and infodb pass can be incinfodb).
1250 # So don't check for exact string match
1251
1252 if($doc::cmd_line_mode eq "buildcol" && $doc::processor_mode =~ m/^text/) { # currently known text processor_modes:
1253 # text, textreindex, possibly textdelete (see ArchivesInfPlugin.pm::read() for last 2).
1254 # OR: ..."buildcol" && $doc::processor_mode !~ m/infodb$/) # if dummy pass important
1255
1256 ###print STDERR "GPS.mapOverlay before decoding, val = " . $value . "\n";
1257
1258 # TODO: html decode?
1259 $value =~ s@&#091;@[@g;
1260 $value =~ s@&#093;@]@g;
1261 $value =~ s@&quot;@"@g;
1262 ###print STDERR "GPS.mapOverlay after decoding, val = " . $value . "\n";
1263
1264 my $json_array = decode_json $value;
1265
1266 foreach my $shape (@$json_array) {
1267
1268 # Put each available shape description/label into this section's metadata with GPSMapOverlayLabel as metaname.
1269 # Just as for Coordinate meta, don't need to know which shape a label belongs too. This is just so each label
1270 # will be indexed, and therefore can be searched.
1271
1272 my $description = $shape->{"description"};
1273 if($description) {
1274 push (@{$section_ptr->{'metadata'}}, ["GPSMapOverlayLabel", $description]);
1275 ###print STDERR "@@@@############################################ Just added description meta: " . $description . "\n";
1276 }
1277
1278 my $type = $shape->{"type"};
1279 ###print STDERR "Shape type : " . $type . "\n";
1280
1281 if($type eq "circle") {
1282 ###print STDERR "Found a circle:\n" . &printShape($json, $shape);
1283
1284 # work out bounding box
1285 # want the inverse of this useful page:
1286 # https://stackoverflow.com/questions/639695/how-to-convert-latitude-or-longitude-to-meters
1287 # https://www.geodatasource.com/developers/javascript
1288
1289 # Dr Bainbridge wants us to use: https://gis.stackexchange.com/questions/5821/calculating-latitude-longitude-x-miles-from-point
1290 # Since for N,E,S,W the following works out the same as above link, we're using the rule of thumb in the following:
1291 # https://gis.stackexchange.com/questions/2951/algorithm-for-offsetting-a-latitude-longitude-by-some-amount-of-meters
1292 # which states
1293 # "If your displacements aren't too great (less than a few kilometers) and you're not right at the poles,
1294 # use the quick and dirty estimate that 111,111 meters (111.111 km) in the y direction is 1 degree
1295 # (of latitude) and 111,111 * cos(latitude) meters in the x direction is 1 degree (of longitude)."
1296 my $centre_lat = $shape->{"center"}->{"lat"};
1297 my $centre_lng = $shape->{"center"}->{"lng"};
1298 my $radius = $shape->{"radius"}; # in metres!
1299
1300 ###print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
1301
1302 my $lat_north = $centre_lat + ($radius/111111);
1303 my $lat_south = $centre_lat - ($radius/111111);
1304
1305 ###print STDERR "### lat_north: $lat_north\n";
1306 ###print STDERR "### lat_south: $lat_south\n";
1307
1308 # our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
1309 # all expect their angle to be in radians. So need to convert from degree to radians before we can take the cos of it.
1310 my $centre_lat_radians = $self->degreesToRadians($centre_lat);
1311 my $cos_in_radians = cos($centre_lat_radians);
1312 ###print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
1313 my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
1314 my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
1315 ###print STDERR "### lng_east $lng_east\n";
1316 ###print STDERR "### lng_west $lng_west\n";
1317
1318 my $cos_lat = cos($centre_lat);
1319 ###print STDERR "cos $centre_lat is $cos_lat\n";
1320
1321 $self->processCoordinate($section, $lat_north, $lng_east);
1322 $self->processCoordinate($section, $lat_south, $lng_east);
1323 $self->processCoordinate($section, $lat_south, $lng_west);
1324 $self->processCoordinate($section, $lat_north, $lng_west);
1325
1326 }
1327 elsif ($type eq "marker") {
1328 ###print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
1329 ###print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
1330 $self->processCoordinate($section, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});
1331 }
1332 elsif ($type eq "polyline" || $type eq "polygon") {
1333 my $path_array = $shape->{"path"};
1334 foreach my $position (@$path_array) {
1335 $self->processCoordinate($section, $position->{"lat"}, $position->{"lng"});
1336 }
1337 }
1338 elsif ($type eq "rectangle") {
1339
1340 my $bounds = $shape->{"bounds"};
1341 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"east"});
1342 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"east"});
1343 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"west"});
1344 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"west"});
1345 }
1346
1347 } # end for on each shape in GPS.mapOverlay
1348 } # end if(buildcol and text pass)
1349 } # end GPS.mapOverlay meta
1350
1351 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1352}
1353
1354# https://en.wikipedia.org/wiki/Radian
1355sub degreesToRadians
1356{
1357 my $self = shift (@_);
1358 my ($degrees) = @_;
1359
1360 return $degrees * pi /180; # returns radians
1361}
1362
1363sub radiansToDegrees
1364{
1365 my $self = shift (@_);
1366 my ($radians) = @_;
1367
1368 return $radians * 180 / pi; # returns degrees
1369}
1370
1371# Call as:
1372# my $json = JSON->new->allow_nonref;
1373# &printAllShapes($json, $json_array);
1374sub printAllShapes {
1375 my ($json, $json_array) = @_;
1376
1377
1378 #my $pretty_print_shape = $json->pretty->encode( $json_array->[0] );
1379 foreach my $shape (@$json_array) {
1380 my $pretty_print_shape = $json->pretty->encode( $shape );
1381 print STDERR "Shape: $pretty_print_shape\n";
1382 }
1383
1384}
1385
1386# For the (lat, lng) coordinate given,
1387# attaches Coordinate and multiple CoordShort (different precision level) metadata to the doc object
1388sub processCoordinate {
1389 my $self = shift (@_);
1390 my ($section, $latitude, $longitude) = @_;
1391
1392 my $section_ptr = $self->_lookup_section($section);
1393
1394 my $lat_direction = ($latitude =~ m/^-/) ? "S" : "N";
1395 my $lng_direction = ($longitude =~ m/^-/) ? "W" : "E";
1396
1397 # have to store (lat, lng) in pairs, when there are so many coords to store
1398 #push (@{$section_ptr->{'metadata'}}, ["Latitude", $latitude]);
1399 #push (@{$section_ptr->{'metadata'}}, ["Longitude", $longitude]);
1400
1401 push (@{$section_ptr->{'metadata'}}, ["Coordinate", "$latitude $longitude"]); # "$latitude$lat_direction $longitude$lng_direction"
1402
1403 my ($latBeforeDec, $latAfterDec);
1404 my ($lngBeforeDec, $lngAfterDec);
1405
1406 if($latitude !~ m/\./) {
1407 $latBeforeDec = $latitude;
1408 $latAfterDec = "";
1409 } else {
1410 ($latBeforeDec, $latAfterDec) = ($latitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1411 }
1412 if($longitude !~ m/\./) {
1413 $lngBeforeDec = $longitude;
1414 $lngAfterDec = "";
1415 } else {
1416 ($lngBeforeDec, $lngAfterDec) = ($longitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1417 }
1418
1419
1420 my $name = "CoordShort";
1421 push (@{$section_ptr->{'metadata'}}, [$name, "$latBeforeDec$lat_direction $lngBeforeDec$lng_direction"]);
1422
1423 for(my $i = 2; $i <= 4; $i++)
1424 {
1425 my $latDecPlaces = (length($latAfterDec) >= $i) ? substr($latAfterDec, 0, $i) : "";
1426 my $lngDecPlaces = (length($lngAfterDec) >= $i) ? substr($lngAfterDec, 0, $i) : "";
1427
1428 push (@{$section_ptr->{'metadata'}}, [$name,
1429 $latBeforeDec . $lat_direction. $latDecPlaces . " " . $lngBeforeDec . $lng_direction. $lngDecPlaces]);
1430
1431 }
1432
1433 #Only add the metadata if it has not already been added
1434 #my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no apparent side-effects.)
1435}
1436
1437
1438# methods for dealing with text
1439
1440# returns the text for a section
1441sub get_text {
1442 my $self = shift (@_);
1443 my ($section) = @_;
1444
1445 my $section_ptr = $self->_lookup_section($section);
1446 if (!defined $section_ptr) {
1447 print STDERR "doc::get_text couldn't find section " .
1448 "$section\n";
1449 return "";
1450 }
1451
1452 return $section_ptr->{'text'};
1453}
1454
1455# returns the (utf-8 encoded) length of the text for a section
1456sub get_text_length {
1457 my $self = shift (@_);
1458 my ($section) = @_;
1459
1460 my $section_ptr = $self->_lookup_section($section);
1461 if (!defined $section_ptr) {
1462 print STDERR "doc::get_text_length couldn't find section " .
1463 "$section\n";
1464 return 0;
1465 }
1466
1467 return length ($section_ptr->{'text'});
1468}
1469
1470# returns the total length for all the sections
1471sub get_total_text_length {
1472 my $self = shift (@_);
1473
1474 my $section = $self->get_top_section();
1475 my $length = 0;
1476 while (defined $section) {
1477 $length += $self->get_text_length($section);
1478 $section = $self->get_next_section($section);
1479 }
1480 return $length;
1481}
1482
1483sub delete_text {
1484 my $self = shift (@_);
1485 my ($section) = @_;
1486
1487 my $section_ptr = $self->_lookup_section($section);
1488 if (!defined $section_ptr) {
1489 print STDERR "doc::delete_text couldn't find section " .
1490 "$section\n";
1491 return;
1492 }
1493
1494 $section_ptr->{'text'} = "";
1495}
1496
1497# add_text assumes the text is in (extended) ascii form. For
1498# text which has been already converted to the UTF-8 format
1499# use add_utf8_text.
1500sub add_text {
1501 my $self = shift (@_);
1502 my ($section, $text) = @_;
1503
1504 # convert the text to UTF-8 encoded unicode characters
1505 # and add the text
1506 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1507}
1508
1509
1510# add_utf8_text assumes the text to be added has already
1511# been converted to the UTF-8 encoding. For ascii text use
1512# add_text
1513# Pass by value version (internally calls pass by ref version
1514# to avoid code duplication)
1515sub add_utf8_text {
1516 my $self = shift (@_);
1517 my ($section, $text) = @_;
1518
1519 $self->add_utf8_textref($section, \$text);
1520}
1521
1522# Pass by reference version, used by GreenstoneSQLPlugin for fulltext
1523sub add_utf8_textref {
1524 my $self = shift (@_);
1525 my ($section, $text_ref) = @_;
1526
1527 my $section_ptr = $self->_lookup_section($section);
1528 if (!defined $section_ptr) {
1529 print STDERR "doc::add_utf8_textref couldn't find section " .
1530 "$section\n";
1531 return;
1532 }
1533
1534 $section_ptr->{'text'} .= $$text_ref;
1535}
1536
1537# returns the Source meta, which is the utf8 filename generated.
1538# Added a separate method here for convenience
1539sub get_source {
1540 my $self = shift (@_);
1541 return $self->get_metadata_element ($self->get_top_section(), "Source");
1542}
1543
1544# returns the SourceFile meta, which is the url reference to the URL-encoded
1545# version of Source (the utf8 filename). Added a separate method here for convenience
1546sub get_sourcefile {
1547 my $self = shift (@_);
1548 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1549}
1550
1551# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1552# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1553sub get_assocfile_from_sourcefile {
1554 my $self = shift (@_);
1555
1556 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1557 my $top_section = $self->get_top_section();
1558 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1559
1560 # get the actual filename as it exists on the filesystem which this url refers to
1561 $source_file = &unicode::url_to_filename($source_file);
1562 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1563 return $assocfilename;
1564}
1565
1566# methods for dealing with associated files
1567
1568# a file is associated with a document, NOT a section.
1569# if section is defined it is noted in the data structure
1570# only so that files associated from a particular section
1571# may be removed later (using delete_section_assoc_files)
1572sub associate_file {
1573 my $self = shift (@_);
1574 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1575 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1576
1577 # remove all associated files with the same name
1578 $self->delete_assoc_file ($assoc_filename);
1579
1580 # Too harsh a requirement
1581 # Definitely get HTML docs, for example, with some missing
1582 # support files
1583# if (!&util::fd_exists($real_filename)) {
1584# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1585# exit -1;
1586# }
1587
1588# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1589# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1590## my $utf8_filename = Encode::encode("utf8",$filename);
1591
1592 push (@{$self->{'associated_files'}},
1593 [$real_filename, $assoc_filename, $mime_type, $section]);
1594}
1595
1596# returns a list of associated files in the form
1597# [[real_filename, assoc_filename, mimetype], ...]
1598sub get_assoc_files {
1599 my $self = shift (@_);
1600
1601 return $self->{'associated_files'};
1602}
1603
1604# the following two methods used to keep track of original associated files
1605# for incremental building. eg a txt file used by an item file does not end
1606# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1607# file for incremental build
1608sub associate_source_file {
1609 my $self = shift (@_);
1610 my ($full_filename) = @_;
1611
1612 push (@{$self->{'source_assoc_files'}}, $full_filename);
1613
1614}
1615
1616sub get_source_assoc_files {
1617 my $self = shift (@_);
1618
1619 return $self->{'source_assoc_files'};
1620
1621
1622}
1623sub metadata_file {
1624 my $self = shift (@_);
1625 my ($real_filename, $filename) = @_;
1626
1627 push (@{$self->{'metadata_files'}},
1628 [$real_filename, $filename]);
1629}
1630
1631# used for writing out the archiveinf-doc info database, to list all the metadata files
1632sub get_meta_files {
1633 my $self = shift (@_);
1634
1635 return $self->{'metadata_files'};
1636}
1637
1638sub delete_section_assoc_files {
1639 my $self = shift (@_);
1640 my ($section) = @_;
1641
1642 my $i=0;
1643 while ($i < scalar (@{$self->{'associated_files'}})) {
1644 if (defined $self->{'associated_files'}->[$i]->[3] &&
1645 $self->{'associated_files'}->[$i]->[3] eq $section) {
1646 splice (@{$self->{'associated_files'}}, $i, 1);
1647 } else {
1648 $i++;
1649 }
1650 }
1651}
1652
1653sub delete_assoc_file {
1654 my $self = shift (@_);
1655 my ($assoc_filename) = @_;
1656
1657 my $i=0;
1658 while ($i < scalar (@{$self->{'associated_files'}})) {
1659 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1660 splice (@{$self->{'associated_files'}}, $i, 1);
1661 } else {
1662 $i++;
1663 }
1664 }
1665}
1666
1667sub reset_nextsection_ptr {
1668 my $self = shift (@_);
1669 my ($section) = @_;
1670
1671 my $section_ptr = $self->_lookup_section($section);
1672 $section_ptr->{'next_subsection'} = 1;
1673}
1674
16751;
Note: See TracBrowser for help on using the repository browser.