source: main/trunk/greenstone2/perllib/doc.pm@ 34105

Last change on this file since 34105 was 34105, checked in by ak19, 4 years ago

A bugfix related to the commits 33737 and thereabouts which is the work I did to get encoded filenames in subdirs (or with & and + signs in the relative import path) to work with metadata assigned to such files through GLI. Metadata in doc.xml and metadata.xml should stick to such filenames, and which wasn't happening before that work. This latest bugfix would strike when the file_rename_method was base64 and there was no subdir, just import/file. The import folder itself is not supposed to get renamed with base64 or url encoding. The bug wasn't apparent during the earlier testing which focused on file_rename_method =url, as the import folder's name just resolves to being import if url encoded. However, it became noticeable now when debugging a warning message for Renate, when import got base64 encoded and GLI didn't know what was going on as import is not ever supposed to get encoded, only subdirs and files therein. The bug was an off by one error. Still need to create a ticket wherein I bring together all the related commits for the GLI with metadata work that this bugfix goes with.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 51.5 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use FileUtils;
40use ghtml;
41use File::stat;
42##use hashdoc;
43use docprint;
44use JSON;
45
46# We just need pi from the Trig lib
47# Import constants pi2, pip2, pip4 (2*pi, pi/2, pi/4).
48# But doing
49# use Math::Trig ':pi';
50# resulted in errors on 32 bit linux VM with perl v8 of "bareword pi not allowed while strict subs in use"
51# So, now doing just
52use Math::Trig;
53
54# the document type may be indexed_doc, nonindexed_doc, or
55# classification
56
57our $OIDcount = 0;
58
59
60# Declare the Package (level) Variable that will tell us whether we're in the import or buildcol phase.
61# This var will be set by import.pl, buildcol.pl and export.pl to the then appropriate value.
62# https://perlmaven.com/package-variables-and-lexical-variables-in-perl
63# But: https://stackoverflow.com/questions/7542120/perl-how-to-make-variables-from-requiring-script-available-in-required-script
64# https://stackoverflow.com/questions/33230015/how-do-i-use-a-variable-in-another-perl-script
65#$doc::cmd_line_mode = undef;
66# Decided to follow the usage of $OIDcount above to declare the necessary package-level variables 'globally' accessible
67# actually accessible only for those packages that import this one (with 'use doc')
68my $cmd_line_mode = undef;
69
70# processor_mode keeps track of which buildcol pass we're at: dummy, text (sidx/didx passes) or infodb
71my $processor_mode = undef;
72
73# rename_method can be 'url', 'none', 'base64'
74sub new {
75 my $class = shift (@_);
76 my ($source_filename, $doc_type, $rename_method) = @_;
77
78
79 my $self = bless {'associated_files'=>[],
80 'subsection_order'=>[],
81 'next_subsection'=>1,
82 'subsections'=>{},
83 'metadata'=>[],
84 'text'=>"",
85 'OIDtype'=>"hash"}, $class;
86
87 # used to set lastmodified here, but this can screw up the HASH ids, so
88 # the docsave processor now calls set_lastmodified
89
90 $self->set_source_path($source_filename);
91
92 if (defined $source_filename) {
93 $source_filename = &util::filename_within_collection($source_filename);
94 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
95 $self->set_source_filename ($source_filename, $rename_method);
96 }
97
98 $self->set_doc_type ($doc_type) if defined $doc_type;
99
100 return $self;
101}
102
103
104sub set_source_path
105{
106 my $self = shift @_;
107 my ($source_filename) = @_;
108
109 if (defined $source_filename) {
110 # On Windows the source_filename can be in terse DOS format
111 # e.g. test~1.txt
112
113 $self->{'terse_source_path'} = $source_filename;
114
115 # Use the FileUtil library methods as they are aware of more special
116 # cases such as HDFS [jmt12]
117 if (&FileUtils::fileExists($source_filename))
118 {
119 # See if we can do better for Windows with a filename
120 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
121 require Win32;
122 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
123 }
124 else {
125 # For Unix-based systems, there is no difference between the two
126 $self->{'source_path'} = $source_filename;
127 }
128 }
129 else {
130 print STDERR "Warning: In doc::set_source_path(), file\n";
131 print STDERR " $source_filename\n";
132 print STDERR " does not exist\n";
133
134 # (default) Set it to whatever we were given
135 $self->{'source_path'} = $source_filename;
136 }
137 }
138 else {
139 # Previous code for setting source_path allowed for
140 # it to be undefined, so continue this practice
141 $self->{'terse_source_path'} = undef;
142 $self->{'source_path'} = undef;
143 }
144}
145
146
147sub get_source_path
148{
149 my $self = shift @_;
150
151 return $self->{'terse_source_path'};
152}
153
154# set lastmodified for OAI purposes, added by GRB, moved by kjdon
155sub set_oailastmodified {
156 my $self = shift (@_);
157
158 my $source_path = $self->{'terse_source_path'};
159
160 if (defined $source_path && (-e $source_path)) {
161 my $current_time = time;
162
163 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
164 $wday, $yday, $isdst) = localtime($current_time);
165
166 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
167
168 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
169 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
170 }
171}
172
173# no longer used for OAI purposes, since lastmodified is not what we want as the
174# Datestamp of a document. This doc metadata may be useful for general purposes.
175sub set_lastmodified {
176 my $self = shift (@_);
177
178 my $source_path = $self->{'terse_source_path'};
179
180 if (defined $source_path && (-e $source_path)) {
181
182 my $file_stat = stat($source_path);
183 my $mtime = $file_stat->mtime;
184 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
185 $wday, $yday, $isdst) = localtime($mtime);
186
187 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
188
189 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
190 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
191 }
192}
193
194# clone the $self object
195sub duplicate {
196 my $self = shift (@_);
197
198 my $newobj = {};
199
200 foreach my $k (keys %$self) {
201 $newobj->{$k} = &clone ($self->{$k});
202 }
203
204 bless $newobj, ref($self);
205 return $newobj;
206}
207
208sub clone {
209 my ($from) = @_;
210 my $type = ref ($from);
211
212 if ($type eq "HASH") {
213 my $to = {};
214 foreach my $key (keys %$from) {
215 $to->{$key} = &clone ($from->{$key});
216 }
217 return $to;
218 } elsif ($type eq "ARRAY") {
219 my $to = [];
220 foreach my $v (@$from) {
221 push (@$to, &clone ($v));
222 }
223 return $to;
224 } else {
225 return $from;
226 }
227}
228
229sub set_OIDtype {
230 my $self = shift (@_);
231 my ($type, $metadata) = @_;
232
233 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
234 $self->{'OIDtype'} = $type;
235 } else {
236 $self->{'OIDtype'} = "hash";
237 }
238
239 if ($type =~ /^assigned$/) {
240 if (defined $metadata) {
241 $self->{'OIDmetadata'} = $metadata;
242 } else {
243 $self->{'OIDmetadata'} = "dc.Identifier";
244 }
245 }
246}
247
248# rename_method can be 'url', 'none', 'base64'
249sub set_source_filename {
250 my $self = shift (@_);
251 my ($source_filename, $rename_method) = @_;
252
253 # Since the gsdlsourcefilename element goes into the doc.xml it has
254 # to be utf8. However, it should also *represent* the source filename
255 # (in the import directory) which may not be utf8 at all.
256 # For instance, if this meta element (gsdlsourcefilename) will be used
257 # by other applications that parse doc.xml in order to locate
258 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
259 # encode the real filename as this is a binary-to-text encoding meaning
260 # that the resulting string is ASCII (utf8). Decoding will give the original.
261
262# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
263## print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename\n";
264
265 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
266 $source_filename = $self->encode_filename($source_filename, $rename_method);
267# my ($srcfilename,$dirname,$suffix)
268# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
269# print STDERR "-> $srcfilename -> ";
270# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
271# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
272# print STDERR "$source_filename\n";
273
274## print STDERR "AFTER URL/base64 encoding gsdlsourcefilename: $source_filename\n";
275
276 $self->set_utf8_metadata_element ($self->get_top_section(),
277 "gsdlsourcefilename",
278 $source_filename);
279
280 # If we set the file renaming method, at least GLI can decode gsdlsourcefilename to get the
281 # original filename (relative path to file) back for GLI to notice what ex Meta is associated with it
282 $self->set_utf8_metadata_element($self->get_top_section(),
283 "gsdlsourcefilerenamemethod",
284 $rename_method);
285}
286
287sub encode_filename {
288 my $self = shift (@_);
289 my ($source_filename, $rename_method) = @_;
290
291 my ($srcfilename,$dirname,$suffix)
292 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
293# print STDERR "-> $srcfilename -> ";
294 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
295
296 # encode any subdirs of "import" also, but not import itself
297 my $sep = ($ENV{'GSDLOS'} =~ /^windows$/i) ? "\\" : "/";
298 my @dirs = ($sep eq "\\") ? split(/[\\\/]+/, $dirname) : split(/\//, $dirname);
299
300 my $dirpath = "";
301
302 # Don't encode the first folder ("import" or "tmp"): GLI's DocXMLFile.java looks for literal "import"
303 # or "tmp" before it knows what the file rename method is that is to be used decode the rest of gsdlsourcefilename.
304 if(scalar (@dirs) >= 1) {
305 $dirpath = shift(@dirs);
306 }
307 foreach my $subdir (@dirs) {
308 #print STDERR "@@@@ Found subdir: $subdir\n";
309
310 $subdir = &util::rename_file($subdir, $rename_method);
311 #print STDERR "@@@@ encoded subdir: $subdir\n";
312
313 $dirpath = &FileUtils::filenameConcatenate($dirpath, $subdir);
314 }
315 if($dirpath ne "") {
316 $dirname = $dirpath;
317 }
318
319 $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
320
321 return $source_filename;
322}
323
324sub set_converted_filename {
325 my $self = shift (@_);
326 my ($converted_filename) = @_;
327
328 # we know the converted filename is utf8
329 $self->set_utf8_metadata_element ($self->get_top_section(),
330 "gsdlconvertedfilename",
331 $converted_filename);
332}
333
334# returns the source_filename as it was provided
335sub get_unmodified_source_filename {
336 my $self = shift (@_);
337
338 return $self->{'terse_source_path'};
339}
340
341# returns the source_filename with whatever rename_method was given
342sub get_source_filename {
343 my $self = shift (@_);
344
345 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
346}
347
348
349
350# returns converted filename if available else returns source filename
351sub get_filename_for_hashing {
352 my $self = shift (@_);
353
354 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
355
356 if (!defined $filename) {
357 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
358 # if NULPlug processed file, then don't give a filename
359 if (defined $plugin_name && $plugin_name eq "NULPlug") {
360 $filename = undef;
361 } else { # returns the URL encoded source filename!
362 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
363 }
364 }
365
366 if (!&FileUtils::isFilenameAbsolute($filename)) {
367 $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
368 }
369
370 return $filename;
371}
372
373sub set_doc_type {
374 my $self = shift (@_);
375 my ($doc_type) = @_;
376
377 $self->set_metadata_element ($self->get_top_section(),
378 "gsdldoctype",
379 $doc_type);
380}
381
382# returns the gsdldoctype as it was provided
383# the default of "indexed_doc" is used if no document
384# type was provided
385sub get_doc_type {
386 my $self = shift (@_);
387
388 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
389 return $doc_type if (defined $doc_type);
390 return "indexed_doc";
391}
392
393
394# look up the reference to the a particular section
395sub _lookup_section {
396 my $self = shift (@_);
397 my ($section) = @_;
398
399 my ($num);
400 my $sectionref = $self;
401
402 while (defined $section && $section ne "") {
403
404 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
405
406 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
407
408 $section = "" unless defined $section;
409
410
411 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
412 $sectionref = $sectionref->{'subsections'}->{$num};
413 } else {
414 return undef;
415 }
416 }
417
418 return $sectionref;
419}
420
421# calculate OID by hashing the contents of the document
422sub _calc_OID {
423 my $self = shift (@_);
424 my ($filename) = @_;
425
426
427 my $osexe = &util::get_os_exe();
428
429 my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
430 $ENV{'GSDLOS'},"hashfile$osexe");
431
432 &util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
433 # The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
434
435 # A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
436 # (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
437 # E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
438
439 my $result = "NULL";
440
441
442 if (-e "$hashfile_exe") {
443# $result = `\"$hashfile_exe\" \"$filename\"`;
444# $result = `hashfile$osexe \"$filename\" 2>&1`;
445 $result = `hashfile$osexe \"$filename\"`;
446
447 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
448 } else {
449 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
450 }
451 return "HASH$result";
452}
453
454# methods dealing with OID, not groups of them.
455
456# if $OID is not provided one is calculated
457sub set_OID {
458 my $self = shift (@_);
459 my ($OID) = @_;
460
461 my $use_hash_oid = 0;
462 # if an OID wasn't provided calculate one
463 if (!defined $OID) {
464 $OID = "NULL";
465 if ($self->{'OIDtype'} =~ /^hash/) {
466 $use_hash_oid = 1;
467 } elsif ($self->{'OIDtype'} eq "incremental") {
468 $OID = "D" . $OIDcount;
469 $OIDcount ++;
470 } elsif ($self->{'OIDtype'} eq "filename") {
471 my $filename = $self->get_source_filename();
472 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
473 $OID = &util::tidy_up_oid($OID);
474 } elsif ($self->{'OIDtype'} eq "full_filename") {
475 my $source_filename = $self->get_source_filename();
476 my $dirsep = &util::get_os_dirsep();
477
478 $source_filename =~ s/^import$dirsep//;
479 $source_filename =~ s/$dirsep/-/g;
480 $source_filename =~ s/\./_/g;
481
482 $OID = $source_filename;
483 $OID = &util::tidy_up_oid($OID);
484 } elsif ($self->{'OIDtype'} eq "dirname") {
485 my $filename = $self->get_source_filename();
486 if (defined($filename)) { # && -e $filename) {
487 # get the immediate parent directory
488 $OID = &File::Basename::dirname($filename);
489 if (defined $OID) {
490 $OID = &File::Basename::basename($OID);
491 $OID = &util::tidy_up_oid($OID);
492 } else {
493 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
494 $use_hash_oid = 1;
495 }
496 } else {
497 print STDERR "Failed to find a filename, generating hash id\n";
498 $use_hash_oid = 1;
499 }
500
501 } elsif ($self->{'OIDtype'} eq "assigned") {
502 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
503 if (defined $identifier && $identifier ne "") {
504 $OID = $identifier;
505 $OID = &util::tidy_up_oid($OID);
506 } else {
507 # need a hash id
508 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
509 $use_hash_oid = 1;
510 }
511
512 } else {
513 $use_hash_oid = 1;
514 }
515
516 if ($use_hash_oid) {
517 my $hash_on_file = 1;
518 my $hash_on_ga_xml = 0;
519
520 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
521 $hash_on_file = 0;
522 $hash_on_ga_xml = 1;
523 }
524
525 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
526 $hash_on_file = 0;
527 $hash_on_ga_xml = 0;
528
529 my $source_filename = $self->get_source_filename();
530 my $dirsep = &util::get_os_dirsep();
531
532 $source_filename =~ s/^import$dirsep//;
533 $source_filename =~ s/$dirsep/-/g;
534 $source_filename =~ s/\./_/g;
535
536 # If the filename is very short then (handled naively)
537 # this can cause conjestion in the hash-values
538 # computed, leading documents sharing the same leading
539 # Hex values in the computed has.
540 #
541 # The solution taken here is to replace the name of
542 # the file name a sufficient number of times (up to
543 # the character limit defined in 'rep_limit' and
544 # make that the content that is hashed on
545
546 # *** Think twice before changing the following value
547 # as it will break backward compatability of computed
548 # document HASH values
549
550 my $rep_limit = 256;
551 my $hash_content = undef;
552
553 if (length($source_filename)<$rep_limit) {
554 my $rep_string = "$source_filename|";
555 my $rs_len = length($rep_string);
556
557 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
558
559 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
560 }
561 else {
562 $hash_content = $source_filename;
563 }
564
565 my $filename = &util::get_tmp_filename();
566 if (!open (OUTFILE, ">:utf8", $filename)) {
567 print STDERR "doc::set_OID could not write to $filename\n";
568 } else {
569 print OUTFILE $hash_content;
570 close (OUTFILE);
571 }
572 $OID = $self->_calc_OID ($filename);
573
574 &FileUtils::removeFiles ($filename);
575 }
576
577 if ($hash_on_file) {
578 # "hash" OID - feed file to hashfile.exe
579 my $filename = $self->get_filename_for_hashing();
580
581 # -z: don't want to hash on the file if it is zero size
582 if (defined($filename) && -e $filename && !-z $filename) {
583 $OID = $self->_calc_OID ($filename);
584 } else {
585 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
586 }
587 }
588
589 if ($hash_on_ga_xml) {
590 # In addition being asked to explicity calculate the has based on the GA file,
591 # can also end up coming into this block is doing 'hash_on_file' but the file
592 # itself is of zero bytes (as could be the case with 'doc.nul' file
593
594 my $filename = &util::get_tmp_filename();
595 if (!open (OUTFILE, ">:utf8", $filename)) {
596 print STDERR "doc::set_OID could not write to $filename\n";
597 } else {
598 my $doc_text = &docprint::get_section_xml($self);
599 print OUTFILE $doc_text;
600 close (OUTFILE);
601 }
602 $OID = $self->_calc_OID ($filename);
603 &FileUtils::removeFiles($filename);
604 }
605 }
606 }
607 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
608}
609
610# this uses hashdoc (embedded c thingy) which is faster but still
611# needs a little work to be sufficiently stable
612sub ___set_OID {
613 my $self = shift (@_);
614 my ($OID) = @_;
615
616 # if an OID wasn't provided then calculate hash value based on document
617 if (!defined $OID)
618 {
619 my $hash_text = &docprint::get_section_xml($self);
620 my $hash_len = length($hash_text);
621
622 $OID = &hashdoc::buffer($hash_text,$hash_len);
623 }
624
625 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
626}
627
628# returns the OID for this document
629sub get_OID {
630 my $self = shift (@_);
631 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
632 return $OID if (defined $OID);
633 return "NULL";
634}
635
636sub delete_OID {
637 my $self = shift (@_);
638
639 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
640}
641
642
643# methods for manipulating section names
644
645# returns the name of the top-most section (the top
646# level of the document
647sub get_top_section {
648 my $self = shift (@_);
649
650 return "";
651}
652
653# returns a section
654sub get_parent_section {
655 my $self = shift (@_);
656 my ($section) = @_;
657
658 $section =~ s/(^|\.)\d+$//;
659
660 return $section;
661}
662
663# returns the first child section (or the end child
664# if there isn't any)
665sub get_begin_child {
666 my $self = shift (@_);
667 my ($section) = @_;
668
669 my $section_ptr = $self->_lookup_section($section);
670 return "" unless defined $section_ptr;
671
672 if (defined $section_ptr->{'subsection_order'}->[0]) {
673 return "$section.$section_ptr->{'subsection_order'}->[0]";
674 }
675
676 return $self->get_end_child ($section);
677}
678
679# returns the next child of a parent section
680sub get_next_child {
681 my $self = shift (@_);
682 my ($section) = @_;
683
684 my $parent_section = $self->get_parent_section($section);
685 my $parent_section_ptr = $self->_lookup_section($parent_section);
686 return undef unless defined $parent_section_ptr;
687
688 my ($section_num) = $section =~ /(\d+)$/;
689 return undef unless defined $section_num;
690
691 my $i = 0;
692 my $section_order = $parent_section_ptr->{'subsection_order'};
693 while ($i < scalar(@$section_order)) {
694 last if $section_order->[$i] eq $section_num;
695 $i++;
696 }
697
698 $i++; # the next child
699 if ($i < scalar(@$section_order)) {
700 return $section_order->[$i] if $parent_section eq "";
701 return "$parent_section.$section_order->[$i]";
702 }
703
704 # no more sections in this level
705 return undef;
706}
707
708# returns a reference to a list of children
709sub get_children {
710 my $self = shift (@_);
711 my ($section) = @_;
712
713 my $section_ptr = $self->_lookup_section($section);
714 return [] unless defined $section_ptr;
715
716 my @children = @{$section_ptr->{'subsection_order'}};
717
718 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
719 return \@children;
720}
721
722# returns the child section one past the last one (which
723# is coded as "0")
724sub get_end_child {
725 my $self = shift (@_);
726 my ($section) = @_;
727
728 return $section . ".0" unless $section eq "";
729 return "0";
730}
731
732# returns the next section in book order
733sub get_next_section {
734 my $self = shift (@_);
735 my ($section) = @_;
736
737 return undef unless defined $section;
738
739 my $section_ptr = $self->_lookup_section($section);
740 return undef unless defined $section_ptr;
741
742 # first try to find first child
743 if (defined $section_ptr->{'subsection_order'}->[0]) {
744 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
745 return "$section.$section_ptr->{'subsection_order'}->[0]";
746 }
747
748 do {
749 # try to find sibling
750 my $next_child = $self->get_next_child ($section);
751 return $next_child if (defined $next_child);
752
753 # move up one level
754 $section = $self->get_parent_section ($section);
755 } while $section =~ /\d/;
756
757 return undef;
758}
759
760sub is_leaf_section {
761 my $self = shift (@_);
762 my ($section) = @_;
763
764 my $section_ptr = $self->_lookup_section($section);
765 return 1 unless defined $section_ptr;
766
767 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
768}
769
770# methods for dealing with sections
771
772# returns the name of the inserted section
773sub insert_section {
774 my $self = shift (@_);
775 my ($before_section) = @_;
776
777 # get the child to insert before and its parent section
778 my $parent_section = "";
779 my $before_child = "0";
780 my @before_section = split (/\./, $before_section);
781 if (scalar(@before_section) > 0) {
782 $before_child = pop (@before_section);
783 $parent_section = join (".", @before_section);
784 }
785
786 my $parent_section_ptr = $self->_lookup_section($parent_section);
787 if (!defined $parent_section_ptr) {
788 print STDERR "doc::insert_section couldn't find parent section " .
789 "$parent_section\n";
790 return;
791 }
792
793 # get the next section number
794 my $section_num = $parent_section_ptr->{'next_subsection'}++;
795
796 my $i = 0;
797 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
798 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
799 $i++;
800 }
801
802 # insert the section number into the order list
803 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
804
805 # add this section to the parent section
806 my $section_ptr = {'subsection_order'=>[],
807 'next_subsection'=>1,
808 'subsections'=>{},
809 'metadata'=>[],
810 'text'=>""};
811 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
812
813 # work out the full section number
814 my $section = $parent_section;
815 $section .= "." unless $section eq "";
816 $section .= $section_num;
817
818 return $section;
819}
820
821# creates a pre-named section
822sub create_named_section {
823 my $self = shift (@_);
824 my ($mastersection) = @_;
825
826 my ($num);
827 my $section = $mastersection;
828 my $sectionref = $self;
829
830 while ($section ne "") {
831 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
832 $num =~ s/^0+(\d)/$1/; # remove leading 0s
833 $section = "" unless defined $section;
834
835 if (defined $num) {
836 if (!defined $sectionref->{'subsections'}->{$num}) {
837 push (@{$sectionref->{'subsection_order'}}, $num);
838 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
839 'next_subsection'=>1,
840 'subsections'=>{},
841 'metadata'=>[],
842 'text'=>""};
843 if ($num >= $sectionref->{'next_subsection'}) {
844 $sectionref->{'next_subsection'} = $num + 1;
845 }
846 }
847 $sectionref = $sectionref->{'subsections'}->{$num};
848
849 } else {
850 print STDERR "doc::create_named_section couldn't create section ";
851 print STDERR "$mastersection\n";
852 last;
853 }
854 }
855}
856
857# returns a reference to a list of subsections
858sub list_subsections {
859 my $self = shift (@_);
860 my ($section) = @_;
861
862 my $section_ptr = $self->_lookup_section ($section);
863 if (!defined $section_ptr) {
864 print STDERR "doc::list_subsections couldn't find section $section\n";
865 return [];
866 }
867
868 return [@{$section_ptr->{'subsection_order'}}];
869}
870
871sub delete_section {
872 my $self = shift (@_);
873 my ($section) = @_;
874
875# my $section_ptr = {'subsection_order'=>[],
876# 'next_subsection'=>1,
877# 'subsections'=>{},
878# 'metadata'=>[],
879# 'text'=>""};
880
881 # if this is the top section reset everything
882 if ($section eq "") {
883 $self->{'subsection_order'} = [];
884 $self->{'subsections'} = {};
885 $self->{'metadata'} = [];
886 $self->{'text'} = "";
887 return;
888 }
889
890 # find the parent of the section to delete
891 my $parent_section = "";
892 my $child = "0";
893 my @section = split (/\./, $section);
894 if (scalar(@section) > 0) {
895 $child = pop (@section);
896 $parent_section = join (".", @section);
897 }
898
899 my $parent_section_ptr = $self->_lookup_section($parent_section);
900 if (!defined $parent_section_ptr) {
901 print STDERR "doc::delete_section couldn't find parent section " .
902 "$parent_section\n";
903 return;
904 }
905
906 # remove this section from the subsection_order list
907 my $i = 0;
908 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
909 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
910 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
911 last;
912 }
913 $i++;
914 }
915
916 # remove this section from the subsection hash
917 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
918 undef $parent_section_ptr->{'subsections'}->{$child};
919 }
920}
921
922#--
923# methods for dealing with metadata
924
925# set_metadata_element and get_metadata_element are for metadata
926# which should only have one value. add_meta_data and get_metadata
927# are for metadata which can have more than one value.
928
929# returns the first metadata value which matches field
930
931# This version of get metadata element works much like the one above,
932# except it allows for the namespace portion of a metadata element to
933# be ignored, thus if you are searching for dc.Title, the first piece
934# of matching metadata ending with the name Title (once any namespace
935# is removed) would be returned.
936# 28-11-2003 John Thompson
937sub get_metadata_element {
938 my $self = shift (@_);
939 my ($section, $field, $ignore_namespace) = @_;
940 my ($data);
941
942 $ignore_namespace = 0 unless defined $ignore_namespace;
943
944 my $section_ptr = $self->_lookup_section($section);
945 if (!defined $section_ptr) {
946 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
947 return;
948 }
949
950 # Remove any namespace if we are being told to ignore them
951 if($ignore_namespace) {
952 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
953 }
954
955 foreach $data (@{$section_ptr->{'metadata'}}) {
956
957 my $data_name = $data->[0];
958
959 # Remove any namespace if we are being told to ignore them
960 if($ignore_namespace) {
961 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
962 }
963 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
964 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
965
966 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
967 }
968
969 return undef; # was not found
970}
971
972# returns a list of the form [value1, value2, ...]
973sub get_metadata {
974 my $self = shift (@_);
975 my ($section, $field, $ignore_namespace) = @_;
976 my ($data);
977
978 $ignore_namespace = 0 unless defined $ignore_namespace;
979
980 my $section_ptr = $self->_lookup_section($section);
981 if (!defined $section_ptr) {
982 print STDERR "doc::get_metadata couldn't find section ",
983 $section, "\n";
984 return;
985 }
986
987 # Remove any namespace if we are being told to ignore them
988 if($ignore_namespace) {
989 $field =~ s/^.*\.//;
990 }
991
992 my @metadata = ();
993 foreach $data (@{$section_ptr->{'metadata'}}) {
994
995 my $data_name = $data->[0];
996
997 # Remove any namespace if we are being told to ignore them
998 if($ignore_namespace) {
999 $data_name =~ s/^.*\.//;
1000 }
1001 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
1002 $data_name =~ s/^ex\.([^.]+)$/$1/;
1003
1004 push (@metadata, $data->[1]) if ($data_name eq $field);
1005 }
1006
1007 return \@metadata;
1008}
1009
1010sub get_metadata_hashmap {
1011 my $self = shift (@_);
1012 my ($section, $opt_namespace) = @_;
1013
1014 my $section_ptr = $self->_lookup_section($section);
1015 if (!defined $section_ptr) {
1016 print STDERR "doc::get_metadata couldn't find section ",
1017 $section, "\n";
1018 return;
1019 }
1020
1021 my $metadata_hashmap = {};
1022 foreach my $data (@{$section_ptr->{'metadata'}}) {
1023 my $metaname = $data->[0];
1024
1025 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
1026 if (!defined $metadata_hashmap->{$metaname}) {
1027 $metadata_hashmap->{$metaname} = [];
1028 }
1029 my $metaval_list = $metadata_hashmap->{$metaname};
1030 push(@$metaval_list, $data->[1]);
1031 }
1032 }
1033
1034 return $metadata_hashmap;
1035}
1036
1037# returns a list of the form [[field,value],[field,value],...]
1038sub get_all_metadata {
1039 my $self = shift (@_);
1040 my ($section) = @_;
1041
1042 my $section_ptr = $self->_lookup_section($section);
1043 if (!defined $section_ptr) {
1044 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
1045 return;
1046 }
1047
1048 return $section_ptr->{'metadata'};
1049}
1050
1051# $value is optional
1052sub delete_metadata {
1053 my $self = shift (@_);
1054 my ($section, $field, $value) = @_;
1055
1056 my $section_ptr = $self->_lookup_section($section);
1057 if (!defined $section_ptr) {
1058 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
1059 return;
1060 }
1061
1062 my $i = 0;
1063 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1064 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1065 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1066 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1067 } else {
1068 $i++;
1069 }
1070 }
1071}
1072
1073sub delete_all_metadata {
1074 my $self = shift (@_);
1075 my ($section) = @_;
1076
1077 my $section_ptr = $self->_lookup_section($section);
1078 if (!defined $section_ptr) {
1079 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1080 return;
1081 }
1082
1083 $section_ptr->{'metadata'} = [];
1084}
1085
1086sub set_metadata_element {
1087 my $self = shift (@_);
1088 my ($section, $field, $value) = @_;
1089
1090 $self->set_utf8_metadata_element ($section, $field,
1091 &unicode::ascii2utf8(\$value));
1092}
1093
1094# set_utf8_metadata_element assumes the text has already been
1095# converted to the UTF-8 encoding.
1096sub set_utf8_metadata_element {
1097 my $self = shift (@_);
1098 my ($section, $field, $value) = @_;
1099
1100 $self->delete_metadata ($section, $field);
1101 $self->add_utf8_metadata ($section, $field, $value);
1102}
1103
1104
1105# add_metadata assumes the text is in (extended) ascii form. For
1106# text which has already been converted to the UTF-8 format use
1107# add_utf8_metadata.
1108sub add_metadata {
1109 my $self = shift (@_);
1110 my ($section, $field, $value) = @_;
1111
1112 $self->add_utf8_metadata ($section, $field,
1113 &unicode::ascii2utf8(\$value));
1114}
1115
1116sub add_utf8_metadata {
1117 my $self = shift (@_);
1118 my ($section, $field, $value) = @_;
1119
1120 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1121 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1122 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1123
1124 my $section_ptr = $self->_lookup_section($section);
1125 if (!defined $section_ptr) {
1126 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1127 return;
1128 }
1129 if (!defined $value) {
1130 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1131 return;
1132 }
1133 if (!defined $field) {
1134 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1135 return;
1136 }
1137
1138 #print STDERR "###$field=$value\n";
1139
1140 # For now, supress this check. Given that text data read in is now
1141 # Unicode aware, then the following block of code can (ironically enough)
1142 # cause our unicode compliant string to be re-encoded (leading to
1143 # a double-encoded UTF-8 string, which we definitely don't want!).
1144
1145
1146 # double check that the value is utf-8
1147 # if (!&unicode::check_is_utf8($value)) {
1148 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1149 # &unicode::ensure_utf8(\$value);
1150 # print STDERR " Tried converting to utf8: $value\n";
1151 # }
1152
1153 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1154 if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1155 {
1156 my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1157 if (defined $mdprefix) {
1158 # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1159 push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1160 }
1161
1162 my $direction;
1163 if($value =~ m/^-/)
1164 {
1165 $direction = ($field eq "Latitude") ? "S" : "W";
1166 }
1167 else
1168 {
1169 $direction = ($field eq "Latitude") ? "N" : "E";
1170 }
1171
1172 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1173 if(defined $beforeDec && defined $afterDec)
1174 {
1175 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1176 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1177
1178 for(my $i = 2; $i <= 4; $i++)
1179 {
1180 if(length($afterDec) >= $i)
1181 {
1182 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1183 }
1184 }
1185
1186 #Only add the metadata if it has not already been added
1187 my $metaMap = $self->get_metadata_hashmap($section); # metaMap not used and called function has no apparent side-effects,
1188 # but this line appears important for ensuring uniqueness of (Latitude, value) meta
1189 # in the section. Also for LatShort, Longitude, LngShort.
1190 }
1191
1192 if($field =~ m/^(.+\.)?Longitude$/) {
1193 # if we are dealing with Longitude meta, we should 1. have Latitude meta too; 2. already have processed Latitude meta
1194 # in that case, add both Lat and Lng of this section as a Coordinate meta
1195 my $latitude = $self->get_metadata_element ($section, "Latitude");
1196 my $longitude = $value;
1197 # TODO: would like all Longitude meta placed together, followed by all Coordinate meta,
1198 # but the following will add all such coord meta to the doc object and the end of this function will add Longitude meta
1199 $self->processCoordinate($section, $latitude, $longitude);
1200 }
1201 }
1202
1203 elsif($field eq "GPS.mapOverlay") { # then the $value is a JSON string
1204
1205 # In order to allow searching map-data-enriched documents by map shape descriptions,
1206 # and to run rawquery searches for other docs by proximity based on their map data,
1207 # need to store the shape descriptions and Coordinate info for shapes into the text index.
1208 # We add the description for each shape in the mapoverlay into the text index as GPSMapOverlayLabel
1209 # And we add Coordinate (CD) and CoordShort (CS) info for each shape in the mapoverlay in the format (Lat, Lng) as ("37S339 175E342")
1210 # where the digits before the N/S/E/W direction represents the whole number. And the digits after the direction are the
1211 # decimal places which can vary between 0 and 2 to 4 digits.
1212
1213 # However, we only want to process GPS.mapOverlay during buildcol and only in the text indexing passes (e.g. sidx and didx for lucene)
1214 # and certainly never during the infodb pass of buildcol. The latter can end up duplicating Coordinate/CoordShort/GPSMapOverlayLabel
1215 # when rebuilding with the online doc editor, as that runs incremental-rebuild which then calls basebuilder::reconstruct_doc_objs_metadata()
1216 # on all docs NOT being incrementally rebuilt. That call would get meta from the infodb and use it to reconstruct the doc objects of docs
1217 # NOT being incrementally built. If the Coord and Label meta were written to the infodb, they would then be loaded back in memory when the
1218 # collection is incrementally rebuilt for those docs that don't need incremental processing. Then this function would once again add the
1219 # same meta into the infodb, thus duplicating what goes into the infodb. Hence, don't do all the following if doc::processor_mode =~ "infodb",
1220 # or anything other than a text mode.
1221
1222 # Note that for incremental rebuilding, the text pass can be called textreindex for instance (and infodb pass can be incinfodb).
1223 # So don't check for exact string match
1224
1225 if($doc::cmd_line_mode eq "buildcol" && $doc::processor_mode =~ m/^text/) { # currently known text processor_modes:
1226 # text, textreindex, possibly textdelete (see ArchivesInfPlugin.pm::read() for last 2).
1227 # OR: ..."buildcol" && $doc::processor_mode !~ m/infodb$/) # if dummy pass important
1228
1229 ###print STDERR "GPS.mapOverlay before decoding, val = " . $value . "\n";
1230
1231 # TODO: html decode?
1232 $value =~ s@&#091;@[@g;
1233 $value =~ s@&#093;@]@g;
1234 $value =~ s@&quot;@"@g;
1235 ###print STDERR "GPS.mapOverlay after decoding, val = " . $value . "\n";
1236
1237 my $json_array = decode_json $value;
1238
1239 foreach my $shape (@$json_array) {
1240
1241 # Put each available shape description/label into this section's metadata with GPSMapOverlayLabel as metaname.
1242 # Just as for Coordinate meta, don't need to know which shape a label belongs too. This is just so each label
1243 # will be indexed, and therefore can be searched.
1244
1245 my $description = $shape->{"description"};
1246 if($description) {
1247 push (@{$section_ptr->{'metadata'}}, ["GPSMapOverlayLabel", $description]);
1248 ###print STDERR "@@@@############################################ Just added description meta: " . $description . "\n";
1249 }
1250
1251 my $type = $shape->{"type"};
1252 ###print STDERR "Shape type : " . $type . "\n";
1253
1254 if($type eq "circle") {
1255 ###print STDERR "Found a circle:\n" . &printShape($json, $shape);
1256
1257 # work out bounding box
1258 # want the inverse of this useful page:
1259 # https://stackoverflow.com/questions/639695/how-to-convert-latitude-or-longitude-to-meters
1260 # https://www.geodatasource.com/developers/javascript
1261
1262 # Dr Bainbridge wants us to use: https://gis.stackexchange.com/questions/5821/calculating-latitude-longitude-x-miles-from-point
1263 # Since for N,E,S,W the following works out the same as above link, we're using the rule of thumb in the following:
1264 # https://gis.stackexchange.com/questions/2951/algorithm-for-offsetting-a-latitude-longitude-by-some-amount-of-meters
1265 # which states
1266 # "If your displacements aren't too great (less than a few kilometers) and you're not right at the poles,
1267 # use the quick and dirty estimate that 111,111 meters (111.111 km) in the y direction is 1 degree
1268 # (of latitude) and 111,111 * cos(latitude) meters in the x direction is 1 degree (of longitude)."
1269 my $centre_lat = $shape->{"center"}->{"lat"};
1270 my $centre_lng = $shape->{"center"}->{"lng"};
1271 my $radius = $shape->{"radius"}; # in metres!
1272
1273 ###print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
1274
1275 my $lat_north = $centre_lat + ($radius/111111);
1276 my $lat_south = $centre_lat - ($radius/111111);
1277
1278 ###print STDERR "### lat_north: $lat_north\n";
1279 ###print STDERR "### lat_south: $lat_south\n";
1280
1281 # our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
1282 # all expect their angle to be in radians. So need to convert from degree to radians before we can take the cos of it.
1283 my $centre_lat_radians = $self->degreesToRadians($centre_lat);
1284 my $cos_in_radians = cos($centre_lat_radians);
1285 ###print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
1286 my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
1287 my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
1288 ###print STDERR "### lng_east $lng_east\n";
1289 ###print STDERR "### lng_west $lng_west\n";
1290
1291 my $cos_lat = cos($centre_lat);
1292 ###print STDERR "cos $centre_lat is $cos_lat\n";
1293
1294 $self->processCoordinate($section, $lat_north, $lng_east);
1295 $self->processCoordinate($section, $lat_south, $lng_east);
1296 $self->processCoordinate($section, $lat_south, $lng_west);
1297 $self->processCoordinate($section, $lat_north, $lng_west);
1298
1299 }
1300 elsif ($type eq "marker") {
1301 ###print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
1302 ###print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
1303 $self->processCoordinate($section, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});
1304 }
1305 elsif ($type eq "polyline" || $type eq "polygon") {
1306 my $path_array = $shape->{"path"};
1307 foreach my $position (@$path_array) {
1308 $self->processCoordinate($section, $position->{"lat"}, $position->{"lng"});
1309 }
1310 }
1311 elsif ($type eq "rectangle") {
1312
1313 my $bounds = $shape->{"bounds"};
1314 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"east"});
1315 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"east"});
1316 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"west"});
1317 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"west"});
1318 }
1319
1320 } # end for on each shape in GPS.mapOverlay
1321 } # end if(buildcol and text pass)
1322 } # end GPS.mapOverlay meta
1323
1324 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1325}
1326
1327# https://en.wikipedia.org/wiki/Radian
1328sub degreesToRadians
1329{
1330 my $self = shift (@_);
1331 my ($degrees) = @_;
1332
1333 return $degrees * pi /180; # returns radians
1334}
1335
1336sub radiansToDegrees
1337{
1338 my $self = shift (@_);
1339 my ($radians) = @_;
1340
1341 return $radians * 180 / pi; # returns degrees
1342}
1343
1344# Call as:
1345# my $json = JSON->new->allow_nonref;
1346# &printAllShapes($json, $json_array);
1347sub printAllShapes {
1348 my ($json, $json_array) = @_;
1349
1350
1351 #my $pretty_print_shape = $json->pretty->encode( $json_array->[0] );
1352 foreach my $shape (@$json_array) {
1353 my $pretty_print_shape = $json->pretty->encode( $shape );
1354 print STDERR "Shape: $pretty_print_shape\n";
1355 }
1356
1357}
1358
1359# For the (lat, lng) coordinate given,
1360# attaches Coordinate and multiple CoordShort (different precision level) metadata to the doc object
1361sub processCoordinate {
1362 my $self = shift (@_);
1363 my ($section, $latitude, $longitude) = @_;
1364
1365 my $section_ptr = $self->_lookup_section($section);
1366
1367 my $lat_direction = ($latitude =~ m/^-/) ? "S" : "N";
1368 my $lng_direction = ($longitude =~ m/^-/) ? "W" : "E";
1369
1370 # have to store (lat, lng) in pairs, when there are so many coords to store
1371 #push (@{$section_ptr->{'metadata'}}, ["Latitude", $latitude]);
1372 #push (@{$section_ptr->{'metadata'}}, ["Longitude", $longitude]);
1373
1374 push (@{$section_ptr->{'metadata'}}, ["Coordinate", "$latitude $longitude"]); # "$latitude$lat_direction $longitude$lng_direction"
1375
1376 my ($latBeforeDec, $latAfterDec);
1377 my ($lngBeforeDec, $lngAfterDec);
1378
1379 if($latitude !~ m/\./) {
1380 $latBeforeDec = $latitude;
1381 $latAfterDec = "";
1382 } else {
1383 ($latBeforeDec, $latAfterDec) = ($latitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1384 }
1385 if($longitude !~ m/\./) {
1386 $lngBeforeDec = $longitude;
1387 $lngAfterDec = "";
1388 } else {
1389 ($lngBeforeDec, $lngAfterDec) = ($longitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1390 }
1391
1392
1393 my $name = "CoordShort";
1394 push (@{$section_ptr->{'metadata'}}, [$name, "$latBeforeDec$lat_direction $lngBeforeDec$lng_direction"]);
1395
1396 for(my $i = 2; $i <= 4; $i++)
1397 {
1398 my $latDecPlaces = (length($latAfterDec) >= $i) ? substr($latAfterDec, 0, $i) : "";
1399 my $lngDecPlaces = (length($lngAfterDec) >= $i) ? substr($lngAfterDec, 0, $i) : "";
1400
1401 push (@{$section_ptr->{'metadata'}}, [$name,
1402 $latBeforeDec . $lat_direction. $latDecPlaces . " " . $lngBeforeDec . $lng_direction. $lngDecPlaces]);
1403
1404 }
1405
1406 #Only add the metadata if it has not already been added
1407 #my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no apparent side-effects.)
1408}
1409
1410
1411# methods for dealing with text
1412
1413# returns the text for a section
1414sub get_text {
1415 my $self = shift (@_);
1416 my ($section) = @_;
1417
1418 my $section_ptr = $self->_lookup_section($section);
1419 if (!defined $section_ptr) {
1420 print STDERR "doc::get_text couldn't find section " .
1421 "$section\n";
1422 return "";
1423 }
1424
1425 return $section_ptr->{'text'};
1426}
1427
1428# returns the (utf-8 encoded) length of the text for a section
1429sub get_text_length {
1430 my $self = shift (@_);
1431 my ($section) = @_;
1432
1433 my $section_ptr = $self->_lookup_section($section);
1434 if (!defined $section_ptr) {
1435 print STDERR "doc::get_text_length couldn't find section " .
1436 "$section\n";
1437 return 0;
1438 }
1439
1440 return length ($section_ptr->{'text'});
1441}
1442
1443# returns the total length for all the sections
1444sub get_total_text_length {
1445 my $self = shift (@_);
1446
1447 my $section = $self->get_top_section();
1448 my $length = 0;
1449 while (defined $section) {
1450 $length += $self->get_text_length($section);
1451 $section = $self->get_next_section($section);
1452 }
1453 return $length;
1454}
1455
1456sub delete_text {
1457 my $self = shift (@_);
1458 my ($section) = @_;
1459
1460 my $section_ptr = $self->_lookup_section($section);
1461 if (!defined $section_ptr) {
1462 print STDERR "doc::delete_text couldn't find section " .
1463 "$section\n";
1464 return;
1465 }
1466
1467 $section_ptr->{'text'} = "";
1468}
1469
1470# add_text assumes the text is in (extended) ascii form. For
1471# text which has been already converted to the UTF-8 format
1472# use add_utf8_text.
1473sub add_text {
1474 my $self = shift (@_);
1475 my ($section, $text) = @_;
1476
1477 # convert the text to UTF-8 encoded unicode characters
1478 # and add the text
1479 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1480}
1481
1482
1483# add_utf8_text assumes the text to be added has already
1484# been converted to the UTF-8 encoding. For ascii text use
1485# add_text
1486# Pass by value version (internally calls pass by ref version
1487# to avoid code duplication)
1488sub add_utf8_text {
1489 my $self = shift (@_);
1490 my ($section, $text) = @_;
1491
1492 $self->add_utf8_textref($section, \$text);
1493}
1494
1495# Pass by reference version, used by GreenstoneSQLPlugin for fulltext
1496sub add_utf8_textref {
1497 my $self = shift (@_);
1498 my ($section, $text_ref) = @_;
1499
1500 my $section_ptr = $self->_lookup_section($section);
1501 if (!defined $section_ptr) {
1502 print STDERR "doc::add_utf8_textref couldn't find section " .
1503 "$section\n";
1504 return;
1505 }
1506
1507 $section_ptr->{'text'} .= $$text_ref;
1508}
1509
1510# returns the Source meta, which is the utf8 filename generated.
1511# Added a separate method here for convenience
1512sub get_source {
1513 my $self = shift (@_);
1514 return $self->get_metadata_element ($self->get_top_section(), "Source");
1515}
1516
1517# returns the SourceFile meta, which is the url reference to the URL-encoded
1518# version of Source (the utf8 filename). Added a separate method here for convenience
1519sub get_sourcefile {
1520 my $self = shift (@_);
1521 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1522}
1523
1524# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1525# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1526sub get_assocfile_from_sourcefile {
1527 my $self = shift (@_);
1528
1529 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1530 my $top_section = $self->get_top_section();
1531 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1532
1533 # get the actual filename as it exists on the filesystem which this url refers to
1534 $source_file = &unicode::url_to_filename($source_file);
1535 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1536 return $assocfilename;
1537}
1538
1539# methods for dealing with associated files
1540
1541# a file is associated with a document, NOT a section.
1542# if section is defined it is noted in the data structure
1543# only so that files associated from a particular section
1544# may be removed later (using delete_section_assoc_files)
1545sub associate_file {
1546 my $self = shift (@_);
1547 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1548 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1549
1550 # remove all associated files with the same name
1551 $self->delete_assoc_file ($assoc_filename);
1552
1553 # Too harsh a requirement
1554 # Definitely get HTML docs, for example, with some missing
1555 # support files
1556# if (!&util::fd_exists($real_filename)) {
1557# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1558# exit -1;
1559# }
1560
1561# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1562# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1563## my $utf8_filename = Encode::encode("utf8",$filename);
1564
1565 push (@{$self->{'associated_files'}},
1566 [$real_filename, $assoc_filename, $mime_type, $section]);
1567}
1568
1569# returns a list of associated files in the form
1570# [[real_filename, assoc_filename, mimetype], ...]
1571sub get_assoc_files {
1572 my $self = shift (@_);
1573
1574 return $self->{'associated_files'};
1575}
1576
1577# the following two methods used to keep track of original associated files
1578# for incremental building. eg a txt file used by an item file does not end
1579# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1580# file for incremental build
1581sub associate_source_file {
1582 my $self = shift (@_);
1583 my ($full_filename) = @_;
1584
1585 push (@{$self->{'source_assoc_files'}}, $full_filename);
1586
1587}
1588
1589sub get_source_assoc_files {
1590 my $self = shift (@_);
1591
1592 return $self->{'source_assoc_files'};
1593
1594
1595}
1596sub metadata_file {
1597 my $self = shift (@_);
1598 my ($real_filename, $filename) = @_;
1599
1600 push (@{$self->{'metadata_files'}},
1601 [$real_filename, $filename]);
1602}
1603
1604# used for writing out the archiveinf-doc info database, to list all the metadata files
1605sub get_meta_files {
1606 my $self = shift (@_);
1607
1608 return $self->{'metadata_files'};
1609}
1610
1611sub delete_section_assoc_files {
1612 my $self = shift (@_);
1613 my ($section) = @_;
1614
1615 my $i=0;
1616 while ($i < scalar (@{$self->{'associated_files'}})) {
1617 if (defined $self->{'associated_files'}->[$i]->[3] &&
1618 $self->{'associated_files'}->[$i]->[3] eq $section) {
1619 splice (@{$self->{'associated_files'}}, $i, 1);
1620 } else {
1621 $i++;
1622 }
1623 }
1624}
1625
1626sub delete_assoc_file {
1627 my $self = shift (@_);
1628 my ($assoc_filename) = @_;
1629
1630 my $i=0;
1631 while ($i < scalar (@{$self->{'associated_files'}})) {
1632 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1633 splice (@{$self->{'associated_files'}}, $i, 1);
1634 } else {
1635 $i++;
1636 }
1637 }
1638}
1639
1640sub reset_nextsection_ptr {
1641 my $self = shift (@_);
1642 my ($section) = @_;
1643
1644 my $section_ptr = $self->_lookup_section($section);
1645 $section_ptr->{'next_subsection'} = 1;
1646}
1647
16481;
Note: See TracBrowser for help on using the repository browser.