source: main/trunk/greenstone2/perllib/doc.pm@ 33185

Last change on this file since 33185 was 33185, checked in by wy59, 5 years ago
  1. Corrections to map-scripts.js code (e.g. a var had been omitted from a declaration). Reinstated Dr Bainbridge's setting of the bounds during updateMap, as his way preserved the cumulative bounds we were building up and the recently committed 'improvement' had actually tossed this important bit of code away. 3. Major change to doc.pm to store all combinations of Lat and Lng components of CoordShort, since each Lat and Lng can have a different number of decimal places. We need to store all possible combinations since searching requires that, as searching can and does in fact have unequal number of dec places for coordinates (that end up using CoordShort) on occasion. Without this fix, shapes of nearby docs/subsections don't get returned in such cases.
  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 49.8 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use FileUtils;
40use ghtml;
41use File::stat;
42##use hashdoc;
43use docprint;
44use JSON;
45
46# We just need pi from the Trig lib
47# Import constants pi2, pip2, pip4 (2*pi, pi/2, pi/4).
48use Math::Trig ':pi';
49
50# the document type may be indexed_doc, nonindexed_doc, or
51# classification
52
53our $OIDcount = 0;
54
55
56# Declare the Package (level) Variable that will tell us whether we're in the import or buildcol phase.
57# This var will be set by import.pl, buildcol.pl and export.pl to the then appropriate value.
58# https://perlmaven.com/package-variables-and-lexical-variables-in-perl
59# But: https://stackoverflow.com/questions/7542120/perl-how-to-make-variables-from-requiring-script-available-in-required-script
60# https://stackoverflow.com/questions/33230015/how-do-i-use-a-variable-in-another-perl-script
61#$doc::cmd_line_mode = undef;
62# Decided to follow the usage of $OIDcount above to declare the necessary package-level variables 'globally' accessible
63# actually accessible only for those packages that import this one (with 'use doc')
64our $cmd_line_mode = undef;
65
66# rename_method can be 'url', 'none', 'base64'
67sub new {
68 my $class = shift (@_);
69 my ($source_filename, $doc_type, $rename_method) = @_;
70
71
72 my $self = bless {'associated_files'=>[],
73 'subsection_order'=>[],
74 'next_subsection'=>1,
75 'subsections'=>{},
76 'metadata'=>[],
77 'text'=>"",
78 'OIDtype'=>"hash"}, $class;
79
80 # used to set lastmodified here, but this can screw up the HASH ids, so
81 # the docsave processor now calls set_lastmodified
82
83 $self->set_source_path($source_filename);
84
85 if (defined $source_filename) {
86 $source_filename = &util::filename_within_collection($source_filename);
87 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
88 $self->set_source_filename ($source_filename, $rename_method);
89 }
90
91 $self->set_doc_type ($doc_type) if defined $doc_type;
92
93 return $self;
94}
95
96
97sub set_source_path
98{
99 my $self = shift @_;
100 my ($source_filename) = @_;
101
102 if (defined $source_filename) {
103 # On Windows the source_filename can be in terse DOS format
104 # e.g. test~1.txt
105
106 $self->{'terse_source_path'} = $source_filename;
107
108 # Use the FileUtil library methods as they are aware of more special
109 # cases such as HDFS [jmt12]
110 if (&FileUtils::fileExists($source_filename))
111 {
112 # See if we can do better for Windows with a filename
113 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
114 require Win32;
115 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
116 }
117 else {
118 # For Unix-based systems, there is no difference between the two
119 $self->{'source_path'} = $source_filename;
120 }
121 }
122 else {
123 print STDERR "Warning: In doc::set_source_path(), file\n";
124 print STDERR " $source_filename\n";
125 print STDERR " does not exist\n";
126
127 # (default) Set it to whatever we were given
128 $self->{'source_path'} = $source_filename;
129 }
130 }
131 else {
132 # Previous code for setting source_path allowed for
133 # it to be undefined, so continue this practice
134 $self->{'terse_source_path'} = undef;
135 $self->{'source_path'} = undef;
136 }
137}
138
139
140sub get_source_path
141{
142 my $self = shift @_;
143
144 return $self->{'terse_source_path'};
145}
146
147# set lastmodified for OAI purposes, added by GRB, moved by kjdon
148sub set_oailastmodified {
149 my $self = shift (@_);
150
151 my $source_path = $self->{'terse_source_path'};
152
153 if (defined $source_path && (-e $source_path)) {
154 my $current_time = time;
155
156 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
157 $wday, $yday, $isdst) = localtime($current_time);
158
159 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
160
161 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
162 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
163 }
164}
165
166# no longer used for OAI purposes, since lastmodified is not what we want as the
167# Datestamp of a document. This doc metadata may be useful for general purposes.
168sub set_lastmodified {
169 my $self = shift (@_);
170
171 my $source_path = $self->{'terse_source_path'};
172
173 if (defined $source_path && (-e $source_path)) {
174
175 my $file_stat = stat($source_path);
176 my $mtime = $file_stat->mtime;
177 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
178 $wday, $yday, $isdst) = localtime($mtime);
179
180 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
181
182 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
183 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
184 }
185}
186
187# clone the $self object
188sub duplicate {
189 my $self = shift (@_);
190
191 my $newobj = {};
192
193 foreach my $k (keys %$self) {
194 $newobj->{$k} = &clone ($self->{$k});
195 }
196
197 bless $newobj, ref($self);
198 return $newobj;
199}
200
201sub clone {
202 my ($from) = @_;
203 my $type = ref ($from);
204
205 if ($type eq "HASH") {
206 my $to = {};
207 foreach my $key (keys %$from) {
208 $to->{$key} = &clone ($from->{$key});
209 }
210 return $to;
211 } elsif ($type eq "ARRAY") {
212 my $to = [];
213 foreach my $v (@$from) {
214 push (@$to, &clone ($v));
215 }
216 return $to;
217 } else {
218 return $from;
219 }
220}
221
222sub set_OIDtype {
223 my $self = shift (@_);
224 my ($type, $metadata) = @_;
225
226 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
227 $self->{'OIDtype'} = $type;
228 } else {
229 $self->{'OIDtype'} = "hash";
230 }
231
232 if ($type =~ /^assigned$/) {
233 if (defined $metadata) {
234 $self->{'OIDmetadata'} = $metadata;
235 } else {
236 $self->{'OIDmetadata'} = "dc.Identifier";
237 }
238 }
239}
240
241# rename_method can be 'url', 'none', 'base64'
242sub set_source_filename {
243 my $self = shift (@_);
244 my ($source_filename, $rename_method) = @_;
245
246 # Since the gsdlsourcefilename element goes into the doc.xml it has
247 # to be utf8. However, it should also *represent* the source filename
248 # (in the import directory) which may not be utf8 at all.
249 # For instance, if this meta element (gsdlsourcefilename) will be used
250 # by other applications that parse doc.xml in order to locate
251 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
252 # encode the real filename as this is a binary-to-text encoding meaning
253 # that the resulting string is ASCII (utf8). Decoding will give the original.
254
255# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
256
257 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
258 $source_filename = $self->encode_filename($source_filename, $rename_method);
259# my ($srcfilename,$dirname,$suffix)
260# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
261# print STDERR "-> $srcfilename -> ";
262# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
263# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
264# print STDERR "$source_filename\n";
265
266 $self->set_utf8_metadata_element ($self->get_top_section(),
267 "gsdlsourcefilename",
268 $source_filename);
269}
270
271sub encode_filename {
272 my $self = shift (@_);
273 my ($source_filename, $rename_method) = @_;
274
275 my ($srcfilename,$dirname,$suffix)
276 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
277# print STDERR "-> $srcfilename -> ";
278 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
279 $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
280
281 return $source_filename;
282}
283
284sub set_converted_filename {
285 my $self = shift (@_);
286 my ($converted_filename) = @_;
287
288 # we know the converted filename is utf8
289 $self->set_utf8_metadata_element ($self->get_top_section(),
290 "gsdlconvertedfilename",
291 $converted_filename);
292}
293
294# returns the source_filename as it was provided
295sub get_unmodified_source_filename {
296 my $self = shift (@_);
297
298 return $self->{'terse_source_path'};
299}
300
301# returns the source_filename with whatever rename_method was given
302sub get_source_filename {
303 my $self = shift (@_);
304
305 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
306}
307
308
309
310# returns converted filename if available else returns source filename
311sub get_filename_for_hashing {
312 my $self = shift (@_);
313
314 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
315
316 if (!defined $filename) {
317 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
318 # if NULPlug processed file, then don't give a filename
319 if (defined $plugin_name && $plugin_name eq "NULPlug") {
320 $filename = undef;
321 } else { # returns the URL encoded source filename!
322 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
323 }
324 }
325
326 if (!&FileUtils::isFilenameAbsolute($filename)) {
327 $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
328 }
329
330 return $filename;
331}
332
333sub set_doc_type {
334 my $self = shift (@_);
335 my ($doc_type) = @_;
336
337 $self->set_metadata_element ($self->get_top_section(),
338 "gsdldoctype",
339 $doc_type);
340}
341
342# returns the gsdldoctype as it was provided
343# the default of "indexed_doc" is used if no document
344# type was provided
345sub get_doc_type {
346 my $self = shift (@_);
347
348 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
349 return $doc_type if (defined $doc_type);
350 return "indexed_doc";
351}
352
353
354# look up the reference to the a particular section
355sub _lookup_section {
356 my $self = shift (@_);
357 my ($section) = @_;
358
359 my ($num);
360 my $sectionref = $self;
361
362 while (defined $section && $section ne "") {
363
364 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
365
366 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
367
368 $section = "" unless defined $section;
369
370
371 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
372 $sectionref = $sectionref->{'subsections'}->{$num};
373 } else {
374 return undef;
375 }
376 }
377
378 return $sectionref;
379}
380
381# calculate OID by hashing the contents of the document
382sub _calc_OID {
383 my $self = shift (@_);
384 my ($filename) = @_;
385
386
387 my $osexe = &util::get_os_exe();
388
389 my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
390 $ENV{'GSDLOS'},"hashfile$osexe");
391
392 &util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
393 # The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
394
395 # A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
396 # (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
397 # E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
398
399 my $result = "NULL";
400
401
402 if (-e "$hashfile_exe") {
403# $result = `\"$hashfile_exe\" \"$filename\"`;
404# $result = `hashfile$osexe \"$filename\" 2>&1`;
405 $result = `hashfile$osexe \"$filename\"`;
406
407 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
408 } else {
409 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
410 }
411 return "HASH$result";
412}
413
414# methods dealing with OID, not groups of them.
415
416# if $OID is not provided one is calculated
417sub set_OID {
418 my $self = shift (@_);
419 my ($OID) = @_;
420
421 my $use_hash_oid = 0;
422 # if an OID wasn't provided calculate one
423 if (!defined $OID) {
424 $OID = "NULL";
425 if ($self->{'OIDtype'} =~ /^hash/) {
426 $use_hash_oid = 1;
427 } elsif ($self->{'OIDtype'} eq "incremental") {
428 $OID = "D" . $OIDcount;
429 $OIDcount ++;
430 } elsif ($self->{'OIDtype'} eq "filename") {
431 my $filename = $self->get_source_filename();
432 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
433 $OID = &util::tidy_up_oid($OID);
434 } elsif ($self->{'OIDtype'} eq "full_filename") {
435 my $source_filename = $self->get_source_filename();
436 my $dirsep = &util::get_os_dirsep();
437
438 $source_filename =~ s/^import$dirsep//;
439 $source_filename =~ s/$dirsep/-/g;
440 $source_filename =~ s/\./_/g;
441
442 $OID = $source_filename;
443 $OID = &util::tidy_up_oid($OID);
444 } elsif ($self->{'OIDtype'} eq "dirname") {
445 my $filename = $self->get_source_filename();
446 if (defined($filename)) { # && -e $filename) {
447 # get the immediate parent directory
448 $OID = &File::Basename::dirname($filename);
449 if (defined $OID) {
450 $OID = &File::Basename::basename($OID);
451 $OID = &util::tidy_up_oid($OID);
452 } else {
453 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
454 $use_hash_oid = 1;
455 }
456 } else {
457 print STDERR "Failed to find a filename, generating hash id\n";
458 $use_hash_oid = 1;
459 }
460
461 } elsif ($self->{'OIDtype'} eq "assigned") {
462 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
463 if (defined $identifier && $identifier ne "") {
464 $OID = $identifier;
465 $OID = &util::tidy_up_oid($OID);
466 } else {
467 # need a hash id
468 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
469 $use_hash_oid = 1;
470 }
471
472 } else {
473 $use_hash_oid = 1;
474 }
475
476 if ($use_hash_oid) {
477 my $hash_on_file = 1;
478 my $hash_on_ga_xml = 0;
479
480 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
481 $hash_on_file = 0;
482 $hash_on_ga_xml = 1;
483 }
484
485 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
486 $hash_on_file = 0;
487 $hash_on_ga_xml = 0;
488
489 my $source_filename = $self->get_source_filename();
490 my $dirsep = &util::get_os_dirsep();
491
492 $source_filename =~ s/^import$dirsep//;
493 $source_filename =~ s/$dirsep/-/g;
494 $source_filename =~ s/\./_/g;
495
496 # If the filename is very short then (handled naively)
497 # this can cause conjestion in the hash-values
498 # computed, leading documents sharing the same leading
499 # Hex values in the computed has.
500 #
501 # The solution taken here is to replace the name of
502 # the file name a sufficient number of times (up to
503 # the character limit defined in 'rep_limit' and
504 # make that the content that is hashed on
505
506 # *** Think twice before changing the following value
507 # as it will break backward compatability of computed
508 # document HASH values
509
510 my $rep_limit = 256;
511 my $hash_content = undef;
512
513 if (length($source_filename)<$rep_limit) {
514 my $rep_string = "$source_filename|";
515 my $rs_len = length($rep_string);
516
517 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
518
519 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
520 }
521 else {
522 $hash_content = $source_filename;
523 }
524
525 my $filename = &util::get_tmp_filename();
526 if (!open (OUTFILE, ">:utf8", $filename)) {
527 print STDERR "doc::set_OID could not write to $filename\n";
528 } else {
529 print OUTFILE $hash_content;
530 close (OUTFILE);
531 }
532 $OID = $self->_calc_OID ($filename);
533
534 &FileUtils::removeFiles ($filename);
535 }
536
537 if ($hash_on_file) {
538 # "hash" OID - feed file to hashfile.exe
539 my $filename = $self->get_filename_for_hashing();
540
541 # -z: don't want to hash on the file if it is zero size
542 if (defined($filename) && -e $filename && !-z $filename) {
543 $OID = $self->_calc_OID ($filename);
544 } else {
545 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
546 }
547 }
548
549 if ($hash_on_ga_xml) {
550 # In addition being asked to explicity calculate the has based on the GA file,
551 # can also end up coming into this block is doing 'hash_on_file' but the file
552 # itself is of zero bytes (as could be the case with 'doc.nul' file
553
554 my $filename = &util::get_tmp_filename();
555 if (!open (OUTFILE, ">:utf8", $filename)) {
556 print STDERR "doc::set_OID could not write to $filename\n";
557 } else {
558 my $doc_text = &docprint::get_section_xml($self);
559 print OUTFILE $doc_text;
560 close (OUTFILE);
561 }
562 $OID = $self->_calc_OID ($filename);
563 &FileUtils::removeFiles($filename);
564 }
565 }
566 }
567 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
568}
569
570# this uses hashdoc (embedded c thingy) which is faster but still
571# needs a little work to be sufficiently stable
572sub ___set_OID {
573 my $self = shift (@_);
574 my ($OID) = @_;
575
576 # if an OID wasn't provided then calculate hash value based on document
577 if (!defined $OID)
578 {
579 my $hash_text = &docprint::get_section_xml($self);
580 my $hash_len = length($hash_text);
581
582 $OID = &hashdoc::buffer($hash_text,$hash_len);
583 }
584
585 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
586}
587
588# returns the OID for this document
589sub get_OID {
590 my $self = shift (@_);
591 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
592 return $OID if (defined $OID);
593 return "NULL";
594}
595
596sub delete_OID {
597 my $self = shift (@_);
598
599 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
600}
601
602
603# methods for manipulating section names
604
605# returns the name of the top-most section (the top
606# level of the document
607sub get_top_section {
608 my $self = shift (@_);
609
610 return "";
611}
612
613# returns a section
614sub get_parent_section {
615 my $self = shift (@_);
616 my ($section) = @_;
617
618 $section =~ s/(^|\.)\d+$//;
619
620 return $section;
621}
622
623# returns the first child section (or the end child
624# if there isn't any)
625sub get_begin_child {
626 my $self = shift (@_);
627 my ($section) = @_;
628
629 my $section_ptr = $self->_lookup_section($section);
630 return "" unless defined $section_ptr;
631
632 if (defined $section_ptr->{'subsection_order'}->[0]) {
633 return "$section.$section_ptr->{'subsection_order'}->[0]";
634 }
635
636 return $self->get_end_child ($section);
637}
638
639# returns the next child of a parent section
640sub get_next_child {
641 my $self = shift (@_);
642 my ($section) = @_;
643
644 my $parent_section = $self->get_parent_section($section);
645 my $parent_section_ptr = $self->_lookup_section($parent_section);
646 return undef unless defined $parent_section_ptr;
647
648 my ($section_num) = $section =~ /(\d+)$/;
649 return undef unless defined $section_num;
650
651 my $i = 0;
652 my $section_order = $parent_section_ptr->{'subsection_order'};
653 while ($i < scalar(@$section_order)) {
654 last if $section_order->[$i] eq $section_num;
655 $i++;
656 }
657
658 $i++; # the next child
659 if ($i < scalar(@$section_order)) {
660 return $section_order->[$i] if $parent_section eq "";
661 return "$parent_section.$section_order->[$i]";
662 }
663
664 # no more sections in this level
665 return undef;
666}
667
668# returns a reference to a list of children
669sub get_children {
670 my $self = shift (@_);
671 my ($section) = @_;
672
673 my $section_ptr = $self->_lookup_section($section);
674 return [] unless defined $section_ptr;
675
676 my @children = @{$section_ptr->{'subsection_order'}};
677
678 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
679 return \@children;
680}
681
682# returns the child section one past the last one (which
683# is coded as "0")
684sub get_end_child {
685 my $self = shift (@_);
686 my ($section) = @_;
687
688 return $section . ".0" unless $section eq "";
689 return "0";
690}
691
692# returns the next section in book order
693sub get_next_section {
694 my $self = shift (@_);
695 my ($section) = @_;
696
697 return undef unless defined $section;
698
699 my $section_ptr = $self->_lookup_section($section);
700 return undef unless defined $section_ptr;
701
702 # first try to find first child
703 if (defined $section_ptr->{'subsection_order'}->[0]) {
704 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
705 return "$section.$section_ptr->{'subsection_order'}->[0]";
706 }
707
708 do {
709 # try to find sibling
710 my $next_child = $self->get_next_child ($section);
711 return $next_child if (defined $next_child);
712
713 # move up one level
714 $section = $self->get_parent_section ($section);
715 } while $section =~ /\d/;
716
717 return undef;
718}
719
720sub is_leaf_section {
721 my $self = shift (@_);
722 my ($section) = @_;
723
724 my $section_ptr = $self->_lookup_section($section);
725 return 1 unless defined $section_ptr;
726
727 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
728}
729
730# methods for dealing with sections
731
732# returns the name of the inserted section
733sub insert_section {
734 my $self = shift (@_);
735 my ($before_section) = @_;
736
737 # get the child to insert before and its parent section
738 my $parent_section = "";
739 my $before_child = "0";
740 my @before_section = split (/\./, $before_section);
741 if (scalar(@before_section) > 0) {
742 $before_child = pop (@before_section);
743 $parent_section = join (".", @before_section);
744 }
745
746 my $parent_section_ptr = $self->_lookup_section($parent_section);
747 if (!defined $parent_section_ptr) {
748 print STDERR "doc::insert_section couldn't find parent section " .
749 "$parent_section\n";
750 return;
751 }
752
753 # get the next section number
754 my $section_num = $parent_section_ptr->{'next_subsection'}++;
755
756 my $i = 0;
757 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
758 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
759 $i++;
760 }
761
762 # insert the section number into the order list
763 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
764
765 # add this section to the parent section
766 my $section_ptr = {'subsection_order'=>[],
767 'next_subsection'=>1,
768 'subsections'=>{},
769 'metadata'=>[],
770 'text'=>""};
771 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
772
773 # work out the full section number
774 my $section = $parent_section;
775 $section .= "." unless $section eq "";
776 $section .= $section_num;
777
778 return $section;
779}
780
781# creates a pre-named section
782sub create_named_section {
783 my $self = shift (@_);
784 my ($mastersection) = @_;
785
786 my ($num);
787 my $section = $mastersection;
788 my $sectionref = $self;
789
790 while ($section ne "") {
791 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
792 $num =~ s/^0+(\d)/$1/; # remove leading 0s
793 $section = "" unless defined $section;
794
795 if (defined $num) {
796 if (!defined $sectionref->{'subsections'}->{$num}) {
797 push (@{$sectionref->{'subsection_order'}}, $num);
798 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
799 'next_subsection'=>1,
800 'subsections'=>{},
801 'metadata'=>[],
802 'text'=>""};
803 if ($num >= $sectionref->{'next_subsection'}) {
804 $sectionref->{'next_subsection'} = $num + 1;
805 }
806 }
807 $sectionref = $sectionref->{'subsections'}->{$num};
808
809 } else {
810 print STDERR "doc::create_named_section couldn't create section ";
811 print STDERR "$mastersection\n";
812 last;
813 }
814 }
815}
816
817# returns a reference to a list of subsections
818sub list_subsections {
819 my $self = shift (@_);
820 my ($section) = @_;
821
822 my $section_ptr = $self->_lookup_section ($section);
823 if (!defined $section_ptr) {
824 print STDERR "doc::list_subsections couldn't find section $section\n";
825 return [];
826 }
827
828 return [@{$section_ptr->{'subsection_order'}}];
829}
830
831sub delete_section {
832 my $self = shift (@_);
833 my ($section) = @_;
834
835# my $section_ptr = {'subsection_order'=>[],
836# 'next_subsection'=>1,
837# 'subsections'=>{},
838# 'metadata'=>[],
839# 'text'=>""};
840
841 # if this is the top section reset everything
842 if ($section eq "") {
843 $self->{'subsection_order'} = [];
844 $self->{'subsections'} = {};
845 $self->{'metadata'} = [];
846 $self->{'text'} = "";
847 return;
848 }
849
850 # find the parent of the section to delete
851 my $parent_section = "";
852 my $child = "0";
853 my @section = split (/\./, $section);
854 if (scalar(@section) > 0) {
855 $child = pop (@section);
856 $parent_section = join (".", @section);
857 }
858
859 my $parent_section_ptr = $self->_lookup_section($parent_section);
860 if (!defined $parent_section_ptr) {
861 print STDERR "doc::delete_section couldn't find parent section " .
862 "$parent_section\n";
863 return;
864 }
865
866 # remove this section from the subsection_order list
867 my $i = 0;
868 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
869 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
870 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
871 last;
872 }
873 $i++;
874 }
875
876 # remove this section from the subsection hash
877 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
878 undef $parent_section_ptr->{'subsections'}->{$child};
879 }
880}
881
882#--
883# methods for dealing with metadata
884
885# set_metadata_element and get_metadata_element are for metadata
886# which should only have one value. add_meta_data and get_metadata
887# are for metadata which can have more than one value.
888
889# returns the first metadata value which matches field
890
891# This version of get metadata element works much like the one above,
892# except it allows for the namespace portion of a metadata element to
893# be ignored, thus if you are searching for dc.Title, the first piece
894# of matching metadata ending with the name Title (once any namespace
895# is removed) would be returned.
896# 28-11-2003 John Thompson
897sub get_metadata_element {
898 my $self = shift (@_);
899 my ($section, $field, $ignore_namespace) = @_;
900 my ($data);
901
902 $ignore_namespace = 0 unless defined $ignore_namespace;
903
904 my $section_ptr = $self->_lookup_section($section);
905 if (!defined $section_ptr) {
906 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
907 return;
908 }
909
910 # Remove any namespace if we are being told to ignore them
911 if($ignore_namespace) {
912 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
913 }
914
915 foreach $data (@{$section_ptr->{'metadata'}}) {
916
917 my $data_name = $data->[0];
918
919 # Remove any namespace if we are being told to ignore them
920 if($ignore_namespace) {
921 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
922 }
923 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
924 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
925
926 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
927 }
928
929 return undef; # was not found
930}
931
932# returns a list of the form [value1, value2, ...]
933sub get_metadata {
934 my $self = shift (@_);
935 my ($section, $field, $ignore_namespace) = @_;
936 my ($data);
937
938 $ignore_namespace = 0 unless defined $ignore_namespace;
939
940 my $section_ptr = $self->_lookup_section($section);
941 if (!defined $section_ptr) {
942 print STDERR "doc::get_metadata couldn't find section ",
943 $section, "\n";
944 return;
945 }
946
947 # Remove any namespace if we are being told to ignore them
948 if($ignore_namespace) {
949 $field =~ s/^.*\.//;
950 }
951
952 my @metadata = ();
953 foreach $data (@{$section_ptr->{'metadata'}}) {
954
955 my $data_name = $data->[0];
956
957 # Remove any namespace if we are being told to ignore them
958 if($ignore_namespace) {
959 $data_name =~ s/^.*\.//;
960 }
961 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
962 $data_name =~ s/^ex\.([^.]+)$/$1/;
963
964 push (@metadata, $data->[1]) if ($data_name eq $field);
965 }
966
967 return \@metadata;
968}
969
970sub get_metadata_hashmap {
971 my $self = shift (@_);
972 my ($section, $opt_namespace) = @_;
973
974 my $section_ptr = $self->_lookup_section($section);
975 if (!defined $section_ptr) {
976 print STDERR "doc::get_metadata couldn't find section ",
977 $section, "\n";
978 return;
979 }
980
981 my $metadata_hashmap = {};
982 foreach my $data (@{$section_ptr->{'metadata'}}) {
983 my $metaname = $data->[0];
984
985 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
986 if (!defined $metadata_hashmap->{$metaname}) {
987 $metadata_hashmap->{$metaname} = [];
988 }
989 my $metaval_list = $metadata_hashmap->{$metaname};
990 push(@$metaval_list, $data->[1]);
991 }
992 }
993
994 return $metadata_hashmap;
995}
996
997# returns a list of the form [[field,value],[field,value],...]
998sub get_all_metadata {
999 my $self = shift (@_);
1000 my ($section) = @_;
1001
1002 my $section_ptr = $self->_lookup_section($section);
1003 if (!defined $section_ptr) {
1004 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
1005 return;
1006 }
1007
1008 return $section_ptr->{'metadata'};
1009}
1010
1011# $value is optional
1012sub delete_metadata {
1013 my $self = shift (@_);
1014 my ($section, $field, $value) = @_;
1015
1016 my $section_ptr = $self->_lookup_section($section);
1017 if (!defined $section_ptr) {
1018 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
1019 return;
1020 }
1021
1022 my $i = 0;
1023 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1024 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1025 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1026 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1027 } else {
1028 $i++;
1029 }
1030 }
1031}
1032
1033sub delete_all_metadata {
1034 my $self = shift (@_);
1035 my ($section) = @_;
1036
1037 my $section_ptr = $self->_lookup_section($section);
1038 if (!defined $section_ptr) {
1039 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1040 return;
1041 }
1042
1043 $section_ptr->{'metadata'} = [];
1044}
1045
1046sub set_metadata_element {
1047 my $self = shift (@_);
1048 my ($section, $field, $value) = @_;
1049
1050 $self->set_utf8_metadata_element ($section, $field,
1051 &unicode::ascii2utf8(\$value));
1052}
1053
1054# set_utf8_metadata_element assumes the text has already been
1055# converted to the UTF-8 encoding.
1056sub set_utf8_metadata_element {
1057 my $self = shift (@_);
1058 my ($section, $field, $value) = @_;
1059
1060 $self->delete_metadata ($section, $field);
1061 $self->add_utf8_metadata ($section, $field, $value);
1062}
1063
1064
1065# add_metadata assumes the text is in (extended) ascii form. For
1066# text which has already been converted to the UTF-8 format use
1067# add_utf8_metadata.
1068sub add_metadata {
1069 my $self = shift (@_);
1070 my ($section, $field, $value) = @_;
1071
1072 $self->add_utf8_metadata ($section, $field,
1073 &unicode::ascii2utf8(\$value));
1074}
1075
1076sub add_utf8_metadata {
1077 my $self = shift (@_);
1078 my ($section, $field, $value) = @_;
1079
1080 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1081 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1082 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1083
1084 my $section_ptr = $self->_lookup_section($section);
1085 if (!defined $section_ptr) {
1086 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1087 return;
1088 }
1089 if (!defined $value) {
1090 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1091 return;
1092 }
1093 if (!defined $field) {
1094 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1095 return;
1096 }
1097
1098 #print STDERR "###$field=$value\n";
1099
1100 # For now, supress this check. Given that text data read in is now
1101 # Unicode aware, then the following block of code can (ironically enough)
1102 # cause our unicode compliant string to be re-encoded (leading to
1103 # a double-encoded UTF-8 string, which we definitely don't want!).
1104
1105
1106 # double check that the value is utf-8
1107 # if (!&unicode::check_is_utf8($value)) {
1108 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1109 # &unicode::ensure_utf8(\$value);
1110 # print STDERR " Tried converting to utf8: $value\n";
1111 # }
1112
1113
1114 # for every GPS.mapOverlay/sectionID (but sadly not for every doc, let alone every collection),
1115 # we use a map to ensure uniqueness of CoordShorts for optimisation purposes
1116 # That way we don't end up with the same CoordShort values a large number of times in the index
1117 # We only get any savings with GPS.mapOverlay when using this map, not for "Latitude" and "Longitude" metadata
1118 my $unique_coordshort_map = {};
1119
1120
1121 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1122 if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1123 {
1124 my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1125 if (defined $mdprefix) {
1126 # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1127 push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1128 }
1129
1130 my $direction;
1131 if($value =~ m/^-/)
1132 {
1133 $direction = ($field eq "Latitude") ? "S" : "W";
1134 }
1135 else
1136 {
1137 $direction = ($field eq "Latitude") ? "N" : "E";
1138 }
1139
1140 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1141 if(defined $beforeDec && defined $afterDec)
1142 {
1143 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1144 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1145
1146 for(my $i = 2; $i <= 4; $i++)
1147 {
1148 if(length($afterDec) >= $i)
1149 {
1150 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1151 }
1152 }
1153
1154 #Only add the metadata if it has not already been added
1155 my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no side-effects.)
1156 }
1157
1158 if($field =~ m/^(.+\.)?Longitude$/) {
1159 # if we are dealing with Longitude meta, we should 1. have Latitude meta too; 2. already have processed Latitude meta
1160 # in that case, add both Lat and Lng of this section as a Coordinate meta
1161 my $latitude = $self->get_metadata_element ($section, "Latitude");
1162 # TODO: would like all Longitude info together followed by all Coordinate info, but the following will add all coord info meta and end of this function will add Longitude meta
1163 $self->processCoordinate($section, $latitude, $value, $unique_coordshort_map); # value is Longitude
1164 }
1165 }
1166
1167 elsif($field eq "GPS.mapOverlay") { # then the value is a JSON string
1168
1169 # TODO: How can we prevent GPS.mapOverlay from being processed 4 or so times by buildcol.pl??
1170 # Once for each pass (dummy, sidx, didx for lucene, info db pass)
1171
1172 if($cmd_line_mode eq "buildcol") {
1173 #my $metaMap = $self->get_metadata_hashmap($section); ## TODO: Check if necessary to avoid duplication of <Coordinate> meta in index\text\<coll>.jdb
1174
1175 #if(!$metaMap->{'Coordinate'}) {
1176 #print STDERR "@@@@@@@@@@@@@@ cmd line mode (build phase) is now: $doc::cmd_line_mode\n";
1177
1178
1179 print STDERR "GPS.mapOverlay before decode, val: " . $value . "\n";
1180
1181 # TODO html decode?
1182 $value =~ s@&#091;@[@g;
1183 $value =~ s@&#093;@]@g;
1184 $value =~ s@&quot;@"@g;
1185 print STDERR "GPS.mapOverlay after decode, val: " . $value . "\n";
1186
1187 my $json_array = decode_json $value;
1188 #my $json = JSON->new->allow_nonref;
1189 #&printAllShapes($json, $json_array);
1190
1191 foreach my $shape (@$json_array) {
1192
1193 my $type = $shape->{"type"};
1194 print STDERR "Type : " . $type . "\n";
1195
1196 if($type eq "circle") {
1197 #print STDERR "Found a circle:\n" . &printShape($json, $shape);
1198
1199 # Work out bounding box
1200 # want the inverse of this useful page:
1201 # https://stackoverflow.com/questions/639695/how-to-convert-latitude-or-longitude-to-meters
1202 # https://www.geodatasource.com/developers/javascript
1203
1204
1205 # for now, just process the circle centre
1206 #my $centre = $shape->{"center"};
1207 #$self->processLatOrLng($section_ptr, "Latitude", $centre->{"lat"});
1208 #$self->processLatOrLng($section_ptr, "Longitude", $centre->{"lng"});
1209
1210
1211 # Dr Bainbridge wants us to use: https://gis.stackexchange.com/questions/5821/calculating-latitude-longitude-x-miles-from-point
1212 # But we're using the rule of thumb here, since for N,E,S,W it works out the same:
1213 # https://gis.stackexchange.com/questions/2951/algorithm-for-offsetting-a-latitude-longitude-by-some-amount-of-meters
1214 # which states
1215 # "If your displacements aren't too great (less than a few kilometers) and you're not right at the poles,
1216 # use the quick and dirty estimate that 111,111 meters (111.111 km) in the y direction is 1 degree
1217 # (of latitude) and 111,111 * cos(latitude) meters in the x direction is 1 degree (of longitude)."
1218 my $centre_lat = $shape->{"center"}->{"lat"};
1219 my $centre_lng = $shape->{"center"}->{"lng"};
1220 my $radius = $shape->{"radius"}; # in metres!
1221
1222 print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
1223
1224 my $lat_north = $centre_lat + ($radius/111111);
1225 my $lat_south = $centre_lat - ($radius/111111);
1226
1227 print STDERR "### lat_north: $lat_north\n";
1228 print STDERR "### lat_south: $lat_south\n";
1229
1230 # our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
1231 # all expect their angle to be in radians. So need to convert from degree to radians before we can take the cose of it.
1232 my $centre_lat_radians = $self->degreesToRadians($centre_lat);
1233 my $cos_in_radians = cos($centre_lat_radians);
1234 print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
1235 my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
1236 my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
1237 print STDERR "### lng_east $lng_east\n";
1238 print STDERR "### lng_west $lng_west\n";
1239
1240
1241
1242 my $cos_lat = cos($centre_lat);
1243 print STDERR "cos $centre_lat is $cos_lat\n";
1244
1245 $self->processCoordinate($section, $lat_north, $lng_east, $unique_coordshort_map);
1246 $self->processCoordinate($section, $lat_south, $lng_east, $unique_coordshort_map);
1247 $self->processCoordinate($section, $lat_south, $lng_west, $unique_coordshort_map);
1248 $self->processCoordinate($section, $lat_north, $lng_west, $unique_coordshort_map);
1249
1250 }
1251 elsif ($type eq "marker") {
1252 print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
1253 print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
1254 $self->processCoordinate($section, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"}, $unique_coordshort_map);
1255 }
1256 elsif ($type eq "polyline" || $type eq "polygon") {
1257 my $path_array = $shape->{"path"};
1258 foreach my $position (@$path_array) {
1259 $self->processCoordinate($section, $position->{"lat"}, $position->{"lng"}, $unique_coordshort_map);
1260 }
1261 }
1262 elsif ($type eq "rectangle") {
1263
1264 my $bounds = $shape->{"bounds"};
1265 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"east"}, $unique_coordshort_map);
1266 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"east"}, $unique_coordshort_map);
1267 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"west"}, $unique_coordshort_map);
1268 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"west"}, $unique_coordshort_map);
1269 }
1270
1271 } # end for on each shape in GPS.mapOverlay
1272 #}
1273 }
1274
1275 # We haven't yet written out any Coordshort meta! Write those out now,
1276 # since we've now finally ensured they're all unique CoordShort meta for this GPS.mapOverlay at least.
1277 foreach my $coordshort (keys %$unique_coordshort_map) {
1278
1279 #print STDERR "@@@@ creating new coordshort: $coordshort\n";
1280 push (@{$section_ptr->{'metadata'}}, ["CoordShort", $coordshort ]);
1281 }
1282
1283 } # end GPS.mapOverlay meta
1284
1285 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1286}
1287
1288# https://en.wikipedia.org/wiki/Radian
1289sub degreesToRadians
1290{
1291 my $self = shift (@_);
1292 my ($degrees) = @_;
1293
1294 return $degrees * pi /180; # returns radians
1295}
1296
1297sub radiansToDegrees
1298{
1299 my $self = shift (@_);
1300 my ($radians) = @_;
1301
1302 return $radians * 180 / pi; # returns degrees
1303}
1304
1305sub printAllShapes {
1306 my ($json, $json_array) = @_;
1307
1308
1309 #my $pretty_print_shape = $json->pretty->encode( $json_array->[0] );
1310 foreach my $shape (@$json_array) {
1311 my $pretty_print_shape = $json->pretty->encode( $shape );
1312 print STDERR "Shape: $pretty_print_shape\n";
1313 #&printShape($shape);
1314 }
1315
1316}
1317
1318# Beware: this method now ONLY writes out the Coordinate meta, but not the CoordShorts
1319# At method's end, the CoordShorts are stored in the $unique_coordshort_map (for optimisation),
1320# ready to be added to the doc section's meta once all Coordinates in a GPS.mapOverlay are done being processed.
1321sub processCoordinate {
1322 my $self = shift (@_);
1323 my ($section, $latitude, $longitude, $unique_coordshort_map) = @_;
1324
1325 my $section_ptr = $self->_lookup_section($section);
1326
1327 my $lat_direction = ($latitude =~ m/^-/) ? "S" : "N";
1328 my $lng_direction = ($longitude =~ m/^-/) ? "W" : "E";
1329
1330 # have to store (lat, lng) in pairs, when there are so many coords to store
1331 #push (@{$section_ptr->{'metadata'}}, ["Latitude", $latitude]);
1332 #push (@{$section_ptr->{'metadata'}}, ["Longitude", $longitude]);
1333
1334 push (@{$section_ptr->{'metadata'}}, ["Coordinate", "$latitude $longitude"]); # "$latitude$lat_direction $longitude$lng_direction"
1335
1336 my ($latBeforeDec, $latAfterDec);
1337 my ($lngBeforeDec, $lngAfterDec);
1338
1339 if($latitude !~ m/\./) {
1340 $latBeforeDec = $latitude;
1341 $latAfterDec = "";
1342 } else {
1343 ($latBeforeDec, $latAfterDec) = ($latitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1344 }
1345 if($longitude !~ m/\./) {
1346 $lngBeforeDec = $longitude;
1347 $lngAfterDec = "";
1348 } else {
1349 ($lngBeforeDec, $lngAfterDec) = ($longitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1350 }
1351
1352 # We have to deal with 0 decimal places for either lat or lng, or no dec places for only one of them
1353 # And we have to store all combinations of lats and lngs as CoordShort.
1354
1355 #my $name = "CoordShort";
1356 #push (@{$section_ptr->{'metadata'}}, [$name, "$latBeforeDec$lat_direction $lngBeforeDec$lng_direction"]);
1357
1358 my @lat_array = ("$latBeforeDec$lat_direction");
1359 my @lng_array = ("$lngBeforeDec$lng_direction");
1360
1361 for(my $i = 2; $i <= 4; $i++)
1362 {
1363 my $latDecPlaces = (length($latAfterDec) >= $i) ? substr($latAfterDec, 0, $i) : "";
1364 my $lngDecPlaces = (length($lngAfterDec) >= $i) ? substr($lngAfterDec, 0, $i) : "";
1365
1366 push(@lat_array, $latBeforeDec . $lat_direction. $latDecPlaces) unless $latDecPlaces eq "";
1367 push(@lng_array, $lngBeforeDec . $lng_direction. $lngDecPlaces) unless $lngDecPlaces eq "";
1368
1369 #push (@{$section_ptr->{'metadata'}}, [$name,
1370 # $latBeforeDec . $lat_direction. $latDecPlaces . " " . $lngBeforeDec . $lng_direction. $lngDecPlaces]);
1371
1372 }
1373
1374 for(my $i = 0; $i < scalar(@lat_array); $i++) {
1375 for(my $j = 0; $j < scalar(@lng_array); $j++) {
1376 #push (@{$section_ptr->{'metadata'}}, [$name, $lat_array[$i] . " " . $lng_array[$j] ]);
1377 my $coordshort = $lat_array[$i] . " " . $lng_array[$j];
1378 $unique_coordshort_map->{$coordshort} = "1"; # just store it
1379
1380 #print STDERR "@@@@ pushed new value: " . $coordshort . " into map\n";
1381
1382 }
1383 }
1384
1385
1386
1387 #Only add the metadata if it has not already been added
1388 #my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no side-effects.)
1389}
1390
1391
1392# methods for dealing with text
1393
1394# returns the text for a section
1395sub get_text {
1396 my $self = shift (@_);
1397 my ($section) = @_;
1398
1399 my $section_ptr = $self->_lookup_section($section);
1400 if (!defined $section_ptr) {
1401 print STDERR "doc::get_text couldn't find section " .
1402 "$section\n";
1403 return "";
1404 }
1405
1406 return $section_ptr->{'text'};
1407}
1408
1409# returns the (utf-8 encoded) length of the text for a section
1410sub get_text_length {
1411 my $self = shift (@_);
1412 my ($section) = @_;
1413
1414 my $section_ptr = $self->_lookup_section($section);
1415 if (!defined $section_ptr) {
1416 print STDERR "doc::get_text_length couldn't find section " .
1417 "$section\n";
1418 return 0;
1419 }
1420
1421 return length ($section_ptr->{'text'});
1422}
1423
1424# returns the total length for all the sections
1425sub get_total_text_length {
1426 my $self = shift (@_);
1427
1428 my $section = $self->get_top_section();
1429 my $length = 0;
1430 while (defined $section) {
1431 $length += $self->get_text_length($section);
1432 $section = $self->get_next_section($section);
1433 }
1434 return $length;
1435}
1436
1437sub delete_text {
1438 my $self = shift (@_);
1439 my ($section) = @_;
1440
1441 my $section_ptr = $self->_lookup_section($section);
1442 if (!defined $section_ptr) {
1443 print STDERR "doc::delete_text couldn't find section " .
1444 "$section\n";
1445 return;
1446 }
1447
1448 $section_ptr->{'text'} = "";
1449}
1450
1451# add_text assumes the text is in (extended) ascii form. For
1452# text which has been already converted to the UTF-8 format
1453# use add_utf8_text.
1454sub add_text {
1455 my $self = shift (@_);
1456 my ($section, $text) = @_;
1457
1458 # convert the text to UTF-8 encoded unicode characters
1459 # and add the text
1460 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1461}
1462
1463
1464# add_utf8_text assumes the text to be added has already
1465# been converted to the UTF-8 encoding. For ascii text use
1466# add_text
1467# Pass by value version (internally calls pass by ref version
1468# to avoid code duplication)
1469sub add_utf8_text {
1470 my $self = shift (@_);
1471 my ($section, $text) = @_;
1472
1473 $self->add_utf8_textref($section, \$text);
1474}
1475
1476# Pass by reference version, used by GreenstoneSQLPlugin for fulltext
1477sub add_utf8_textref {
1478 my $self = shift (@_);
1479 my ($section, $text_ref) = @_;
1480
1481 my $section_ptr = $self->_lookup_section($section);
1482 if (!defined $section_ptr) {
1483 print STDERR "doc::add_utf8_textref couldn't find section " .
1484 "$section\n";
1485 return;
1486 }
1487
1488 $section_ptr->{'text'} .= $$text_ref;
1489}
1490
1491# returns the Source meta, which is the utf8 filename generated.
1492# Added a separate method here for convenience
1493sub get_source {
1494 my $self = shift (@_);
1495 return $self->get_metadata_element ($self->get_top_section(), "Source");
1496}
1497
1498# returns the SourceFile meta, which is the url reference to the URL-encoded
1499# version of Source (the utf8 filename). Added a separate method here for convenience
1500sub get_sourcefile {
1501 my $self = shift (@_);
1502 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1503}
1504
1505# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1506# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1507sub get_assocfile_from_sourcefile {
1508 my $self = shift (@_);
1509
1510 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1511 my $top_section = $self->get_top_section();
1512 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1513
1514 # get the actual filename as it exists on the filesystem which this url refers to
1515 $source_file = &unicode::url_to_filename($source_file);
1516 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1517 return $assocfilename;
1518}
1519
1520# methods for dealing with associated files
1521
1522# a file is associated with a document, NOT a section.
1523# if section is defined it is noted in the data structure
1524# only so that files associated from a particular section
1525# may be removed later (using delete_section_assoc_files)
1526sub associate_file {
1527 my $self = shift (@_);
1528 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1529 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1530
1531 # remove all associated files with the same name
1532 $self->delete_assoc_file ($assoc_filename);
1533
1534 # Too harsh a requirement
1535 # Definitely get HTML docs, for example, with some missing
1536 # support files
1537# if (!&util::fd_exists($real_filename)) {
1538# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1539# exit -1;
1540# }
1541
1542# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1543# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1544## my $utf8_filename = Encode::encode("utf8",$filename);
1545
1546 push (@{$self->{'associated_files'}},
1547 [$real_filename, $assoc_filename, $mime_type, $section]);
1548}
1549
1550# returns a list of associated files in the form
1551# [[real_filename, assoc_filename, mimetype], ...]
1552sub get_assoc_files {
1553 my $self = shift (@_);
1554
1555 return $self->{'associated_files'};
1556}
1557
1558# the following two methods used to keep track of original associated files
1559# for incremental building. eg a txt file used by an item file does not end
1560# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1561# file for incremental build
1562sub associate_source_file {
1563 my $self = shift (@_);
1564 my ($full_filename) = @_;
1565
1566 push (@{$self->{'source_assoc_files'}}, $full_filename);
1567
1568}
1569
1570sub get_source_assoc_files {
1571 my $self = shift (@_);
1572
1573 return $self->{'source_assoc_files'};
1574
1575
1576}
1577sub metadata_file {
1578 my $self = shift (@_);
1579 my ($real_filename, $filename) = @_;
1580
1581 push (@{$self->{'metadata_files'}},
1582 [$real_filename, $filename]);
1583}
1584
1585# used for writing out the archiveinf-doc info database, to list all the metadata files
1586sub get_meta_files {
1587 my $self = shift (@_);
1588
1589 return $self->{'metadata_files'};
1590}
1591
1592sub delete_section_assoc_files {
1593 my $self = shift (@_);
1594 my ($section) = @_;
1595
1596 my $i=0;
1597 while ($i < scalar (@{$self->{'associated_files'}})) {
1598 if (defined $self->{'associated_files'}->[$i]->[3] &&
1599 $self->{'associated_files'}->[$i]->[3] eq $section) {
1600 splice (@{$self->{'associated_files'}}, $i, 1);
1601 } else {
1602 $i++;
1603 }
1604 }
1605}
1606
1607sub delete_assoc_file {
1608 my $self = shift (@_);
1609 my ($assoc_filename) = @_;
1610
1611 my $i=0;
1612 while ($i < scalar (@{$self->{'associated_files'}})) {
1613 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1614 splice (@{$self->{'associated_files'}}, $i, 1);
1615 } else {
1616 $i++;
1617 }
1618 }
1619}
1620
1621sub reset_nextsection_ptr {
1622 my $self = shift (@_);
1623 my ($section) = @_;
1624
1625 my $section_ptr = $self->_lookup_section($section);
1626 $section_ptr->{'next_subsection'} = 1;
1627}
1628
16291;
Note: See TracBrowser for help on using the repository browser.