source: main/trunk/greenstone2/perllib/doc.pm@ 33313

Last change on this file since 33313 was 33313, checked in by ak19, 5 years ago

Minor. Changes to comment

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 50.1 KB
RevLine 
[537]1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
[8894]10# This program is free software; you can redistr te it and/or modify
[537]11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[1241]26# base class to hold documents
[4]27
28package doc;
[3834]29eval {require bytes};
[4]30
[832]31BEGIN {
[1241]32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
[832]34}
35
[15894]36use strict;
[1241]37use unicode;
38use util;
[27306]39use FileUtils;
[1241]40use ghtml;
[8220]41use File::stat;
[1241]42##use hashdoc;
[13172]43use docprint;
[33125]44use JSON;
[1241]45
[33125]46# We just need pi from the Trig lib
[33293]47# Import constants pi2, pip2, pip4 (2*pi, pi/2, pi/4).
[33125]48use Math::Trig ':pi';
49
[4]50# the document type may be indexed_doc, nonindexed_doc, or
51# classification
52
[18528]53our $OIDcount = 0;
[2327]54
[33171]55
56# Declare the Package (level) Variable that will tell us whether we're in the import or buildcol phase.
57# This var will be set by import.pl, buildcol.pl and export.pl to the then appropriate value.
58# https://perlmaven.com/package-variables-and-lexical-variables-in-perl
59# But: https://stackoverflow.com/questions/7542120/perl-how-to-make-variables-from-requiring-script-available-in-required-script
60# https://stackoverflow.com/questions/33230015/how-do-i-use-a-variable-in-another-perl-script
61#$doc::cmd_line_mode = undef;
62# Decided to follow the usage of $OIDcount above to declare the necessary package-level variables 'globally' accessible
63# actually accessible only for those packages that import this one (with 'use doc')
[33302]64my $cmd_line_mode = undef;
[33171]65
[33302]66# processor_mode keeps track of which buildcol pass we're at: dummy, text (sidx/didx passes) or infodb
67my $processor_mode = undef;
68
[20412]69# rename_method can be 'url', 'none', 'base64'
[4]70sub new {
71 my $class = shift (@_);
[18319]72 my ($source_filename, $doc_type, $rename_method) = @_;
[1374]73
[13770]74
[1241]75 my $self = bless {'associated_files'=>[],
76 'subsection_order'=>[],
77 'next_subsection'=>1,
78 'subsections'=>{},
79 'metadata'=>[],
[2327]80 'text'=>"",
81 'OIDtype'=>"hash"}, $class;
[4]82
[10217]83 # used to set lastmodified here, but this can screw up the HASH ids, so
84 # the docsave processor now calls set_lastmodified
[13770]85
[23362]86 $self->set_source_path($source_filename);
[10217]87
[7929]88 if (defined $source_filename) {
[15874]89 $source_filename = &util::filename_within_collection($source_filename);
[18508]90 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
[18319]91 $self->set_source_filename ($source_filename, $rename_method);
[7929]92 }
93
[1374]94 $self->set_doc_type ($doc_type) if defined $doc_type;
[485]95
[4]96 return $self;
97}
[23362]98
99
100sub set_source_path
101{
102 my $self = shift @_;
103 my ($source_filename) = @_;
104
105 if (defined $source_filename) {
106 # On Windows the source_filename can be in terse DOS format
107 # e.g. test~1.txt
108
109 $self->{'terse_source_path'} = $source_filename;
110
[27306]111 # Use the FileUtil library methods as they are aware of more special
112 # cases such as HDFS [jmt12]
[27393]113 if (&FileUtils::fileExists($source_filename))
114 {
[23362]115 # See if we can do better for Windows with a filename
116 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
117 require Win32;
118 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
119 }
120 else {
121 # For Unix-based systems, there is no difference between the two
122 $self->{'source_path'} = $source_filename;
123 }
124 }
125 else {
126 print STDERR "Warning: In doc::set_source_path(), file\n";
127 print STDERR " $source_filename\n";
128 print STDERR " does not exist\n";
129
130 # (default) Set it to whatever we were given
131 $self->{'source_path'} = $source_filename;
132 }
133 }
134 else {
135 # Previous code for setting source_path allowed for
136 # it to be undefined, so continue this practice
137 $self->{'terse_source_path'} = undef;
138 $self->{'source_path'} = undef;
139 }
140}
141
142
143sub get_source_path
144{
145 my $self = shift @_;
146
147 return $self->{'terse_source_path'};
148}
149
[10217]150# set lastmodified for OAI purposes, added by GRB, moved by kjdon
[23939]151sub set_oailastmodified {
152 my $self = shift (@_);
153
154 my $source_path = $self->{'terse_source_path'};
155
156 if (defined $source_path && (-e $source_path)) {
157 my $current_time = time;
158
159 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
160 $wday, $yday, $isdst) = localtime($current_time);
161
162 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
163
164 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
165 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
166 }
167}
168
169# no longer used for OAI purposes, since lastmodified is not what we want as the
170# Datestamp of a document. This doc metadata may be useful for general purposes.
[10217]171sub set_lastmodified {
172 my $self = shift (@_);
[4]173
[23362]174 my $source_path = $self->{'terse_source_path'};
[13770]175
176 if (defined $source_path && (-e $source_path)) {
177
178 my $file_stat = stat($source_path);
[10217]179 my $mtime = $file_stat->mtime;
[21862]180 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
181 $wday, $yday, $isdst) = localtime($mtime);
182
183 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
184
[23923]185 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
[21862]186 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
[10217]187 }
188}
189
[1241]190# clone the $self object
191sub duplicate {
192 my $self = shift (@_);
193
194 my $newobj = {};
195
[8716]196 foreach my $k (keys %$self) {
[1241]197 $newobj->{$k} = &clone ($self->{$k});
198 }
199
200 bless $newobj, ref($self);
201 return $newobj;
202}
203
204sub clone {
205 my ($from) = @_;
206 my $type = ref ($from);
207
208 if ($type eq "HASH") {
209 my $to = {};
[8716]210 foreach my $key (keys %$from) {
[1241]211 $to->{$key} = &clone ($from->{$key});
212 }
213 return $to;
214 } elsif ($type eq "ARRAY") {
215 my $to = [];
[8716]216 foreach my $v (@$from) {
[1241]217 push (@$to, &clone ($v));
218 }
219 return $to;
220 } else {
221 return $from;
222 }
223}
224
[2327]225sub set_OIDtype {
226 my $self = shift (@_);
[12268]227 my ($type, $metadata) = @_;
[1241]228
[26536]229 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
[2327]230 $self->{'OIDtype'} = $type;
231 } else {
232 $self->{'OIDtype'} = "hash";
233 }
[16792]234
[12268]235 if ($type =~ /^assigned$/) {
236 if (defined $metadata) {
237 $self->{'OIDmetadata'} = $metadata;
238 } else {
239 $self->{'OIDmetadata'} = "dc.Identifier";
240 }
241 }
[2327]242}
243
[20412]244# rename_method can be 'url', 'none', 'base64'
[1241]245sub set_source_filename {
246 my $self = shift (@_);
[18319]247 my ($source_filename, $rename_method) = @_;
[1241]248
[16578]249 # Since the gsdlsourcefilename element goes into the doc.xml it has
250 # to be utf8. However, it should also *represent* the source filename
251 # (in the import directory) which may not be utf8 at all.
[18319]252 # For instance, if this meta element (gsdlsourcefilename) will be used
253 # by other applications that parse doc.xml in order to locate
254 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
255 # encode the real filename as this is a binary-to-text encoding meaning
256 # that the resulting string is ASCII (utf8). Decoding will give the original.
[16578]257
[18319]258# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
259
[16670]260 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
[23278]261 $source_filename = $self->encode_filename($source_filename, $rename_method);
262# my ($srcfilename,$dirname,$suffix)
263# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
[16792]264# print STDERR "-> $srcfilename -> ";
[23278]265# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
[27306]266# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
[16792]267# print STDERR "$source_filename\n";
[16670]268
[16578]269 $self->set_utf8_metadata_element ($self->get_top_section(),
[1241]270 "gsdlsourcefilename",
271 $source_filename);
272}
273
[23278]274sub encode_filename {
275 my $self = shift (@_);
276 my ($source_filename, $rename_method) = @_;
277
278 my ($srcfilename,$dirname,$suffix)
279 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
280# print STDERR "-> $srcfilename -> ";
281 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
[27306]282 $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
[23278]283
284 return $source_filename;
285}
286
[7569]287sub set_converted_filename {
288 my $self = shift (@_);
289 my ($converted_filename) = @_;
290
[16578]291 # we know the converted filename is utf8
292 $self->set_utf8_metadata_element ($self->get_top_section(),
[7569]293 "gsdlconvertedfilename",
294 $converted_filename);
295}
296
[1241]297# returns the source_filename as it was provided
[19829]298sub get_unmodified_source_filename {
299 my $self = shift (@_);
300
[23362]301 return $self->{'terse_source_path'};
[19829]302}
303
304# returns the source_filename with whatever rename_method was given
[1241]305sub get_source_filename {
306 my $self = shift (@_);
307
308 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
309}
310
[19829]311
312
[7569]313# returns converted filename if available else returns source filename
314sub get_filename_for_hashing {
315 my $self = shift (@_);
316
317 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
318
319 if (!defined $filename) {
[11097]320 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
[10980]321 # if NULPlug processed file, then don't give a filename
[11097]322 if (defined $plugin_name && $plugin_name eq "NULPlug") {
[10980]323 $filename = undef;
[16670]324 } else { # returns the URL encoded source filename!
[10980]325 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
326 }
[7569]327 }
[23485]328
[27306]329 if (!&FileUtils::isFilenameAbsolute($filename)) {
330 $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
[23485]331 }
332
[7569]333 return $filename;
334}
335
[1241]336sub set_doc_type {
337 my $self = shift (@_);
338 my ($doc_type) = @_;
339
340 $self->set_metadata_element ($self->get_top_section(),
341 "gsdldoctype",
342 $doc_type);
343}
344
[10217]345# returns the gsdldoctype as it was provided
[1241]346# the default of "indexed_doc" is used if no document
347# type was provided
348sub get_doc_type {
349 my $self = shift (@_);
350
351 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
352 return $doc_type if (defined $doc_type);
353 return "indexed_doc";
354}
355
356
357# look up the reference to the a particular section
358sub _lookup_section {
359 my $self = shift (@_);
360 my ($section) = @_;
361
362 my ($num);
363 my $sectionref = $self;
364
365 while (defined $section && $section ne "") {
[12327]366
[1241]367 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
[12327]368
369 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
370
[1241]371 $section = "" unless defined $section;
372
[12327]373
[1241]374 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
375 $sectionref = $sectionref->{'subsections'}->{$num};
376 } else {
377 return undef;
378 }
379 }
380
381 return $sectionref;
382}
383
[2327]384# calculate OID by hashing the contents of the document
[1241]385sub _calc_OID {
386 my $self = shift (@_);
387 my ($filename) = @_;
388
[22855]389
[1241]390 my $osexe = &util::get_os_exe();
391
[27306]392 my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
[1241]393 $ENV{'GSDLOS'},"hashfile$osexe");
[8504]394
[28560]395 &util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
396 # The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
397
[28575]398 # A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
399 # (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
400 # E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
401
[1241]402 my $result = "NULL";
[22855]403
[1679]404
[1241]405 if (-e "$hashfile_exe") {
[1679]406# $result = `\"$hashfile_exe\" \"$filename\"`;
[21862]407# $result = `hashfile$osexe \"$filename\" 2>&1`;
[1679]408 $result = `hashfile$osexe \"$filename\"`;
[21862]409
[1241]410 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
411 } else {
412 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
413 }
414 return "HASH$result";
415}
416
417# methods dealing with OID, not groups of them.
418
[2327]419# if $OID is not provided one is calculated
[1241]420sub set_OID {
421 my $self = shift (@_);
422 my ($OID) = @_;
[8504]423
[8797]424 my $use_hash_oid = 0;
[17057]425 # if an OID wasn't provided calculate one
[1241]426 if (!defined $OID) {
427 $OID = "NULL";
[17025]428 if ($self->{'OIDtype'} =~ /^hash/) {
[8797]429 $use_hash_oid = 1;
430 } elsif ($self->{'OIDtype'} eq "incremental") {
[2327]431 $OID = "D" . $OIDcount;
432 $OIDcount ++;
[26221]433 } elsif ($self->{'OIDtype'} eq "filename") {
434 my $filename = $self->get_source_filename();
435 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
436 $OID = &util::tidy_up_oid($OID);
[26536]437 } elsif ($self->{'OIDtype'} eq "full_filename") {
438 my $source_filename = $self->get_source_filename();
439 my $dirsep = &util::get_os_dirsep();
440
441 $source_filename =~ s/^import$dirsep//;
442 $source_filename =~ s/$dirsep/-/g;
443 $source_filename =~ s/\./_/g;
444
445 $OID = $source_filename;
446 $OID = &util::tidy_up_oid($OID);
[8716]447 } elsif ($self->{'OIDtype'} eq "dirname") {
448 my $filename = $self->get_source_filename();
[8797]449 if (defined($filename)) { # && -e $filename) {
[27350]450 # get the immediate parent directory
[8716]451 $OID = &File::Basename::dirname($filename);
452 if (defined $OID) {
[27350]453 $OID = &File::Basename::basename($OID);
[19617]454 $OID = &util::tidy_up_oid($OID);
[8716]455 } else {
[8797]456 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
457 $use_hash_oid = 1;
[8716]458 }
459 } else {
[27350]460 print STDERR "Failed to find a filename, generating hash id\n";
[8797]461 $use_hash_oid = 1;
[8716]462 }
[2327]463
[8797]464 } elsif ($self->{'OIDtype'} eq "assigned") {
[12268]465 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
[8797]466 if (defined $identifier && $identifier ne "") {
[17025]467 $OID = $identifier;
[19617]468 $OID = &util::tidy_up_oid($OID);
[8797]469 } else {
470 # need a hash id
[12268]471 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
[8797]472 $use_hash_oid = 1;
473 }
474
[2327]475 } else {
[8797]476 $use_hash_oid = 1;
477 }
478
479 if ($use_hash_oid) {
[23562]480 my $hash_on_file = 1;
[26536]481 my $hash_on_ga_xml = 0;
482
[17025]483 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
484 $hash_on_file = 0;
[26536]485 $hash_on_ga_xml = 1;
[17025]486 }
[26536]487
488 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
489 $hash_on_file = 0;
490 $hash_on_ga_xml = 0;
491
492 my $source_filename = $self->get_source_filename();
493 my $dirsep = &util::get_os_dirsep();
494
495 $source_filename =~ s/^import$dirsep//;
496 $source_filename =~ s/$dirsep/-/g;
497 $source_filename =~ s/\./_/g;
498
499 # If the filename is very short then (handled naively)
500 # this can cause conjestion in the hash-values
501 # computed, leading documents sharing the same leading
502 # Hex values in the computed has.
503 #
504 # The solution taken here is to replace the name of
505 # the file name a sufficient number of times (up to
506 # the character limit defined in 'rep_limit' and
507 # make that the content that is hashed on
508
509 # *** Think twice before changing the following value
510 # as it will break backward compatability of computed
511 # document HASH values
512
513 my $rep_limit = 256;
514 my $hash_content = undef;
515
516 if (length($source_filename)<$rep_limit) {
517 my $rep_string = "$source_filename|";
518 my $rs_len = length($rep_string);
519
520 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
521
522 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
523 }
524 else {
525 $hash_content = $source_filename;
526 }
527
528 my $filename = &util::get_tmp_filename();
529 if (!open (OUTFILE, ">:utf8", $filename)) {
530 print STDERR "doc::set_OID could not write to $filename\n";
531 } else {
532 print OUTFILE $hash_content;
533 close (OUTFILE);
534 }
535 $OID = $self->_calc_OID ($filename);
536
[27538]537 &FileUtils::removeFiles ($filename);
[26536]538 }
539
[17025]540 if ($hash_on_file) {
541 # "hash" OID - feed file to hashfile.exe
542 my $filename = $self->get_filename_for_hashing();
[23562]543
[17025]544 # -z: don't want to hash on the file if it is zero size
545 if (defined($filename) && -e $filename && !-z $filename) {
546 $OID = $self->_calc_OID ($filename);
547 } else {
[26536]548 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
[17025]549 }
550 }
[26536]551
552 if ($hash_on_ga_xml) {
553 # In addition being asked to explicity calculate the has based on the GA file,
554 # can also end up coming into this block is doing 'hash_on_file' but the file
555 # itself is of zero bytes (as could be the case with 'doc.nul' file
556
[17025]557 my $filename = &util::get_tmp_filename();
[22855]558 if (!open (OUTFILE, ">:utf8", $filename)) {
[2327]559 print STDERR "doc::set_OID could not write to $filename\n";
560 } else {
[32512]561 my $doc_text = &docprint::get_section_xml($self);
[13172]562 print OUTFILE $doc_text;
[2327]563 close (OUTFILE);
564 }
565 $OID = $self->_calc_OID ($filename);
[27393]566 &FileUtils::removeFiles($filename);
[1374]567 }
[1241]568 }
569 }
570 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
571}
572
573# this uses hashdoc (embedded c thingy) which is faster but still
[32535]574# needs a little work to be sufficiently stable
[1241]575sub ___set_OID {
576 my $self = shift (@_);
577 my ($OID) = @_;
578
579 # if an OID wasn't provided then calculate hash value based on document
580 if (!defined $OID)
581 {
[32512]582 my $hash_text = &docprint::get_section_xml($self);
[1241]583 my $hash_len = length($hash_text);
584
585 $OID = &hashdoc::buffer($hash_text,$hash_len);
586 }
587
588 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
589}
590
591# returns the OID for this document
592sub get_OID {
593 my $self = shift (@_);
594 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
595 return $OID if (defined $OID);
596 return "NULL";
597}
598
599sub delete_OID {
600 my $self = shift (@_);
601
602 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
603}
604
605
606# methods for manipulating section names
607
608# returns the name of the top-most section (the top
609# level of the document
610sub get_top_section {
611 my $self = shift (@_);
612
613 return "";
614}
615
616# returns a section
617sub get_parent_section {
618 my $self = shift (@_);
619 my ($section) = @_;
620
621 $section =~ s/(^|\.)\d+$//;
622
623 return $section;
624}
625
626# returns the first child section (or the end child
627# if there isn't any)
628sub get_begin_child {
629 my $self = shift (@_);
630 my ($section) = @_;
631
632 my $section_ptr = $self->_lookup_section($section);
633 return "" unless defined $section_ptr;
634
635 if (defined $section_ptr->{'subsection_order'}->[0]) {
636 return "$section.$section_ptr->{'subsection_order'}->[0]";
637 }
638
639 return $self->get_end_child ($section);
640}
641
642# returns the next child of a parent section
643sub get_next_child {
644 my $self = shift (@_);
645 my ($section) = @_;
646
647 my $parent_section = $self->get_parent_section($section);
648 my $parent_section_ptr = $self->_lookup_section($parent_section);
649 return undef unless defined $parent_section_ptr;
650
651 my ($section_num) = $section =~ /(\d+)$/;
652 return undef unless defined $section_num;
653
654 my $i = 0;
655 my $section_order = $parent_section_ptr->{'subsection_order'};
656 while ($i < scalar(@$section_order)) {
657 last if $section_order->[$i] eq $section_num;
658 $i++;
659 }
660
661 $i++; # the next child
662 if ($i < scalar(@$section_order)) {
663 return $section_order->[$i] if $parent_section eq "";
664 return "$parent_section.$section_order->[$i]";
665 }
666
667 # no more sections in this level
668 return undef;
669}
670
671# returns a reference to a list of children
672sub get_children {
673 my $self = shift (@_);
674 my ($section) = @_;
675
676 my $section_ptr = $self->_lookup_section($section);
677 return [] unless defined $section_ptr;
678
679 my @children = @{$section_ptr->{'subsection_order'}};
680
681 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
682 return \@children;
683}
684
685# returns the child section one past the last one (which
686# is coded as "0")
687sub get_end_child {
688 my $self = shift (@_);
689 my ($section) = @_;
690
691 return $section . ".0" unless $section eq "";
692 return "0";
693}
694
695# returns the next section in book order
696sub get_next_section {
697 my $self = shift (@_);
698 my ($section) = @_;
699
700 return undef unless defined $section;
701
702 my $section_ptr = $self->_lookup_section($section);
703 return undef unless defined $section_ptr;
704
705 # first try to find first child
706 if (defined $section_ptr->{'subsection_order'}->[0]) {
707 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
708 return "$section.$section_ptr->{'subsection_order'}->[0]";
709 }
710
711 do {
712 # try to find sibling
713 my $next_child = $self->get_next_child ($section);
714 return $next_child if (defined $next_child);
715
716 # move up one level
717 $section = $self->get_parent_section ($section);
718 } while $section =~ /\d/;
719
720 return undef;
721}
722
723sub is_leaf_section {
724 my $self = shift (@_);
725 my ($section) = @_;
726
727 my $section_ptr = $self->_lookup_section($section);
728 return 1 unless defined $section_ptr;
729
730 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
731}
732
733# methods for dealing with sections
734
735# returns the name of the inserted section
736sub insert_section {
737 my $self = shift (@_);
738 my ($before_section) = @_;
739
740 # get the child to insert before and its parent section
741 my $parent_section = "";
742 my $before_child = "0";
743 my @before_section = split (/\./, $before_section);
744 if (scalar(@before_section) > 0) {
745 $before_child = pop (@before_section);
746 $parent_section = join (".", @before_section);
747 }
748
749 my $parent_section_ptr = $self->_lookup_section($parent_section);
750 if (!defined $parent_section_ptr) {
751 print STDERR "doc::insert_section couldn't find parent section " .
752 "$parent_section\n";
753 return;
754 }
755
756 # get the next section number
757 my $section_num = $parent_section_ptr->{'next_subsection'}++;
758
759 my $i = 0;
760 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
761 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
762 $i++;
763 }
764
765 # insert the section number into the order list
766 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
767
768 # add this section to the parent section
769 my $section_ptr = {'subsection_order'=>[],
770 'next_subsection'=>1,
771 'subsections'=>{},
772 'metadata'=>[],
773 'text'=>""};
774 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
775
776 # work out the full section number
777 my $section = $parent_section;
778 $section .= "." unless $section eq "";
779 $section .= $section_num;
780
781 return $section;
782}
783
784# creates a pre-named section
785sub create_named_section {
786 my $self = shift (@_);
787 my ($mastersection) = @_;
788
789 my ($num);
790 my $section = $mastersection;
791 my $sectionref = $self;
792
793 while ($section ne "") {
794 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
795 $num =~ s/^0+(\d)/$1/; # remove leading 0s
796 $section = "" unless defined $section;
797
798 if (defined $num) {
799 if (!defined $sectionref->{'subsections'}->{$num}) {
800 push (@{$sectionref->{'subsection_order'}}, $num);
801 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
802 'next_subsection'=>1,
803 'subsections'=>{},
804 'metadata'=>[],
805 'text'=>""};
806 if ($num >= $sectionref->{'next_subsection'}) {
807 $sectionref->{'next_subsection'} = $num + 1;
808 }
809 }
810 $sectionref = $sectionref->{'subsections'}->{$num};
811
812 } else {
813 print STDERR "doc::create_named_section couldn't create section ";
814 print STDERR "$mastersection\n";
815 last;
816 }
817 }
818}
819
820# returns a reference to a list of subsections
821sub list_subsections {
822 my $self = shift (@_);
823 my ($section) = @_;
824
825 my $section_ptr = $self->_lookup_section ($section);
826 if (!defined $section_ptr) {
827 print STDERR "doc::list_subsections couldn't find section $section\n";
828 return [];
829 }
830
831 return [@{$section_ptr->{'subsection_order'}}];
832}
833
834sub delete_section {
835 my $self = shift (@_);
836 my ($section) = @_;
837
838# my $section_ptr = {'subsection_order'=>[],
839# 'next_subsection'=>1,
840# 'subsections'=>{},
841# 'metadata'=>[],
842# 'text'=>""};
843
844 # if this is the top section reset everything
845 if ($section eq "") {
846 $self->{'subsection_order'} = [];
847 $self->{'subsections'} = {};
848 $self->{'metadata'} = [];
849 $self->{'text'} = "";
850 return;
851 }
852
853 # find the parent of the section to delete
854 my $parent_section = "";
855 my $child = "0";
856 my @section = split (/\./, $section);
857 if (scalar(@section) > 0) {
858 $child = pop (@section);
859 $parent_section = join (".", @section);
860 }
861
862 my $parent_section_ptr = $self->_lookup_section($parent_section);
863 if (!defined $parent_section_ptr) {
864 print STDERR "doc::delete_section couldn't find parent section " .
865 "$parent_section\n";
866 return;
867 }
868
869 # remove this section from the subsection_order list
870 my $i = 0;
871 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
872 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
873 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
874 last;
875 }
876 $i++;
877 }
878
879 # remove this section from the subsection hash
880 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
881 undef $parent_section_ptr->{'subsections'}->{$child};
882 }
883}
884
885#--
[4]886# methods for dealing with metadata
887
888# set_metadata_element and get_metadata_element are for metadata
889# which should only have one value. add_meta_data and get_metadata
890# are for metadata which can have more than one value.
891
[1241]892# returns the first metadata value which matches field
[6111]893
894# This version of get metadata element works much like the one above,
895# except it allows for the namespace portion of a metadata element to
896# be ignored, thus if you are searching for dc.Title, the first piece
897# of matching metadata ending with the name Title (once any namespace
898# is removed) would be returned.
899# 28-11-2003 John Thompson
[1241]900sub get_metadata_element {
901 my $self = shift (@_);
[6111]902 my ($section, $field, $ignore_namespace) = @_;
[1241]903 my ($data);
904
[6111]905 $ignore_namespace = 0 unless defined $ignore_namespace;
906
[1241]907 my $section_ptr = $self->_lookup_section($section);
908 if (!defined $section_ptr) {
[8716]909 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
[1241]910 return;
911 }
912
[24404]913 # Remove any namespace if we are being told to ignore them
[6111]914 if($ignore_namespace) {
[24404]915 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
[6111]916 }
917
[1241]918 foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]919
920 my $data_name = $data->[0];
[14966]921
[24404]922 # Remove any namespace if we are being told to ignore them
[6111]923 if($ignore_namespace) {
[24404]924 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
[6111]925 }
[24404]926 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
927 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
928
[6111]929 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
[1241]930 }
931
932 return undef; # was not found
933}
934
935# returns a list of the form [value1, value2, ...]
936sub get_metadata {
937 my $self = shift (@_);
[6111]938 my ($section, $field, $ignore_namespace) = @_;
[1241]939 my ($data);
940
[6111]941 $ignore_namespace = 0 unless defined $ignore_namespace;
942
[1241]943 my $section_ptr = $self->_lookup_section($section);
944 if (!defined $section_ptr) {
[8716]945 print STDERR "doc::get_metadata couldn't find section ",
946 $section, "\n";
[1241]947 return;
948 }
949
[24404]950 # Remove any namespace if we are being told to ignore them
[6111]951 if($ignore_namespace) {
[24404]952 $field =~ s/^.*\.//;
[6111]953 }
954
[1241]955 my @metadata = ();
956 foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]957
958 my $data_name = $data->[0];
[24404]959
960 # Remove any namespace if we are being told to ignore them
[6111]961 if($ignore_namespace) {
[24404]962 $data_name =~ s/^.*\.//;
963 }
964 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
965 $data_name =~ s/^ex\.([^.]+)$/$1/;
966
[6111]967 push (@metadata, $data->[1]) if ($data_name eq $field);
[1241]968 }
[9241]969
[1241]970 return \@metadata;
971}
972
[23827]973sub get_metadata_hashmap {
974 my $self = shift (@_);
975 my ($section, $opt_namespace) = @_;
976
977 my $section_ptr = $self->_lookup_section($section);
978 if (!defined $section_ptr) {
979 print STDERR "doc::get_metadata couldn't find section ",
980 $section, "\n";
981 return;
982 }
983
984 my $metadata_hashmap = {};
985 foreach my $data (@{$section_ptr->{'metadata'}}) {
986 my $metaname = $data->[0];
987
988 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
989 if (!defined $metadata_hashmap->{$metaname}) {
990 $metadata_hashmap->{$metaname} = [];
991 }
992 my $metaval_list = $metadata_hashmap->{$metaname};
993 push(@$metaval_list, $data->[1]);
994 }
995 }
996
997 return $metadata_hashmap;
998}
999
[1241]1000# returns a list of the form [[field,value],[field,value],...]
1001sub get_all_metadata {
1002 my $self = shift (@_);
1003 my ($section) = @_;
1004
1005 my $section_ptr = $self->_lookup_section($section);
1006 if (!defined $section_ptr) {
[8716]1007 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
[1241]1008 return;
1009 }
1010
1011 return $section_ptr->{'metadata'};
1012}
1013
1014# $value is optional
1015sub delete_metadata {
1016 my $self = shift (@_);
1017 my ($section, $field, $value) = @_;
1018
1019 my $section_ptr = $self->_lookup_section($section);
1020 if (!defined $section_ptr) {
[23562]1021 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
[1241]1022 return;
1023 }
1024
1025 my $i = 0;
1026 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1027 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1028 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1029 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1030 } else {
1031 $i++;
1032 }
1033 }
1034}
1035
1036sub delete_all_metadata {
1037 my $self = shift (@_);
1038 my ($section) = @_;
1039
1040 my $section_ptr = $self->_lookup_section($section);
1041 if (!defined $section_ptr) {
[8716]1042 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
[1241]1043 return;
1044 }
1045
1046 $section_ptr->{'metadata'} = [];
1047}
1048
[4]1049sub set_metadata_element {
1050 my $self = shift (@_);
1051 my ($section, $field, $value) = @_;
1052
[97]1053 $self->set_utf8_metadata_element ($section, $field,
[1870]1054 &unicode::ascii2utf8(\$value));
[73]1055}
1056
1057# set_utf8_metadata_element assumes the text has already been
1058# converted to the UTF-8 encoding.
1059sub set_utf8_metadata_element {
1060 my $self = shift (@_);
1061 my ($section, $field, $value) = @_;
[25557]1062
[4]1063 $self->delete_metadata ($section, $field);
[73]1064 $self->add_utf8_metadata ($section, $field, $value);
[4]1065}
1066
1067
[73]1068# add_metadata assumes the text is in (extended) ascii form. For
[8220]1069# text which has already been converted to the UTF-8 format use
[73]1070# add_utf8_metadata.
[4]1071sub add_metadata {
1072 my $self = shift (@_);
1073 my ($section, $field, $value) = @_;
[25557]1074
[97]1075 $self->add_utf8_metadata ($section, $field,
[1870]1076 &unicode::ascii2utf8(\$value));
[73]1077}
1078
1079sub add_utf8_metadata {
1080 my $self = shift (@_);
1081 my ($section, $field, $value) = @_;
[25557]1082
[33171]1083 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
[25557]1084 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1085 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
[73]1086
[4]1087 my $section_ptr = $self->_lookup_section($section);
1088 if (!defined $section_ptr) {
[8716]1089 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
[4]1090 return;
1091 }
[1732]1092 if (!defined $value) {
1093 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1094 return;
1095 }
1096 if (!defined $field) {
1097 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1098 return;
1099 }
[4]1100
[8894]1101 #print STDERR "###$field=$value\n";
[22950]1102
1103 # For now, supress this check. Given that text data read in is now
1104 # Unicode aware, then the following block of code can (ironically enough)
1105 # cause our unicode compliant string to be re-encoded (leading to
1106 # a double-encoded UTF-8 string, which we definitely don't want!).
1107
1108
[7798]1109 # double check that the value is utf-8
[25557]1110 # if (!&unicode::check_is_utf8($value)) {
1111 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1112 # &unicode::ensure_utf8(\$value);
1113 # print STDERR " Tried converting to utf8: $value\n";
1114 # }
[7798]1115
[25557]1116 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
[26850]1117 if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
[25557]1118 {
[26850]1119 my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1120 if (defined $mdprefix) {
1121 # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1122 push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1123 }
1124
[25557]1125 my $direction;
1126 if($value =~ m/^-/)
1127 {
1128 $direction = ($field eq "Latitude") ? "S" : "W";
1129 }
1130 else
1131 {
1132 $direction = ($field eq "Latitude") ? "N" : "E";
1133 }
1134
1135 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1136 if(defined $beforeDec && defined $afterDec)
1137 {
1138 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1139 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1140
1141 for(my $i = 2; $i <= 4; $i++)
1142 {
1143 if(length($afterDec) >= $i)
1144 {
1145 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1146 }
1147 }
1148
1149 #Only add the metadata if it has not already been added
[33217]1150 my $metaMap = $self->get_metadata_hashmap($section); # metaMap not used and called function has no apparent side-effects,
1151 # but this line appears important for ensuring uniqueness of (Latitude, value) meta
1152 # in the section. Also for LatShort, Longitude, LngShort.
[25557]1153 }
[33126]1154
1155 if($field =~ m/^(.+\.)?Longitude$/) {
[33128]1156 # if we are dealing with Longitude meta, we should 1. have Latitude meta too; 2. already have processed Latitude meta
[33126]1157 # in that case, add both Lat and Lng of this section as a Coordinate meta
[33217]1158 my $latitude = $self->get_metadata_element ($section, "Latitude");
1159 my $longitude = $value;
1160 # TODO: would like all Longitude meta placed together, followed by all Coordinate meta,
1161 # but the following will add all such coord meta to the doc object and the end of this function will add Longitude meta
1162 $self->processCoordinate($section, $latitude, $longitude);
[33126]1163 }
[25557]1164 }
[33302]1165
1166 elsif($field eq "GPS.mapOverlay") { # then the $value is a JSON string
[25557]1167
[33313]1168 # In order to allow searching map-data-enriched documents by map shape descriptions,
[33302]1169 # and to run rawquery searches for other docs by proximity based on their map data,
1170 # need to store the shape descriptions and Coordinate info for shapes into the text index.
1171 # We add the description for each shape in the mapoverlay into the text index as GPSMapOverlayLabel
1172 # And we add Coordinate (CD) and CoordShort (CS) info for each shape in the mapoverlay in the format (Lat, Lng) as ("37S339 175E342")
1173 # where the digits before the N/S/E/W direction represents the whole number. And the digits after the direction are the
[33313]1174 # decimal places which can vary between 0 and 2 to 4 digits.
[33302]1175
[33313]1176 # However, we only want to process GPS.mapOverlay during buildcol and only in the text indexing passes (e.g. sidx and didx for lucene)
1177 # and certainly never during the infodb pass of buildcol. The latter can end up duplicating Coordinate/CoordShort/GPSMapOverlayLabel
1178 # when rebuilding with the online doc editor, as that runs incremental-rebuild which then calls basebuilder::reconstruct_doc_objs_metadata()
[33302]1179 # on all docs NOT being incrementally rebuilt. That call would get meta from the infodb and use it to reconstruct the doc objects of docs
[33313]1180 # NOT being incrementally built. If the Coord and Label meta were written to the infodb, they would then be loaded back in memory when the
1181 # collection is incrementally rebuilt for those docs that don't need incremental processing. Then this function would once again add the
1182 # same meta into the infodb, thus duplicating what goes into the infodb. Hence, don't do all the following if doc::processor_mode =~ "infodb",
1183 # or anything other than a text mode.
[33217]1184
[33302]1185 # Note that for incremental rebuilding, the text pass can be called textreindex for instance (and infodb pass can be incinfodb).
1186 # So don't check for exact string match
1187
[33312]1188 if($doc::cmd_line_mode eq "buildcol" && $doc::processor_mode =~ m/^text/) { # currently known text processor_modes:
1189 # text, textreindex, possibly textdelete (see ArchivesInfPlugin.pm::read() for last 2).
1190 # OR: ..."buildcol" && $doc::processor_mode !~ m/infodb$/) # if dummy pass important
[33171]1191
[33302]1192 ###print STDERR "GPS.mapOverlay before decoding, val = " . $value . "\n";
[33125]1193
[33302]1194 # TODO: html decode?
[33125]1195 $value =~ s@&#091;@[@g;
1196 $value =~ s@&#093;@]@g;
1197 $value =~ s@&quot;@"@g;
[33302]1198 ###print STDERR "GPS.mapOverlay after decoding, val = " . $value . "\n";
[33125]1199
[33293]1200 my $json_array = decode_json $value;
[33302]1201
1202 foreach my $shape (@$json_array) {
[33125]1203
[33302]1204 # Put each available shape description/label into this section's metadata with GPSMapOverlayLabel as metaname.
1205 # Just as for Coordinate meta, don't need to know which shape a label belongs too. This is just so each label
1206 # will be indexed, and therefore can be searched.
1207
1208 my $description = $shape->{"description"};
1209 if($description) {
1210 push (@{$section_ptr->{'metadata'}}, ["GPSMapOverlayLabel", $description]);
1211 ###print STDERR "@@@@############################################ Just added description meta: " . $description . "\n";
1212 }
1213
[33293]1214 my $type = $shape->{"type"};
[33302]1215 ###print STDERR "Shape type : " . $type . "\n";
[33293]1216
1217 if($type eq "circle") {
[33302]1218 ###print STDERR "Found a circle:\n" . &printShape($json, $shape);
[33293]1219
1220 # work out bounding box
1221 # want the inverse of this useful page:
1222 # https://stackoverflow.com/questions/639695/how-to-convert-latitude-or-longitude-to-meters
1223 # https://www.geodatasource.com/developers/javascript
[33125]1224
[33293]1225 # Dr Bainbridge wants us to use: https://gis.stackexchange.com/questions/5821/calculating-latitude-longitude-x-miles-from-point
1226 # Since for N,E,S,W the following works out the same as above link, we're using the rule of thumb in the following:
1227 # https://gis.stackexchange.com/questions/2951/algorithm-for-offsetting-a-latitude-longitude-by-some-amount-of-meters
1228 # which states
1229 # "If your displacements aren't too great (less than a few kilometers) and you're not right at the poles,
1230 # use the quick and dirty estimate that 111,111 meters (111.111 km) in the y direction is 1 degree
1231 # (of latitude) and 111,111 * cos(latitude) meters in the x direction is 1 degree (of longitude)."
1232 my $centre_lat = $shape->{"center"}->{"lat"};
1233 my $centre_lng = $shape->{"center"}->{"lng"};
1234 my $radius = $shape->{"radius"}; # in metres!
1235
[33302]1236 ###print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
[33293]1237
1238 my $lat_north = $centre_lat + ($radius/111111);
1239 my $lat_south = $centre_lat - ($radius/111111);
1240
[33302]1241 ###print STDERR "### lat_north: $lat_north\n";
1242 ###print STDERR "### lat_south: $lat_south\n";
[33125]1243
[33293]1244 # our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
1245 # all expect their angle to be in radians. So need to convert from degree to radians before we can take the cos of it.
1246 my $centre_lat_radians = $self->degreesToRadians($centre_lat);
1247 my $cos_in_radians = cos($centre_lat_radians);
[33302]1248 ###print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
[33293]1249 my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
1250 my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
[33302]1251 ###print STDERR "### lng_east $lng_east\n";
1252 ###print STDERR "### lng_west $lng_west\n";
[33293]1253
1254 my $cos_lat = cos($centre_lat);
[33302]1255 ###print STDERR "cos $centre_lat is $cos_lat\n";
[33293]1256
1257 $self->processCoordinate($section, $lat_north, $lng_east);
1258 $self->processCoordinate($section, $lat_south, $lng_east);
1259 $self->processCoordinate($section, $lat_south, $lng_west);
1260 $self->processCoordinate($section, $lat_north, $lng_west);
1261
1262 }
1263 elsif ($type eq "marker") {
[33302]1264 ###print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
1265 ###print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
[33293]1266 $self->processCoordinate($section, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});
1267 }
1268 elsif ($type eq "polyline" || $type eq "polygon") {
1269 my $path_array = $shape->{"path"};
1270 foreach my $position (@$path_array) {
1271 $self->processCoordinate($section, $position->{"lat"}, $position->{"lng"});
1272 }
1273 }
1274 elsif ($type eq "rectangle") {
1275
1276 my $bounds = $shape->{"bounds"};
1277 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"east"});
1278 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"east"});
1279 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"west"});
1280 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"west"});
1281 }
1282
[33125]1283 } # end for on each shape in GPS.mapOverlay
[33302]1284 } # end if(buildcol and text pass)
[33125]1285 } # end GPS.mapOverlay meta
1286
[4]1287 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1288}
1289
[33125]1290# https://en.wikipedia.org/wiki/Radian
1291sub degreesToRadians
1292{
1293 my $self = shift (@_);
1294 my ($degrees) = @_;
[4]1295
[33125]1296 return $degrees * pi /180; # returns radians
1297}
1298
1299sub radiansToDegrees
1300{
1301 my $self = shift (@_);
1302 my ($radians) = @_;
1303
1304 return $radians * 180 / pi; # returns degrees
1305}
[33293]1306
[33302]1307# Call as:
1308# my $json = JSON->new->allow_nonref;
1309# &printAllShapes($json, $json_array);
[33293]1310sub printAllShapes {
1311 my ($json, $json_array) = @_;
1312
1313
1314 #my $pretty_print_shape = $json->pretty->encode( $json_array->[0] );
1315 foreach my $shape (@$json_array) {
1316 my $pretty_print_shape = $json->pretty->encode( $shape );
1317 print STDERR "Shape: $pretty_print_shape\n";
1318 }
1319
[33185]1320}
[33125]1321
[33217]1322# For the (lat, lng) coordinate given,
1323# attaches Coordinate and multiple CoordShort (different precision level) metadata to the doc object
[33125]1324sub processCoordinate {
1325 my $self = shift (@_);
[33217]1326 my ($section, $latitude, $longitude) = @_;
[33125]1327
[33128]1328 my $section_ptr = $self->_lookup_section($section);
1329
[33125]1330 my $lat_direction = ($latitude =~ m/^-/) ? "S" : "N";
1331 my $lng_direction = ($longitude =~ m/^-/) ? "W" : "E";
1332
1333 # have to store (lat, lng) in pairs, when there are so many coords to store
1334 #push (@{$section_ptr->{'metadata'}}, ["Latitude", $latitude]);
1335 #push (@{$section_ptr->{'metadata'}}, ["Longitude", $longitude]);
1336
1337 push (@{$section_ptr->{'metadata'}}, ["Coordinate", "$latitude $longitude"]); # "$latitude$lat_direction $longitude$lng_direction"
1338
1339 my ($latBeforeDec, $latAfterDec);
1340 my ($lngBeforeDec, $lngAfterDec);
1341
1342 if($latitude !~ m/\./) {
1343 $latBeforeDec = $latitude;
1344 $latAfterDec = "";
1345 } else {
1346 ($latBeforeDec, $latAfterDec) = ($latitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1347 }
1348 if($longitude !~ m/\./) {
1349 $lngBeforeDec = $longitude;
1350 $lngAfterDec = "";
1351 } else {
1352 ($lngBeforeDec, $lngAfterDec) = ($longitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1353 }
1354
1355
[33217]1356 my $name = "CoordShort";
1357 push (@{$section_ptr->{'metadata'}}, [$name, "$latBeforeDec$lat_direction $lngBeforeDec$lng_direction"]);
1358
[33185]1359 for(my $i = 2; $i <= 4; $i++)
1360 {
1361 my $latDecPlaces = (length($latAfterDec) >= $i) ? substr($latAfterDec, 0, $i) : "";
1362 my $lngDecPlaces = (length($lngAfterDec) >= $i) ? substr($lngAfterDec, 0, $i) : "";
1363
[33217]1364 push (@{$section_ptr->{'metadata'}}, [$name,
1365 $latBeforeDec . $lat_direction. $latDecPlaces . " " . $lngBeforeDec . $lng_direction. $lngDecPlaces]);
[33185]1366
1367 }
1368
1369 #Only add the metadata if it has not already been added
[33217]1370 #my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no apparent side-effects.)
[33125]1371}
1372
1373
[4]1374# methods for dealing with text
1375
[1241]1376# returns the text for a section
1377sub get_text {
1378 my $self = shift (@_);
1379 my ($section) = @_;
1380
1381 my $section_ptr = $self->_lookup_section($section);
1382 if (!defined $section_ptr) {
1383 print STDERR "doc::get_text couldn't find section " .
1384 "$section\n";
1385 return "";
1386 }
1387
1388 return $section_ptr->{'text'};
1389}
1390
1391# returns the (utf-8 encoded) length of the text for a section
1392sub get_text_length {
1393 my $self = shift (@_);
1394 my ($section) = @_;
1395
1396 my $section_ptr = $self->_lookup_section($section);
1397 if (!defined $section_ptr) {
1398 print STDERR "doc::get_text_length couldn't find section " .
1399 "$section\n";
1400 return 0;
1401 }
1402
1403 return length ($section_ptr->{'text'});
1404}
1405
[23131]1406# returns the total length for all the sections
1407sub get_total_text_length {
1408 my $self = shift (@_);
1409
1410 my $section = $self->get_top_section();
1411 my $length = 0;
1412 while (defined $section) {
1413 $length += $self->get_text_length($section);
1414 $section = $self->get_next_section($section);
1415 }
1416 return $length;
1417}
1418
[1241]1419sub delete_text {
1420 my $self = shift (@_);
1421 my ($section) = @_;
1422
1423 my $section_ptr = $self->_lookup_section($section);
1424 if (!defined $section_ptr) {
1425 print STDERR "doc::delete_text couldn't find section " .
1426 "$section\n";
1427 return;
1428 }
1429
1430 $section_ptr->{'text'} = "";
1431}
1432
[73]1433# add_text assumes the text is in (extended) ascii form. For
1434# text which has been already converted to the UTF-8 format
1435# use add_utf8_text.
[4]1436sub add_text {
1437 my $self = shift (@_);
1438 my ($section, $text) = @_;
1439
[73]1440 # convert the text to UTF-8 encoded unicode characters
1441 # and add the text
[1870]1442 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
[73]1443}
1444
1445
1446# add_utf8_text assumes the text to be added has already
1447# been converted to the UTF-8 encoding. For ascii text use
1448# add_text
[32590]1449# Pass by value version (internally calls pass by ref version
1450# to avoid code duplication)
[73]1451sub add_utf8_text {
1452 my $self = shift (@_);
1453 my ($section, $text) = @_;
[32590]1454
1455 $self->add_utf8_textref($section, \$text);
1456}
[73]1457
[32590]1458# Pass by reference version, used by GreenstoneSQLPlugin for fulltext
1459sub add_utf8_textref {
1460 my $self = shift (@_);
1461 my ($section, $text_ref) = @_;
1462
[4]1463 my $section_ptr = $self->_lookup_section($section);
1464 if (!defined $section_ptr) {
[32590]1465 print STDERR "doc::add_utf8_textref couldn't find section " .
[4]1466 "$section\n";
1467 return;
1468 }
1469
[32590]1470 $section_ptr->{'text'} .= $$text_ref;
[4]1471}
1472
[16950]1473# returns the Source meta, which is the utf8 filename generated.
1474# Added a separate method here for convenience
1475sub get_source {
[16924]1476 my $self = shift (@_);
[16950]1477 return $self->get_metadata_element ($self->get_top_section(), "Source");
1478}
1479
1480# returns the SourceFile meta, which is the url reference to the URL-encoded
1481# version of Source (the utf8 filename). Added a separate method here for convenience
1482sub get_sourcefile {
1483 my $self = shift (@_);
1484 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1485}
1486
1487# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1488# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1489sub get_assocfile_from_sourcefile {
1490 my $self = shift (@_);
[16924]1491
1492 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1493 my $top_section = $self->get_top_section();
1494 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
[4]1495
[16924]1496 # get the actual filename as it exists on the filesystem which this url refers to
[16928]1497 $source_file = &unicode::url_to_filename($source_file);
[16924]1498 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1499 return $assocfilename;
1500}
1501
[1241]1502# methods for dealing with associated files
1503
1504# a file is associated with a document, NOT a section.
1505# if section is defined it is noted in the data structure
1506# only so that files associated from a particular section
1507# may be removed later (using delete_section_assoc_files)
1508sub associate_file {
1509 my $self = shift (@_);
1510 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1511 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1512
1513 # remove all associated files with the same name
1514 $self->delete_assoc_file ($assoc_filename);
[23413]1515
1516 # Too harsh a requirement
1517 # Definitely get HTML docs, for example, with some missing
1518 # support files
1519# if (!&util::fd_exists($real_filename)) {
1520# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1521# exit -1;
1522# }
1523
[23387]1524# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1525# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1526## my $utf8_filename = Encode::encode("utf8",$filename);
1527
[1241]1528 push (@{$self->{'associated_files'}},
1529 [$real_filename, $assoc_filename, $mime_type, $section]);
1530}
1531
1532# returns a list of associated files in the form
1533# [[real_filename, assoc_filename, mimetype], ...]
1534sub get_assoc_files {
1535 my $self = shift (@_);
1536
1537 return $self->{'associated_files'};
1538}
1539
[20775]1540# the following two methods used to keep track of original associated files
1541# for incremental building. eg a txt file used by an item file does not end
1542# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1543# file for incremental build
1544sub associate_source_file {
1545 my $self = shift (@_);
1546 my ($full_filename) = @_;
[19494]1547
[20775]1548 push (@{$self->{'source_assoc_files'}}, $full_filename);
1549
1550}
1551
1552sub get_source_assoc_files {
1553 my $self = shift (@_);
1554
1555 return $self->{'source_assoc_files'};
1556
1557
1558}
[19494]1559sub metadata_file {
1560 my $self = shift (@_);
1561 my ($real_filename, $filename) = @_;
1562
1563 push (@{$self->{'metadata_files'}},
1564 [$real_filename, $filename]);
1565}
1566
[21566]1567# used for writing out the archiveinf-doc info database, to list all the metadata files
[19494]1568sub get_meta_files {
1569 my $self = shift (@_);
1570
1571 return $self->{'metadata_files'};
1572}
1573
[1241]1574sub delete_section_assoc_files {
1575 my $self = shift (@_);
1576 my ($section) = @_;
1577
1578 my $i=0;
1579 while ($i < scalar (@{$self->{'associated_files'}})) {
1580 if (defined $self->{'associated_files'}->[$i]->[3] &&
1581 $self->{'associated_files'}->[$i]->[3] eq $section) {
1582 splice (@{$self->{'associated_files'}}, $i, 1);
1583 } else {
1584 $i++;
1585 }
1586 }
1587}
1588
1589sub delete_assoc_file {
1590 my $self = shift (@_);
1591 my ($assoc_filename) = @_;
1592
1593 my $i=0;
1594 while ($i < scalar (@{$self->{'associated_files'}})) {
1595 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1596 splice (@{$self->{'associated_files'}}, $i, 1);
1597 } else {
1598 $i++;
1599 }
1600 }
1601}
1602
1603sub reset_nextsection_ptr {
1604 my $self = shift (@_);
1605 my ($section) = @_;
1606
1607 my $section_ptr = $self->_lookup_section($section);
1608 $section_ptr->{'next_subsection'} = 1;
1609}
1610
[4]16111;
Note: See TracBrowser for help on using the repository browser.