source: main/trunk/greenstone2/perllib/doc.pm@ 34394

Last change on this file since 34394 was 34394, checked in by ak19, 4 years ago

Bugfix 1 for GLI metadata slowdown: selecting multiple Gathererd files in GLI became very slow. Kathy and Dr Bainbridge had tracked this down to code I had added to support non basic ASCII filenames in GLI, which was making an expensive win operating system function call on Windows for each selected file, launching a Java Process for each. The speed of selecting multiple files is now back to being almost as fast as in 3.09. Tested on Windows and linux. Had to treat windows as a special case because I can't get the code modifications to work on Linux: the perl code stores a hex-encoded string for the filename that GLI now uses when OS is Windows and compares against the hex encoded name of a file selected. But on linux the hex encoded value generated by perl is not the same as that which java generates and after trying repeatedly, I'e not been able to succeed to get it to work. So the code behaves as before for Linux.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 52.8 KB
RevLine 
[537]1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
[8894]10# This program is free software; you can redistr te it and/or modify
[537]11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[1241]26# base class to hold documents
[4]27
28package doc;
[3834]29eval {require bytes};
[4]30
[832]31BEGIN {
[1241]32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
[832]34}
35
[15894]36use strict;
[1241]37use unicode;
38use util;
[27306]39use FileUtils;
[1241]40use ghtml;
[8220]41use File::stat;
[1241]42##use hashdoc;
[13172]43use docprint;
[33125]44use JSON;
[1241]45
[33125]46# We just need pi from the Trig lib
[33293]47# Import constants pi2, pip2, pip4 (2*pi, pi/2, pi/4).
[33416]48# But doing
49# use Math::Trig ':pi';
50# resulted in errors on 32 bit linux VM with perl v8 of "bareword pi not allowed while strict subs in use"
51# So, now doing just
52use Math::Trig;
[33125]53
[4]54# the document type may be indexed_doc, nonindexed_doc, or
55# classification
56
[18528]57our $OIDcount = 0;
[2327]58
[33171]59
60# Declare the Package (level) Variable that will tell us whether we're in the import or buildcol phase.
61# This var will be set by import.pl, buildcol.pl and export.pl to the then appropriate value.
62# https://perlmaven.com/package-variables-and-lexical-variables-in-perl
63# But: https://stackoverflow.com/questions/7542120/perl-how-to-make-variables-from-requiring-script-available-in-required-script
64# https://stackoverflow.com/questions/33230015/how-do-i-use-a-variable-in-another-perl-script
65#$doc::cmd_line_mode = undef;
66# Decided to follow the usage of $OIDcount above to declare the necessary package-level variables 'globally' accessible
67# actually accessible only for those packages that import this one (with 'use doc')
[33302]68my $cmd_line_mode = undef;
[33171]69
[33302]70# processor_mode keeps track of which buildcol pass we're at: dummy, text (sidx/didx passes) or infodb
71my $processor_mode = undef;
72
[20412]73# rename_method can be 'url', 'none', 'base64'
[4]74sub new {
75 my $class = shift (@_);
[18319]76 my ($source_filename, $doc_type, $rename_method) = @_;
[1374]77
[13770]78
[1241]79 my $self = bless {'associated_files'=>[],
80 'subsection_order'=>[],
81 'next_subsection'=>1,
82 'subsections'=>{},
83 'metadata'=>[],
[2327]84 'text'=>"",
85 'OIDtype'=>"hash"}, $class;
[4]86
[10217]87 # used to set lastmodified here, but this can screw up the HASH ids, so
88 # the docsave processor now calls set_lastmodified
[13770]89
[23362]90 $self->set_source_path($source_filename);
[10217]91
[7929]92 if (defined $source_filename) {
[15874]93 $source_filename = &util::filename_within_collection($source_filename);
[18508]94 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
[18319]95 $self->set_source_filename ($source_filename, $rename_method);
[7929]96 }
97
[1374]98 $self->set_doc_type ($doc_type) if defined $doc_type;
[485]99
[4]100 return $self;
101}
[23362]102
103
104sub set_source_path
105{
106 my $self = shift @_;
107 my ($source_filename) = @_;
108
109 if (defined $source_filename) {
110 # On Windows the source_filename can be in terse DOS format
111 # e.g. test~1.txt
112
113 $self->{'terse_source_path'} = $source_filename;
114
[27306]115 # Use the FileUtil library methods as they are aware of more special
116 # cases such as HDFS [jmt12]
[27393]117 if (&FileUtils::fileExists($source_filename))
118 {
[23362]119 # See if we can do better for Windows with a filename
120 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
121 require Win32;
122 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
123 }
124 else {
125 # For Unix-based systems, there is no difference between the two
126 $self->{'source_path'} = $source_filename;
[34394]127 }
128
129 # On Windows, the code above has ensured source_path is the Win long (full) path name.
130 # To help GLI associate metadata with an easily calculated and accurate representation of
131 # filenames, we now store the Win long path name, hex encoded.
132 # We're not using this field on Linux, as I can't get the hex encodings generated to match
133 # what GLI Java code generates. But for symmetry we store this field on Unix too, but we need
134 # to hex-encode source_path on Unix too, or it may not be UTF-8 and doc.xml will be invalid
135 my $hexencodedlongsourcepath = &unicode::debug_unicode_string($self->{'source_path'});
136 $self->set_utf8_metadata_element ($self->get_top_section(), "gsdlfullsourcepath", $hexencodedlongsourcepath);
137
[23362]138 }
139 else {
140 print STDERR "Warning: In doc::set_source_path(), file\n";
141 print STDERR " $source_filename\n";
142 print STDERR " does not exist\n";
143
144 # (default) Set it to whatever we were given
145 $self->{'source_path'} = $source_filename;
146 }
147 }
148 else {
149 # Previous code for setting source_path allowed for
150 # it to be undefined, so continue this practice
151 $self->{'terse_source_path'} = undef;
152 $self->{'source_path'} = undef;
153 }
154}
155
156
157sub get_source_path
158{
159 my $self = shift @_;
160
161 return $self->{'terse_source_path'};
162}
163
[10217]164# set lastmodified for OAI purposes, added by GRB, moved by kjdon
[23939]165sub set_oailastmodified {
166 my $self = shift (@_);
167
168 my $source_path = $self->{'terse_source_path'};
169
170 if (defined $source_path && (-e $source_path)) {
171 my $current_time = time;
172
173 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
174 $wday, $yday, $isdst) = localtime($current_time);
175
176 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
177
178 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
179 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
180 }
181}
182
183# no longer used for OAI purposes, since lastmodified is not what we want as the
184# Datestamp of a document. This doc metadata may be useful for general purposes.
[10217]185sub set_lastmodified {
186 my $self = shift (@_);
[4]187
[23362]188 my $source_path = $self->{'terse_source_path'};
[13770]189
190 if (defined $source_path && (-e $source_path)) {
191
192 my $file_stat = stat($source_path);
[10217]193 my $mtime = $file_stat->mtime;
[21862]194 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
195 $wday, $yday, $isdst) = localtime($mtime);
196
197 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
198
[23923]199 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
[21862]200 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
[10217]201 }
202}
203
[1241]204# clone the $self object
205sub duplicate {
206 my $self = shift (@_);
207
208 my $newobj = {};
209
[8716]210 foreach my $k (keys %$self) {
[1241]211 $newobj->{$k} = &clone ($self->{$k});
212 }
213
214 bless $newobj, ref($self);
215 return $newobj;
216}
217
218sub clone {
219 my ($from) = @_;
220 my $type = ref ($from);
221
222 if ($type eq "HASH") {
223 my $to = {};
[8716]224 foreach my $key (keys %$from) {
[1241]225 $to->{$key} = &clone ($from->{$key});
226 }
227 return $to;
228 } elsif ($type eq "ARRAY") {
229 my $to = [];
[8716]230 foreach my $v (@$from) {
[1241]231 push (@$to, &clone ($v));
232 }
233 return $to;
234 } else {
235 return $from;
236 }
237}
238
[2327]239sub set_OIDtype {
240 my $self = shift (@_);
[12268]241 my ($type, $metadata) = @_;
[1241]242
[26536]243 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
[2327]244 $self->{'OIDtype'} = $type;
245 } else {
246 $self->{'OIDtype'} = "hash";
247 }
[16792]248
[12268]249 if ($type =~ /^assigned$/) {
250 if (defined $metadata) {
251 $self->{'OIDmetadata'} = $metadata;
252 } else {
253 $self->{'OIDmetadata'} = "dc.Identifier";
254 }
255 }
[2327]256}
257
[20412]258# rename_method can be 'url', 'none', 'base64'
[1241]259sub set_source_filename {
260 my $self = shift (@_);
[18319]261 my ($source_filename, $rename_method) = @_;
[1241]262
[16578]263 # Since the gsdlsourcefilename element goes into the doc.xml it has
264 # to be utf8. However, it should also *represent* the source filename
265 # (in the import directory) which may not be utf8 at all.
[18319]266 # For instance, if this meta element (gsdlsourcefilename) will be used
267 # by other applications that parse doc.xml in order to locate
268 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
269 # encode the real filename as this is a binary-to-text encoding meaning
270 # that the resulting string is ASCII (utf8). Decoding will give the original.
[16578]271
[18319]272# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
[33756]273## print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename\n";
[18319]274
[16670]275 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
[23278]276 $source_filename = $self->encode_filename($source_filename, $rename_method);
277# my ($srcfilename,$dirname,$suffix)
278# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
[16792]279# print STDERR "-> $srcfilename -> ";
[23278]280# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
[27306]281# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
[16792]282# print STDERR "$source_filename\n";
[33756]283
284## print STDERR "AFTER URL/base64 encoding gsdlsourcefilename: $source_filename\n";
285
[16578]286 $self->set_utf8_metadata_element ($self->get_top_section(),
[1241]287 "gsdlsourcefilename",
288 $source_filename);
[33756]289
290 # If we set the file renaming method, at least GLI can decode gsdlsourcefilename to get the
291 # original filename (relative path to file) back for GLI to notice what ex Meta is associated with it
292 $self->set_utf8_metadata_element($self->get_top_section(),
293 "gsdlsourcefilerenamemethod",
294 $rename_method);
[1241]295}
296
[23278]297sub encode_filename {
298 my $self = shift (@_);
299 my ($source_filename, $rename_method) = @_;
300
301 my ($srcfilename,$dirname,$suffix)
302 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
303# print STDERR "-> $srcfilename -> ";
304 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
[33756]305
306 # encode any subdirs of "import" also, but not import itself
307 my $sep = ($ENV{'GSDLOS'} =~ /^windows$/i) ? "\\" : "/";
308 my @dirs = ($sep eq "\\") ? split(/[\\\/]+/, $dirname) : split(/\//, $dirname);
309
[33757]310 my $dirpath = "";
311
312 # Don't encode the first folder ("import" or "tmp"): GLI's DocXMLFile.java looks for literal "import"
[34105]313 # or "tmp" before it knows what the file rename method is that is to be used decode the rest of gsdlsourcefilename.
314 if(scalar (@dirs) >= 1) {
315 $dirpath = shift(@dirs);
[33757]316 }
[33756]317 foreach my $subdir (@dirs) {
[34105]318 #print STDERR "@@@@ Found subdir: $subdir\n";
[33756]319
[33757]320 $subdir = &util::rename_file($subdir, $rename_method);
321 #print STDERR "@@@@ encoded subdir: $subdir\n";
322
[33756]323 $dirpath = &FileUtils::filenameConcatenate($dirpath, $subdir);
324 }
325 if($dirpath ne "") {
326 $dirname = $dirpath;
327 }
328
[27306]329 $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
[23278]330
331 return $source_filename;
332}
333
[7569]334sub set_converted_filename {
335 my $self = shift (@_);
336 my ($converted_filename) = @_;
337
[16578]338 # we know the converted filename is utf8
339 $self->set_utf8_metadata_element ($self->get_top_section(),
[7569]340 "gsdlconvertedfilename",
341 $converted_filename);
342}
343
[1241]344# returns the source_filename as it was provided
[19829]345sub get_unmodified_source_filename {
346 my $self = shift (@_);
347
[23362]348 return $self->{'terse_source_path'};
[19829]349}
350
351# returns the source_filename with whatever rename_method was given
[1241]352sub get_source_filename {
353 my $self = shift (@_);
354
355 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
356}
357
[19829]358
359
[7569]360# returns converted filename if available else returns source filename
361sub get_filename_for_hashing {
362 my $self = shift (@_);
363
364 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
365
366 if (!defined $filename) {
[11097]367 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
[10980]368 # if NULPlug processed file, then don't give a filename
[11097]369 if (defined $plugin_name && $plugin_name eq "NULPlug") {
[10980]370 $filename = undef;
[16670]371 } else { # returns the URL encoded source filename!
[10980]372 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
373 }
[7569]374 }
[23485]375
[27306]376 if (!&FileUtils::isFilenameAbsolute($filename)) {
377 $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
[23485]378 }
379
[7569]380 return $filename;
381}
382
[1241]383sub set_doc_type {
384 my $self = shift (@_);
385 my ($doc_type) = @_;
386
387 $self->set_metadata_element ($self->get_top_section(),
388 "gsdldoctype",
389 $doc_type);
390}
391
[10217]392# returns the gsdldoctype as it was provided
[1241]393# the default of "indexed_doc" is used if no document
394# type was provided
395sub get_doc_type {
396 my $self = shift (@_);
397
398 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
399 return $doc_type if (defined $doc_type);
400 return "indexed_doc";
401}
402
403
404# look up the reference to the a particular section
405sub _lookup_section {
406 my $self = shift (@_);
407 my ($section) = @_;
408
409 my ($num);
410 my $sectionref = $self;
411
412 while (defined $section && $section ne "") {
[12327]413
[1241]414 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
[12327]415
416 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
417
[1241]418 $section = "" unless defined $section;
419
[12327]420
[1241]421 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
422 $sectionref = $sectionref->{'subsections'}->{$num};
423 } else {
424 return undef;
425 }
426 }
427
428 return $sectionref;
429}
430
[2327]431# calculate OID by hashing the contents of the document
[1241]432sub _calc_OID {
433 my $self = shift (@_);
434 my ($filename) = @_;
435
[22855]436
[1241]437 my $osexe = &util::get_os_exe();
438
[27306]439 my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
[1241]440 $ENV{'GSDLOS'},"hashfile$osexe");
[8504]441
[28560]442 &util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
443 # The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
444
[28575]445 # A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
446 # (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
447 # E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
448
[1241]449 my $result = "NULL";
[22855]450
[1679]451
[1241]452 if (-e "$hashfile_exe") {
[1679]453# $result = `\"$hashfile_exe\" \"$filename\"`;
[21862]454# $result = `hashfile$osexe \"$filename\" 2>&1`;
[1679]455 $result = `hashfile$osexe \"$filename\"`;
[21862]456
[1241]457 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
458 } else {
459 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
460 }
461 return "HASH$result";
462}
463
464# methods dealing with OID, not groups of them.
465
[2327]466# if $OID is not provided one is calculated
[1241]467sub set_OID {
468 my $self = shift (@_);
469 my ($OID) = @_;
[8504]470
[8797]471 my $use_hash_oid = 0;
[17057]472 # if an OID wasn't provided calculate one
[1241]473 if (!defined $OID) {
474 $OID = "NULL";
[17025]475 if ($self->{'OIDtype'} =~ /^hash/) {
[8797]476 $use_hash_oid = 1;
477 } elsif ($self->{'OIDtype'} eq "incremental") {
[2327]478 $OID = "D" . $OIDcount;
479 $OIDcount ++;
[26221]480 } elsif ($self->{'OIDtype'} eq "filename") {
481 my $filename = $self->get_source_filename();
482 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
483 $OID = &util::tidy_up_oid($OID);
[26536]484 } elsif ($self->{'OIDtype'} eq "full_filename") {
485 my $source_filename = $self->get_source_filename();
486 my $dirsep = &util::get_os_dirsep();
487
488 $source_filename =~ s/^import$dirsep//;
489 $source_filename =~ s/$dirsep/-/g;
490 $source_filename =~ s/\./_/g;
491
492 $OID = $source_filename;
493 $OID = &util::tidy_up_oid($OID);
[8716]494 } elsif ($self->{'OIDtype'} eq "dirname") {
495 my $filename = $self->get_source_filename();
[8797]496 if (defined($filename)) { # && -e $filename) {
[27350]497 # get the immediate parent directory
[8716]498 $OID = &File::Basename::dirname($filename);
499 if (defined $OID) {
[27350]500 $OID = &File::Basename::basename($OID);
[19617]501 $OID = &util::tidy_up_oid($OID);
[8716]502 } else {
[8797]503 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
504 $use_hash_oid = 1;
[8716]505 }
506 } else {
[27350]507 print STDERR "Failed to find a filename, generating hash id\n";
[8797]508 $use_hash_oid = 1;
[8716]509 }
[2327]510
[8797]511 } elsif ($self->{'OIDtype'} eq "assigned") {
[12268]512 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
[8797]513 if (defined $identifier && $identifier ne "") {
[17025]514 $OID = $identifier;
[19617]515 $OID = &util::tidy_up_oid($OID);
[8797]516 } else {
517 # need a hash id
[12268]518 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
[8797]519 $use_hash_oid = 1;
520 }
521
[2327]522 } else {
[8797]523 $use_hash_oid = 1;
524 }
525
526 if ($use_hash_oid) {
[23562]527 my $hash_on_file = 1;
[26536]528 my $hash_on_ga_xml = 0;
529
[17025]530 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
531 $hash_on_file = 0;
[26536]532 $hash_on_ga_xml = 1;
[17025]533 }
[26536]534
535 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
536 $hash_on_file = 0;
537 $hash_on_ga_xml = 0;
538
539 my $source_filename = $self->get_source_filename();
540 my $dirsep = &util::get_os_dirsep();
541
542 $source_filename =~ s/^import$dirsep//;
543 $source_filename =~ s/$dirsep/-/g;
544 $source_filename =~ s/\./_/g;
545
546 # If the filename is very short then (handled naively)
547 # this can cause conjestion in the hash-values
548 # computed, leading documents sharing the same leading
549 # Hex values in the computed has.
550 #
551 # The solution taken here is to replace the name of
552 # the file name a sufficient number of times (up to
553 # the character limit defined in 'rep_limit' and
554 # make that the content that is hashed on
555
556 # *** Think twice before changing the following value
557 # as it will break backward compatability of computed
558 # document HASH values
559
560 my $rep_limit = 256;
561 my $hash_content = undef;
562
563 if (length($source_filename)<$rep_limit) {
564 my $rep_string = "$source_filename|";
565 my $rs_len = length($rep_string);
566
567 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
568
569 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
570 }
571 else {
572 $hash_content = $source_filename;
573 }
574
575 my $filename = &util::get_tmp_filename();
576 if (!open (OUTFILE, ">:utf8", $filename)) {
577 print STDERR "doc::set_OID could not write to $filename\n";
578 } else {
579 print OUTFILE $hash_content;
580 close (OUTFILE);
581 }
582 $OID = $self->_calc_OID ($filename);
583
[27538]584 &FileUtils::removeFiles ($filename);
[26536]585 }
586
[17025]587 if ($hash_on_file) {
588 # "hash" OID - feed file to hashfile.exe
589 my $filename = $self->get_filename_for_hashing();
[23562]590
[17025]591 # -z: don't want to hash on the file if it is zero size
592 if (defined($filename) && -e $filename && !-z $filename) {
593 $OID = $self->_calc_OID ($filename);
594 } else {
[26536]595 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
[17025]596 }
597 }
[26536]598
599 if ($hash_on_ga_xml) {
600 # In addition being asked to explicity calculate the has based on the GA file,
601 # can also end up coming into this block is doing 'hash_on_file' but the file
602 # itself is of zero bytes (as could be the case with 'doc.nul' file
603
[17025]604 my $filename = &util::get_tmp_filename();
[22855]605 if (!open (OUTFILE, ">:utf8", $filename)) {
[2327]606 print STDERR "doc::set_OID could not write to $filename\n";
607 } else {
[32512]608 my $doc_text = &docprint::get_section_xml($self);
[13172]609 print OUTFILE $doc_text;
[2327]610 close (OUTFILE);
611 }
612 $OID = $self->_calc_OID ($filename);
[27393]613 &FileUtils::removeFiles($filename);
[1374]614 }
[1241]615 }
616 }
617 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
618}
619
620# this uses hashdoc (embedded c thingy) which is faster but still
[32535]621# needs a little work to be sufficiently stable
[1241]622sub ___set_OID {
623 my $self = shift (@_);
624 my ($OID) = @_;
625
626 # if an OID wasn't provided then calculate hash value based on document
627 if (!defined $OID)
628 {
[32512]629 my $hash_text = &docprint::get_section_xml($self);
[1241]630 my $hash_len = length($hash_text);
631
632 $OID = &hashdoc::buffer($hash_text,$hash_len);
633 }
634
635 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
636}
637
638# returns the OID for this document
639sub get_OID {
640 my $self = shift (@_);
641 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
642 return $OID if (defined $OID);
643 return "NULL";
644}
645
646sub delete_OID {
647 my $self = shift (@_);
648
649 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
650}
651
652
653# methods for manipulating section names
654
655# returns the name of the top-most section (the top
656# level of the document
657sub get_top_section {
658 my $self = shift (@_);
659
660 return "";
661}
662
663# returns a section
664sub get_parent_section {
665 my $self = shift (@_);
666 my ($section) = @_;
667
668 $section =~ s/(^|\.)\d+$//;
669
670 return $section;
671}
672
673# returns the first child section (or the end child
674# if there isn't any)
675sub get_begin_child {
676 my $self = shift (@_);
677 my ($section) = @_;
678
679 my $section_ptr = $self->_lookup_section($section);
680 return "" unless defined $section_ptr;
681
682 if (defined $section_ptr->{'subsection_order'}->[0]) {
683 return "$section.$section_ptr->{'subsection_order'}->[0]";
684 }
685
686 return $self->get_end_child ($section);
687}
688
689# returns the next child of a parent section
690sub get_next_child {
691 my $self = shift (@_);
692 my ($section) = @_;
693
694 my $parent_section = $self->get_parent_section($section);
695 my $parent_section_ptr = $self->_lookup_section($parent_section);
696 return undef unless defined $parent_section_ptr;
697
698 my ($section_num) = $section =~ /(\d+)$/;
699 return undef unless defined $section_num;
700
701 my $i = 0;
702 my $section_order = $parent_section_ptr->{'subsection_order'};
703 while ($i < scalar(@$section_order)) {
704 last if $section_order->[$i] eq $section_num;
705 $i++;
706 }
707
708 $i++; # the next child
709 if ($i < scalar(@$section_order)) {
710 return $section_order->[$i] if $parent_section eq "";
711 return "$parent_section.$section_order->[$i]";
712 }
713
714 # no more sections in this level
715 return undef;
716}
717
718# returns a reference to a list of children
719sub get_children {
720 my $self = shift (@_);
721 my ($section) = @_;
722
723 my $section_ptr = $self->_lookup_section($section);
724 return [] unless defined $section_ptr;
725
726 my @children = @{$section_ptr->{'subsection_order'}};
727
728 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
729 return \@children;
730}
731
732# returns the child section one past the last one (which
733# is coded as "0")
734sub get_end_child {
735 my $self = shift (@_);
736 my ($section) = @_;
737
738 return $section . ".0" unless $section eq "";
739 return "0";
740}
741
742# returns the next section in book order
743sub get_next_section {
744 my $self = shift (@_);
745 my ($section) = @_;
746
747 return undef unless defined $section;
748
749 my $section_ptr = $self->_lookup_section($section);
750 return undef unless defined $section_ptr;
751
752 # first try to find first child
753 if (defined $section_ptr->{'subsection_order'}->[0]) {
754 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
755 return "$section.$section_ptr->{'subsection_order'}->[0]";
756 }
757
758 do {
759 # try to find sibling
760 my $next_child = $self->get_next_child ($section);
761 return $next_child if (defined $next_child);
762
763 # move up one level
764 $section = $self->get_parent_section ($section);
765 } while $section =~ /\d/;
766
767 return undef;
768}
769
770sub is_leaf_section {
771 my $self = shift (@_);
772 my ($section) = @_;
773
774 my $section_ptr = $self->_lookup_section($section);
775 return 1 unless defined $section_ptr;
776
777 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
778}
779
780# methods for dealing with sections
781
782# returns the name of the inserted section
783sub insert_section {
784 my $self = shift (@_);
785 my ($before_section) = @_;
786
787 # get the child to insert before and its parent section
788 my $parent_section = "";
789 my $before_child = "0";
790 my @before_section = split (/\./, $before_section);
791 if (scalar(@before_section) > 0) {
792 $before_child = pop (@before_section);
793 $parent_section = join (".", @before_section);
794 }
795
796 my $parent_section_ptr = $self->_lookup_section($parent_section);
797 if (!defined $parent_section_ptr) {
798 print STDERR "doc::insert_section couldn't find parent section " .
799 "$parent_section\n";
800 return;
801 }
802
803 # get the next section number
804 my $section_num = $parent_section_ptr->{'next_subsection'}++;
805
806 my $i = 0;
807 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
808 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
809 $i++;
810 }
811
812 # insert the section number into the order list
813 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
814
815 # add this section to the parent section
816 my $section_ptr = {'subsection_order'=>[],
817 'next_subsection'=>1,
818 'subsections'=>{},
819 'metadata'=>[],
820 'text'=>""};
821 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
822
823 # work out the full section number
824 my $section = $parent_section;
825 $section .= "." unless $section eq "";
826 $section .= $section_num;
827
828 return $section;
829}
830
831# creates a pre-named section
832sub create_named_section {
833 my $self = shift (@_);
834 my ($mastersection) = @_;
835
836 my ($num);
837 my $section = $mastersection;
838 my $sectionref = $self;
839
840 while ($section ne "") {
841 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
842 $num =~ s/^0+(\d)/$1/; # remove leading 0s
843 $section = "" unless defined $section;
844
845 if (defined $num) {
846 if (!defined $sectionref->{'subsections'}->{$num}) {
847 push (@{$sectionref->{'subsection_order'}}, $num);
848 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
849 'next_subsection'=>1,
850 'subsections'=>{},
851 'metadata'=>[],
852 'text'=>""};
853 if ($num >= $sectionref->{'next_subsection'}) {
854 $sectionref->{'next_subsection'} = $num + 1;
855 }
856 }
857 $sectionref = $sectionref->{'subsections'}->{$num};
858
859 } else {
860 print STDERR "doc::create_named_section couldn't create section ";
861 print STDERR "$mastersection\n";
862 last;
863 }
864 }
865}
866
867# returns a reference to a list of subsections
868sub list_subsections {
869 my $self = shift (@_);
870 my ($section) = @_;
871
872 my $section_ptr = $self->_lookup_section ($section);
873 if (!defined $section_ptr) {
874 print STDERR "doc::list_subsections couldn't find section $section\n";
875 return [];
876 }
877
878 return [@{$section_ptr->{'subsection_order'}}];
879}
880
881sub delete_section {
882 my $self = shift (@_);
883 my ($section) = @_;
884
885# my $section_ptr = {'subsection_order'=>[],
886# 'next_subsection'=>1,
887# 'subsections'=>{},
888# 'metadata'=>[],
889# 'text'=>""};
890
891 # if this is the top section reset everything
892 if ($section eq "") {
893 $self->{'subsection_order'} = [];
894 $self->{'subsections'} = {};
895 $self->{'metadata'} = [];
896 $self->{'text'} = "";
897 return;
898 }
899
900 # find the parent of the section to delete
901 my $parent_section = "";
902 my $child = "0";
903 my @section = split (/\./, $section);
904 if (scalar(@section) > 0) {
905 $child = pop (@section);
906 $parent_section = join (".", @section);
907 }
908
909 my $parent_section_ptr = $self->_lookup_section($parent_section);
910 if (!defined $parent_section_ptr) {
911 print STDERR "doc::delete_section couldn't find parent section " .
912 "$parent_section\n";
913 return;
914 }
915
916 # remove this section from the subsection_order list
917 my $i = 0;
918 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
919 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
920 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
921 last;
922 }
923 $i++;
924 }
925
926 # remove this section from the subsection hash
927 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
928 undef $parent_section_ptr->{'subsections'}->{$child};
929 }
930}
931
932#--
[4]933# methods for dealing with metadata
934
935# set_metadata_element and get_metadata_element are for metadata
936# which should only have one value. add_meta_data and get_metadata
937# are for metadata which can have more than one value.
938
[1241]939# returns the first metadata value which matches field
[6111]940
941# This version of get metadata element works much like the one above,
942# except it allows for the namespace portion of a metadata element to
943# be ignored, thus if you are searching for dc.Title, the first piece
944# of matching metadata ending with the name Title (once any namespace
945# is removed) would be returned.
946# 28-11-2003 John Thompson
[1241]947sub get_metadata_element {
948 my $self = shift (@_);
[6111]949 my ($section, $field, $ignore_namespace) = @_;
[1241]950 my ($data);
951
[6111]952 $ignore_namespace = 0 unless defined $ignore_namespace;
953
[1241]954 my $section_ptr = $self->_lookup_section($section);
955 if (!defined $section_ptr) {
[8716]956 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
[1241]957 return;
958 }
959
[24404]960 # Remove any namespace if we are being told to ignore them
[6111]961 if($ignore_namespace) {
[24404]962 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
[6111]963 }
964
[1241]965 foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]966
967 my $data_name = $data->[0];
[14966]968
[24404]969 # Remove any namespace if we are being told to ignore them
[6111]970 if($ignore_namespace) {
[24404]971 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
[6111]972 }
[24404]973 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
974 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
975
[6111]976 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
[1241]977 }
978
979 return undef; # was not found
980}
981
982# returns a list of the form [value1, value2, ...]
983sub get_metadata {
984 my $self = shift (@_);
[6111]985 my ($section, $field, $ignore_namespace) = @_;
[1241]986 my ($data);
987
[6111]988 $ignore_namespace = 0 unless defined $ignore_namespace;
989
[1241]990 my $section_ptr = $self->_lookup_section($section);
991 if (!defined $section_ptr) {
[8716]992 print STDERR "doc::get_metadata couldn't find section ",
993 $section, "\n";
[1241]994 return;
995 }
996
[24404]997 # Remove any namespace if we are being told to ignore them
[6111]998 if($ignore_namespace) {
[24404]999 $field =~ s/^.*\.//;
[6111]1000 }
1001
[1241]1002 my @metadata = ();
1003 foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]1004
1005 my $data_name = $data->[0];
[24404]1006
1007 # Remove any namespace if we are being told to ignore them
[6111]1008 if($ignore_namespace) {
[24404]1009 $data_name =~ s/^.*\.//;
1010 }
1011 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
1012 $data_name =~ s/^ex\.([^.]+)$/$1/;
1013
[6111]1014 push (@metadata, $data->[1]) if ($data_name eq $field);
[1241]1015 }
[9241]1016
[1241]1017 return \@metadata;
1018}
1019
[23827]1020sub get_metadata_hashmap {
1021 my $self = shift (@_);
1022 my ($section, $opt_namespace) = @_;
1023
1024 my $section_ptr = $self->_lookup_section($section);
1025 if (!defined $section_ptr) {
1026 print STDERR "doc::get_metadata couldn't find section ",
1027 $section, "\n";
1028 return;
1029 }
1030
1031 my $metadata_hashmap = {};
1032 foreach my $data (@{$section_ptr->{'metadata'}}) {
1033 my $metaname = $data->[0];
1034
1035 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
1036 if (!defined $metadata_hashmap->{$metaname}) {
1037 $metadata_hashmap->{$metaname} = [];
1038 }
1039 my $metaval_list = $metadata_hashmap->{$metaname};
1040 push(@$metaval_list, $data->[1]);
1041 }
1042 }
1043
1044 return $metadata_hashmap;
1045}
1046
[1241]1047# returns a list of the form [[field,value],[field,value],...]
1048sub get_all_metadata {
1049 my $self = shift (@_);
1050 my ($section) = @_;
1051
1052 my $section_ptr = $self->_lookup_section($section);
1053 if (!defined $section_ptr) {
[8716]1054 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
[1241]1055 return;
1056 }
1057
1058 return $section_ptr->{'metadata'};
1059}
1060
1061# $value is optional
1062sub delete_metadata {
1063 my $self = shift (@_);
1064 my ($section, $field, $value) = @_;
1065
1066 my $section_ptr = $self->_lookup_section($section);
1067 if (!defined $section_ptr) {
[23562]1068 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
[1241]1069 return;
1070 }
1071
1072 my $i = 0;
1073 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1074 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1075 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1076 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1077 } else {
1078 $i++;
1079 }
1080 }
1081}
1082
1083sub delete_all_metadata {
1084 my $self = shift (@_);
1085 my ($section) = @_;
1086
1087 my $section_ptr = $self->_lookup_section($section);
1088 if (!defined $section_ptr) {
[8716]1089 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
[1241]1090 return;
1091 }
1092
1093 $section_ptr->{'metadata'} = [];
1094}
1095
[4]1096sub set_metadata_element {
1097 my $self = shift (@_);
1098 my ($section, $field, $value) = @_;
1099
[97]1100 $self->set_utf8_metadata_element ($section, $field,
[1870]1101 &unicode::ascii2utf8(\$value));
[73]1102}
1103
1104# set_utf8_metadata_element assumes the text has already been
1105# converted to the UTF-8 encoding.
1106sub set_utf8_metadata_element {
1107 my $self = shift (@_);
1108 my ($section, $field, $value) = @_;
[25557]1109
[4]1110 $self->delete_metadata ($section, $field);
[73]1111 $self->add_utf8_metadata ($section, $field, $value);
[4]1112}
1113
1114
[73]1115# add_metadata assumes the text is in (extended) ascii form. For
[8220]1116# text which has already been converted to the UTF-8 format use
[73]1117# add_utf8_metadata.
[4]1118sub add_metadata {
1119 my $self = shift (@_);
1120 my ($section, $field, $value) = @_;
[25557]1121
[97]1122 $self->add_utf8_metadata ($section, $field,
[1870]1123 &unicode::ascii2utf8(\$value));
[73]1124}
1125
1126sub add_utf8_metadata {
1127 my $self = shift (@_);
1128 my ($section, $field, $value) = @_;
[25557]1129
[33171]1130 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
[25557]1131 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1132 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
[73]1133
[4]1134 my $section_ptr = $self->_lookup_section($section);
1135 if (!defined $section_ptr) {
[8716]1136 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
[4]1137 return;
1138 }
[1732]1139 if (!defined $value) {
1140 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1141 return;
1142 }
1143 if (!defined $field) {
1144 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1145 return;
1146 }
[4]1147
[8894]1148 #print STDERR "###$field=$value\n";
[22950]1149
1150 # For now, supress this check. Given that text data read in is now
1151 # Unicode aware, then the following block of code can (ironically enough)
1152 # cause our unicode compliant string to be re-encoded (leading to
1153 # a double-encoded UTF-8 string, which we definitely don't want!).
1154
1155
[7798]1156 # double check that the value is utf-8
[25557]1157 # if (!&unicode::check_is_utf8($value)) {
1158 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1159 # &unicode::ensure_utf8(\$value);
1160 # print STDERR " Tried converting to utf8: $value\n";
1161 # }
[7798]1162
[25557]1163 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
[26850]1164 if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
[25557]1165 {
[26850]1166 my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1167 if (defined $mdprefix) {
1168 # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1169 push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1170 }
1171
[25557]1172 my $direction;
[34276]1173 # TODO does the following work, with eq latitude? the above code implies can be xxxxx.Latitude. should use metaname??
[25557]1174 if($value =~ m/^-/)
1175 {
1176 $direction = ($field eq "Latitude") ? "S" : "W";
1177 }
1178 else
1179 {
1180 $direction = ($field eq "Latitude") ? "N" : "E";
1181 }
1182
1183 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1184 if(defined $beforeDec && defined $afterDec)
1185 {
1186 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1187 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1188
1189 for(my $i = 2; $i <= 4; $i++)
1190 {
1191 if(length($afterDec) >= $i)
1192 {
1193 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1194 }
1195 }
[34276]1196
1197 # what is this???? completely does nothing...
[25557]1198 #Only add the metadata if it has not already been added
[33217]1199 my $metaMap = $self->get_metadata_hashmap($section); # metaMap not used and called function has no apparent side-effects,
1200 # but this line appears important for ensuring uniqueness of (Latitude, value) meta
1201 # in the section. Also for LatShort, Longitude, LngShort.
[25557]1202 }
[34276]1203
1204 # if we have both Latitude and Longitude, then add coordinate metadata (not sure why, but someone else added it)
1205 my $latitude;
1206 my $longitude;
[33126]1207 if($field =~ m/^(.+\.)?Longitude$/) {
[34276]1208 # if we are dealing with Longitude meta, and we have already seen Latitude, then add Coordinate metadata
1209 $latitude = $self->get_metadata_element ($section, "Latitude");
1210 if (defined $latitude) {
1211 $longitude = $value;
1212 }
1213 } else {
1214 # we are dealing with Latitude - if we have already seen Longitude, then add both as Coordinate metadata
1215 $longitude = $self->get_metadata_element ($section, "Longitude");
1216 if (defined $longitude) {
1217 $latitude = $value;
1218 }
[33126]1219 }
[34276]1220 if (defined $latitude && defined $longitude) {
1221 ##rint STDERR "adding coords for $section, $latitude, $longitude\n";
1222 $self->processCoordinate($section, $latitude, $longitude);
1223 } else {
1224 ##rint STDERR "lat or long not defined, not adding coords for $section\n";
1225 }
1226
[25557]1227 }
[33302]1228
1229 elsif($field eq "GPS.mapOverlay") { # then the $value is a JSON string
[25557]1230
[33313]1231 # In order to allow searching map-data-enriched documents by map shape descriptions,
[33302]1232 # and to run rawquery searches for other docs by proximity based on their map data,
1233 # need to store the shape descriptions and Coordinate info for shapes into the text index.
1234 # We add the description for each shape in the mapoverlay into the text index as GPSMapOverlayLabel
1235 # And we add Coordinate (CD) and CoordShort (CS) info for each shape in the mapoverlay in the format (Lat, Lng) as ("37S339 175E342")
1236 # where the digits before the N/S/E/W direction represents the whole number. And the digits after the direction are the
[33313]1237 # decimal places which can vary between 0 and 2 to 4 digits.
[33302]1238
[33313]1239 # However, we only want to process GPS.mapOverlay during buildcol and only in the text indexing passes (e.g. sidx and didx for lucene)
1240 # and certainly never during the infodb pass of buildcol. The latter can end up duplicating Coordinate/CoordShort/GPSMapOverlayLabel
1241 # when rebuilding with the online doc editor, as that runs incremental-rebuild which then calls basebuilder::reconstruct_doc_objs_metadata()
[33302]1242 # on all docs NOT being incrementally rebuilt. That call would get meta from the infodb and use it to reconstruct the doc objects of docs
[33313]1243 # NOT being incrementally built. If the Coord and Label meta were written to the infodb, they would then be loaded back in memory when the
1244 # collection is incrementally rebuilt for those docs that don't need incremental processing. Then this function would once again add the
1245 # same meta into the infodb, thus duplicating what goes into the infodb. Hence, don't do all the following if doc::processor_mode =~ "infodb",
1246 # or anything other than a text mode.
[33217]1247
[33302]1248 # Note that for incremental rebuilding, the text pass can be called textreindex for instance (and infodb pass can be incinfodb).
1249 # So don't check for exact string match
1250
[33312]1251 if($doc::cmd_line_mode eq "buildcol" && $doc::processor_mode =~ m/^text/) { # currently known text processor_modes:
1252 # text, textreindex, possibly textdelete (see ArchivesInfPlugin.pm::read() for last 2).
1253 # OR: ..."buildcol" && $doc::processor_mode !~ m/infodb$/) # if dummy pass important
[33171]1254
[33302]1255 ###print STDERR "GPS.mapOverlay before decoding, val = " . $value . "\n";
[33125]1256
[33302]1257 # TODO: html decode?
[33125]1258 $value =~ s@&#091;@[@g;
1259 $value =~ s@&#093;@]@g;
1260 $value =~ s@&quot;@"@g;
[33302]1261 ###print STDERR "GPS.mapOverlay after decoding, val = " . $value . "\n";
[33125]1262
[33293]1263 my $json_array = decode_json $value;
[33302]1264
1265 foreach my $shape (@$json_array) {
[33125]1266
[33302]1267 # Put each available shape description/label into this section's metadata with GPSMapOverlayLabel as metaname.
1268 # Just as for Coordinate meta, don't need to know which shape a label belongs too. This is just so each label
1269 # will be indexed, and therefore can be searched.
1270
1271 my $description = $shape->{"description"};
1272 if($description) {
1273 push (@{$section_ptr->{'metadata'}}, ["GPSMapOverlayLabel", $description]);
1274 ###print STDERR "@@@@############################################ Just added description meta: " . $description . "\n";
1275 }
1276
[33293]1277 my $type = $shape->{"type"};
[33302]1278 ###print STDERR "Shape type : " . $type . "\n";
[33293]1279
1280 if($type eq "circle") {
[33302]1281 ###print STDERR "Found a circle:\n" . &printShape($json, $shape);
[33293]1282
1283 # work out bounding box
1284 # want the inverse of this useful page:
1285 # https://stackoverflow.com/questions/639695/how-to-convert-latitude-or-longitude-to-meters
1286 # https://www.geodatasource.com/developers/javascript
[33125]1287
[33293]1288 # Dr Bainbridge wants us to use: https://gis.stackexchange.com/questions/5821/calculating-latitude-longitude-x-miles-from-point
1289 # Since for N,E,S,W the following works out the same as above link, we're using the rule of thumb in the following:
1290 # https://gis.stackexchange.com/questions/2951/algorithm-for-offsetting-a-latitude-longitude-by-some-amount-of-meters
1291 # which states
1292 # "If your displacements aren't too great (less than a few kilometers) and you're not right at the poles,
1293 # use the quick and dirty estimate that 111,111 meters (111.111 km) in the y direction is 1 degree
1294 # (of latitude) and 111,111 * cos(latitude) meters in the x direction is 1 degree (of longitude)."
1295 my $centre_lat = $shape->{"center"}->{"lat"};
1296 my $centre_lng = $shape->{"center"}->{"lng"};
1297 my $radius = $shape->{"radius"}; # in metres!
1298
[33302]1299 ###print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
[33293]1300
1301 my $lat_north = $centre_lat + ($radius/111111);
1302 my $lat_south = $centre_lat - ($radius/111111);
1303
[33302]1304 ###print STDERR "### lat_north: $lat_north\n";
1305 ###print STDERR "### lat_south: $lat_south\n";
[33125]1306
[33293]1307 # our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
1308 # all expect their angle to be in radians. So need to convert from degree to radians before we can take the cos of it.
1309 my $centre_lat_radians = $self->degreesToRadians($centre_lat);
1310 my $cos_in_radians = cos($centre_lat_radians);
[33302]1311 ###print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
[33293]1312 my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
1313 my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
[33302]1314 ###print STDERR "### lng_east $lng_east\n";
1315 ###print STDERR "### lng_west $lng_west\n";
[33293]1316
1317 my $cos_lat = cos($centre_lat);
[33302]1318 ###print STDERR "cos $centre_lat is $cos_lat\n";
[33293]1319
1320 $self->processCoordinate($section, $lat_north, $lng_east);
1321 $self->processCoordinate($section, $lat_south, $lng_east);
1322 $self->processCoordinate($section, $lat_south, $lng_west);
1323 $self->processCoordinate($section, $lat_north, $lng_west);
1324
1325 }
1326 elsif ($type eq "marker") {
[33302]1327 ###print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
1328 ###print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
[33293]1329 $self->processCoordinate($section, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});
1330 }
1331 elsif ($type eq "polyline" || $type eq "polygon") {
1332 my $path_array = $shape->{"path"};
1333 foreach my $position (@$path_array) {
1334 $self->processCoordinate($section, $position->{"lat"}, $position->{"lng"});
1335 }
1336 }
1337 elsif ($type eq "rectangle") {
1338
1339 my $bounds = $shape->{"bounds"};
1340 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"east"});
1341 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"east"});
1342 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"west"});
1343 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"west"});
1344 }
1345
[33125]1346 } # end for on each shape in GPS.mapOverlay
[33302]1347 } # end if(buildcol and text pass)
[33125]1348 } # end GPS.mapOverlay meta
1349
[4]1350 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1351}
1352
[33125]1353# https://en.wikipedia.org/wiki/Radian
1354sub degreesToRadians
1355{
1356 my $self = shift (@_);
1357 my ($degrees) = @_;
[4]1358
[33125]1359 return $degrees * pi /180; # returns radians
1360}
1361
1362sub radiansToDegrees
1363{
1364 my $self = shift (@_);
1365 my ($radians) = @_;
1366
1367 return $radians * 180 / pi; # returns degrees
1368}
[33293]1369
[33302]1370# Call as:
1371# my $json = JSON->new->allow_nonref;
1372# &printAllShapes($json, $json_array);
[33293]1373sub printAllShapes {
1374 my ($json, $json_array) = @_;
1375
1376
1377 #my $pretty_print_shape = $json->pretty->encode( $json_array->[0] );
1378 foreach my $shape (@$json_array) {
1379 my $pretty_print_shape = $json->pretty->encode( $shape );
1380 print STDERR "Shape: $pretty_print_shape\n";
1381 }
1382
[33185]1383}
[33125]1384
[33217]1385# For the (lat, lng) coordinate given,
1386# attaches Coordinate and multiple CoordShort (different precision level) metadata to the doc object
[33125]1387sub processCoordinate {
1388 my $self = shift (@_);
[33217]1389 my ($section, $latitude, $longitude) = @_;
[33125]1390
[33128]1391 my $section_ptr = $self->_lookup_section($section);
1392
[33125]1393 my $lat_direction = ($latitude =~ m/^-/) ? "S" : "N";
1394 my $lng_direction = ($longitude =~ m/^-/) ? "W" : "E";
1395
1396 # have to store (lat, lng) in pairs, when there are so many coords to store
1397 #push (@{$section_ptr->{'metadata'}}, ["Latitude", $latitude]);
1398 #push (@{$section_ptr->{'metadata'}}, ["Longitude", $longitude]);
1399
1400 push (@{$section_ptr->{'metadata'}}, ["Coordinate", "$latitude $longitude"]); # "$latitude$lat_direction $longitude$lng_direction"
1401
1402 my ($latBeforeDec, $latAfterDec);
1403 my ($lngBeforeDec, $lngAfterDec);
1404
1405 if($latitude !~ m/\./) {
1406 $latBeforeDec = $latitude;
1407 $latAfterDec = "";
1408 } else {
1409 ($latBeforeDec, $latAfterDec) = ($latitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1410 }
1411 if($longitude !~ m/\./) {
1412 $lngBeforeDec = $longitude;
1413 $lngAfterDec = "";
1414 } else {
1415 ($lngBeforeDec, $lngAfterDec) = ($longitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1416 }
1417
1418
[33217]1419 my $name = "CoordShort";
1420 push (@{$section_ptr->{'metadata'}}, [$name, "$latBeforeDec$lat_direction $lngBeforeDec$lng_direction"]);
1421
[33185]1422 for(my $i = 2; $i <= 4; $i++)
1423 {
1424 my $latDecPlaces = (length($latAfterDec) >= $i) ? substr($latAfterDec, 0, $i) : "";
1425 my $lngDecPlaces = (length($lngAfterDec) >= $i) ? substr($lngAfterDec, 0, $i) : "";
1426
[33217]1427 push (@{$section_ptr->{'metadata'}}, [$name,
1428 $latBeforeDec . $lat_direction. $latDecPlaces . " " . $lngBeforeDec . $lng_direction. $lngDecPlaces]);
[33185]1429
1430 }
1431
1432 #Only add the metadata if it has not already been added
[33217]1433 #my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no apparent side-effects.)
[33125]1434}
1435
1436
[4]1437# methods for dealing with text
1438
[1241]1439# returns the text for a section
1440sub get_text {
1441 my $self = shift (@_);
1442 my ($section) = @_;
1443
1444 my $section_ptr = $self->_lookup_section($section);
1445 if (!defined $section_ptr) {
1446 print STDERR "doc::get_text couldn't find section " .
1447 "$section\n";
1448 return "";
1449 }
1450
1451 return $section_ptr->{'text'};
1452}
1453
1454# returns the (utf-8 encoded) length of the text for a section
1455sub get_text_length {
1456 my $self = shift (@_);
1457 my ($section) = @_;
1458
1459 my $section_ptr = $self->_lookup_section($section);
1460 if (!defined $section_ptr) {
1461 print STDERR "doc::get_text_length couldn't find section " .
1462 "$section\n";
1463 return 0;
1464 }
1465
1466 return length ($section_ptr->{'text'});
1467}
1468
[23131]1469# returns the total length for all the sections
1470sub get_total_text_length {
1471 my $self = shift (@_);
1472
1473 my $section = $self->get_top_section();
1474 my $length = 0;
1475 while (defined $section) {
1476 $length += $self->get_text_length($section);
1477 $section = $self->get_next_section($section);
1478 }
1479 return $length;
1480}
1481
[1241]1482sub delete_text {
1483 my $self = shift (@_);
1484 my ($section) = @_;
1485
1486 my $section_ptr = $self->_lookup_section($section);
1487 if (!defined $section_ptr) {
1488 print STDERR "doc::delete_text couldn't find section " .
1489 "$section\n";
1490 return;
1491 }
1492
1493 $section_ptr->{'text'} = "";
1494}
1495
[73]1496# add_text assumes the text is in (extended) ascii form. For
1497# text which has been already converted to the UTF-8 format
1498# use add_utf8_text.
[4]1499sub add_text {
1500 my $self = shift (@_);
1501 my ($section, $text) = @_;
1502
[73]1503 # convert the text to UTF-8 encoded unicode characters
1504 # and add the text
[1870]1505 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
[73]1506}
1507
1508
1509# add_utf8_text assumes the text to be added has already
1510# been converted to the UTF-8 encoding. For ascii text use
1511# add_text
[32590]1512# Pass by value version (internally calls pass by ref version
1513# to avoid code duplication)
[73]1514sub add_utf8_text {
1515 my $self = shift (@_);
1516 my ($section, $text) = @_;
[32590]1517
1518 $self->add_utf8_textref($section, \$text);
1519}
[73]1520
[32590]1521# Pass by reference version, used by GreenstoneSQLPlugin for fulltext
1522sub add_utf8_textref {
1523 my $self = shift (@_);
1524 my ($section, $text_ref) = @_;
1525
[4]1526 my $section_ptr = $self->_lookup_section($section);
1527 if (!defined $section_ptr) {
[32590]1528 print STDERR "doc::add_utf8_textref couldn't find section " .
[4]1529 "$section\n";
1530 return;
1531 }
1532
[32590]1533 $section_ptr->{'text'} .= $$text_ref;
[4]1534}
1535
[16950]1536# returns the Source meta, which is the utf8 filename generated.
1537# Added a separate method here for convenience
1538sub get_source {
[16924]1539 my $self = shift (@_);
[16950]1540 return $self->get_metadata_element ($self->get_top_section(), "Source");
1541}
1542
1543# returns the SourceFile meta, which is the url reference to the URL-encoded
1544# version of Source (the utf8 filename). Added a separate method here for convenience
1545sub get_sourcefile {
1546 my $self = shift (@_);
1547 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1548}
1549
1550# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1551# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1552sub get_assocfile_from_sourcefile {
1553 my $self = shift (@_);
[16924]1554
1555 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1556 my $top_section = $self->get_top_section();
1557 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
[4]1558
[16924]1559 # get the actual filename as it exists on the filesystem which this url refers to
[16928]1560 $source_file = &unicode::url_to_filename($source_file);
[16924]1561 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1562 return $assocfilename;
1563}
1564
[1241]1565# methods for dealing with associated files
1566
1567# a file is associated with a document, NOT a section.
1568# if section is defined it is noted in the data structure
1569# only so that files associated from a particular section
1570# may be removed later (using delete_section_assoc_files)
1571sub associate_file {
1572 my $self = shift (@_);
1573 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1574 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1575
1576 # remove all associated files with the same name
1577 $self->delete_assoc_file ($assoc_filename);
[23413]1578
1579 # Too harsh a requirement
1580 # Definitely get HTML docs, for example, with some missing
1581 # support files
1582# if (!&util::fd_exists($real_filename)) {
1583# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1584# exit -1;
1585# }
1586
[23387]1587# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1588# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1589## my $utf8_filename = Encode::encode("utf8",$filename);
1590
[1241]1591 push (@{$self->{'associated_files'}},
1592 [$real_filename, $assoc_filename, $mime_type, $section]);
1593}
1594
1595# returns a list of associated files in the form
1596# [[real_filename, assoc_filename, mimetype], ...]
1597sub get_assoc_files {
1598 my $self = shift (@_);
1599
1600 return $self->{'associated_files'};
1601}
1602
[20775]1603# the following two methods used to keep track of original associated files
1604# for incremental building. eg a txt file used by an item file does not end
1605# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1606# file for incremental build
1607sub associate_source_file {
1608 my $self = shift (@_);
1609 my ($full_filename) = @_;
[19494]1610
[20775]1611 push (@{$self->{'source_assoc_files'}}, $full_filename);
1612
1613}
1614
1615sub get_source_assoc_files {
1616 my $self = shift (@_);
1617
1618 return $self->{'source_assoc_files'};
1619
1620
1621}
[19494]1622sub metadata_file {
1623 my $self = shift (@_);
1624 my ($real_filename, $filename) = @_;
1625
1626 push (@{$self->{'metadata_files'}},
1627 [$real_filename, $filename]);
1628}
1629
[21566]1630# used for writing out the archiveinf-doc info database, to list all the metadata files
[19494]1631sub get_meta_files {
1632 my $self = shift (@_);
1633
1634 return $self->{'metadata_files'};
1635}
1636
[1241]1637sub delete_section_assoc_files {
1638 my $self = shift (@_);
1639 my ($section) = @_;
1640
1641 my $i=0;
1642 while ($i < scalar (@{$self->{'associated_files'}})) {
1643 if (defined $self->{'associated_files'}->[$i]->[3] &&
1644 $self->{'associated_files'}->[$i]->[3] eq $section) {
1645 splice (@{$self->{'associated_files'}}, $i, 1);
1646 } else {
1647 $i++;
1648 }
1649 }
1650}
1651
1652sub delete_assoc_file {
1653 my $self = shift (@_);
1654 my ($assoc_filename) = @_;
1655
1656 my $i=0;
1657 while ($i < scalar (@{$self->{'associated_files'}})) {
1658 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1659 splice (@{$self->{'associated_files'}}, $i, 1);
1660 } else {
1661 $i++;
1662 }
1663 }
1664}
1665
1666sub reset_nextsection_ptr {
1667 my $self = shift (@_);
1668 my ($section) = @_;
1669
1670 my $section_ptr = $self->_lookup_section($section);
1671 $section_ptr->{'next_subsection'} = 1;
1672}
1673
[4]16741;
Note: See TracBrowser for help on using the repository browser.