Context Navigation

source: main/trunk/greenstone2/perllib/doc.pm@ 35534

Last change on this file since 35534 was 34506, checked in by ak19, 4 years ago
Redoing work of commit revision 34394: Redoing Bugfix 1 for GLI doc.xml metadata slowdown resulting from earlier bugfix to help GLI cope with filenames and assigned meta that have non-ASCII chars in them. The slowdown happened when gathered files got selected in GLI and was fixed in commit 34394, but the fix was not ideal for 2 reasons. 1. A new form of filename encoding (hexed unicode) going into doc.xml, instead of existing encodings like URL and base64, though those existing encodings weren't the right ones for my first solution. 2. The solution was specific to Windows to cope with special chars in filenames and relied on a new meta field gsdlfullsourcepath being written out to doc.xml by doc.pm. So a built collection moved from Linux to Windows won't show up doc.xml meta in GLI, as it won't have the new doc.xml meta field that Windows is expecting. Have a better solution for 1 that doesn't require the new field. But still can't fix all of point 2, as the existing gsdlsourcefilename meta field in doc.xml can contain Windows Short filenames when the coll is built on Windows and this won't be backwards compatible on Linux anyway. This problem existed before too, except I didn't realise it until now. But the new solution fixes more issues. First step: undoing doc.pm adding new metadata field gsdlfullsourcepath.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 52.8 KB

Line
1	###########################################################################
2	#
3	# doc.pm --
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 1999 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistr te it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# base class to hold documents
27
28	package doc;
29	eval {require bytes};
30
31	BEGIN {
32	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33	unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34	}
35
36	use strict;
37	use unicode;
38	use util;
39	use FileUtils;
40	use ghtml;
41	use File::stat;
42	##use hashdoc;
43	use docprint;
44	use JSON;
45
46	# We just need pi from the Trig lib
47	# Import constants pi2, pip2, pip4 (2*pi, pi/2, pi/4).
48	# But doing
49	# use Math::Trig ':pi';
50	# resulted in errors on 32 bit linux VM with perl v8 of "bareword pi not allowed while strict subs in use"
51	# So, now doing just
52	use Math::Trig;
53
54	# the document type may be indexed_doc, nonindexed_doc, or
55	# classification
56
57	our $OIDcount = 0;
58
59
60	# Declare the Package (level) Variable that will tell us whether we're in the import or buildcol phase.
61	# This var will be set by import.pl, buildcol.pl and export.pl to the then appropriate value.
62	# https://perlmaven.com/package-variables-and-lexical-variables-in-perl
63	# But: https://stackoverflow.com/questions/7542120/perl-how-to-make-variables-from-requiring-script-available-in-required-script
64	# https://stackoverflow.com/questions/33230015/how-do-i-use-a-variable-in-another-perl-script
65	#$doc::cmd_line_mode = undef;
66	# Decided to follow the usage of $OIDcount above to declare the necessary package-level variables 'globally' accessible
67	# actually accessible only for those packages that import this one (with 'use doc')
68	my $cmd_line_mode = undef;
69
70	# processor_mode keeps track of which buildcol pass we're at: dummy, text (sidx/didx passes) or infodb
71	my $processor_mode = undef;
72
73	# rename_method can be 'url', 'none', 'base64'
74	sub new {
75	my $class = shift (@_);
76	my ($source_filename, $doc_type, $rename_method) = @_;
77
78
79	my $self = bless {'associated_files'=>[],
80	'subsection_order'=>[],
81	'next_subsection'=>1,
82	'subsections'=>{},
83	'metadata'=>[],
84	'text'=>"",
85	'OIDtype'=>"hash"}, $class;
86
87	# used to set lastmodified here, but this can screw up the HASH ids, so
88	# the docsave processor now calls set_lastmodified
89
90	$self->set_source_path($source_filename);
91
92	if (defined $source_filename) {
93	$source_filename = &util::filename_within_collection($source_filename);
94	print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
95	$self->set_source_filename ($source_filename, $rename_method);
96	}
97
98	$self->set_doc_type ($doc_type) if defined $doc_type;
99
100	return $self;
101	}
102
103
104	sub set_source_path
105	{
106	my $self = shift @_;
107	my ($source_filename) = @_;
108
109	if (defined $source_filename) {
110	# On Windows the source_filename can be in terse DOS format
111	# e.g. test~1.txt
112
113	$self->{'terse_source_path'} = $source_filename;
114
115	# Use the FileUtil library methods as they are aware of more special
116	# cases such as HDFS [jmt12]
117	if (&FileUtils::fileExists($source_filename))
118	{
119	# See if we can do better for Windows with a filename
120	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
121	require Win32;
122	$self->{'source_path'} = Win32::GetLongPathName($source_filename);
123	}
124	else {
125	# For Unix-based systems, there is no difference between the two
126	$self->{'source_path'} = $source_filename;
127	}
128
129	## Unused.
130	# On Windows, the code above has ensured source_path is the Win long (full) path name.
131	# To help GLI associate DocXML metadata with an easily calculated and accurate representation
132	# of filenames, we now store the Win long path name, hex encoded.
133	# We're not using this field on Linux, as I can't get the hex encodings generated to match
134	# what GLI Java code generates. But for symmetry we store this field on Unix too, but we need
135	# to hex-encode source_path on Unix too, or it may not be UTF-8 and doc.xml will be invalid
136	##my $hexencodedlongsourcepath = &unicode::debug_unicode_string($self->{'source_path'});
137	##$self->set_utf8_metadata_element ($self->get_top_section(), "gsdlfullsourcepath", $hexencodedlongsourcepath);
138
139	}
140	else {
141	print STDERR "Warning: In doc::set_source_path(), file\n";
142	print STDERR " $source_filename\n";
143	print STDERR " does not exist\n";
144
145	# (default) Set it to whatever we were given
146	$self->{'source_path'} = $source_filename;
147	}
148	}
149	else {
150	# Previous code for setting source_path allowed for
151	# it to be undefined, so continue this practice
152	$self->{'terse_source_path'} = undef;
153	$self->{'source_path'} = undef;
154	}
155	}
156
157
158	sub get_source_path
159	{
160	my $self = shift @_;
161
162	return $self->{'terse_source_path'};
163	}
164
165	# set lastmodified for OAI purposes, added by GRB, moved by kjdon
166	sub set_oailastmodified {
167	my $self = shift (@_);
168
169	my $source_path = $self->{'terse_source_path'};
170
171	if (defined $source_path && (-e $source_path)) {
172	my $current_time = time;
173
174	my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
175	$wday, $yday, $isdst) = localtime($current_time);
176
177	my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
178
179	$self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
180	$self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
181	}
182	}
183
184	# no longer used for OAI purposes, since lastmodified is not what we want as the
185	# Datestamp of a document. This doc metadata may be useful for general purposes.
186	sub set_lastmodified {
187	my $self = shift (@_);
188
189	my $source_path = $self->{'terse_source_path'};
190
191	if (defined $source_path && (-e $source_path)) {
192
193	my $file_stat = stat($source_path);
194	my $mtime = $file_stat->mtime;
195	my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
196	$wday, $yday, $isdst) = localtime($mtime);
197
198	my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
199
200	$self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
201	$self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
202	}
203	}
204
205	# clone the $self object
206	sub duplicate {
207	my $self = shift (@_);
208
209	my $newobj = {};
210
211	foreach my $k (keys %$self) {
212	$newobj->{$k} = &clone ($self->{$k});
213	}
214
215	bless $newobj, ref($self);
216	return $newobj;
217	}
218
219	sub clone {
220	my ($from) = @_;
221	my $type = ref ($from);
222
223	if ($type eq "HASH") {
224	my $to = {};
225	foreach my $key (keys %$from) {
226	$to->{$key} = &clone ($from->{$key});
227	}
228	return $to;
229	} elsif ($type eq "ARRAY") {
230	my $to = [];
231	foreach my $v (@$from) {
232	push (@$to, &clone ($v));
233	}
234	return $to;
235	} else {
236	return $from;
237	}
238	}
239
240	sub set_OIDtype {
241	my $self = shift (@_);
242	my ($type, $metadata) = @_;
243
244	if (defined $type && $type =~ /^(hash\|hash_on_file\|hash_on_ga_xml\|hash_on_full_filename\|incremental\|filename\|dirname\|full_filename\|assigned)$/) {
245	$self->{'OIDtype'} = $type;
246	} else {
247	$self->{'OIDtype'} = "hash";
248	}
249
250	if ($type =~ /^assigned$/) {
251	if (defined $metadata) {
252	$self->{'OIDmetadata'} = $metadata;
253	} else {
254	$self->{'OIDmetadata'} = "dc.Identifier";
255	}
256	}
257	}
258
259	# rename_method can be 'url', 'none', 'base64'
260	sub set_source_filename {
261	my $self = shift (@_);
262	my ($source_filename, $rename_method) = @_;
263
264	# Since the gsdlsourcefilename element goes into the doc.xml it has
265	# to be utf8. However, it should also represent the source filename
266	# (in the import directory) which may not be utf8 at all.
267	# For instance, if this meta element (gsdlsourcefilename) will be used
268	# by other applications that parse doc.xml in order to locate
269	# gsdlsourcefilename. Therefore, the solution is to URLencode or base64
270	# encode the real filename as this is a binary-to-text encoding meaning
271	# that the resulting string is ASCII (utf8). Decoding will give the original.
272
273	# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
274	## print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename\n";
275
276	# URLencode just the gsdl_source_filename, not the directory. Then prepend dir
277	$source_filename = $self->encode_filename($source_filename, $rename_method);
278	# my ($srcfilename,$dirname,$suffix)
279	# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
280	# print STDERR "-> $srcfilename -> ";
281	# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
282	# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
283	# print STDERR "$source_filename\n";
284
285	## print STDERR "AFTER URL/base64 encoding gsdlsourcefilename: $source_filename\n";
286
287	$self->set_utf8_metadata_element ($self->get_top_section(),
288	"gsdlsourcefilename",
289	$source_filename);
290
291	# If we set the file renaming method, at least GLI can decode gsdlsourcefilename to get the
292	# original filename (relative path to file) back for GLI to notice what ex Meta is associated with it
293	$self->set_utf8_metadata_element($self->get_top_section(),
294	"gsdlsourcefilerenamemethod",
295	$rename_method);
296	}
297
298	sub encode_filename {
299	my $self = shift (@_);
300	my ($source_filename, $rename_method) = @_;
301
302	my ($srcfilename,$dirname,$suffix)
303	= &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
304	# print STDERR "-> $srcfilename -> ";
305	$srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
306
307	# encode any subdirs of "import" also, but not import itself
308	my $sep = ($ENV{'GSDLOS'} =~ /^windows$/i) ? "\\" : "/";
309	my @dirs = ($sep eq "\\") ? split(/[\\\/]+/, $dirname) : split(/\//, $dirname);
310
311	my $dirpath = "";
312
313	# Don't encode the first folder ("import" or "tmp"): GLI's DocXMLFile.java looks for literal "import"
314	# or "tmp" before it knows what the file rename method is that is to be used decode the rest of gsdlsourcefilename.
315	if(scalar (@dirs) >= 1) {
316	$dirpath = shift(@dirs);
317	}
318	foreach my $subdir (@dirs) {
319	#print STDERR "@@@@ Found subdir: $subdir\n";
320
321	$subdir = &util::rename_file($subdir, $rename_method);
322	#print STDERR "@@@@ encoded subdir: $subdir\n";
323
324	$dirpath = &FileUtils::filenameConcatenate($dirpath, $subdir);
325	}
326	if($dirpath ne "") {
327	$dirname = $dirpath;
328	}
329
330	$source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
331
332	return $source_filename;
333	}
334
335	sub set_converted_filename {
336	my $self = shift (@_);
337	my ($converted_filename) = @_;
338
339	# we know the converted filename is utf8
340	$self->set_utf8_metadata_element ($self->get_top_section(),
341	"gsdlconvertedfilename",
342	$converted_filename);
343	}
344
345	# returns the source_filename as it was provided
346	sub get_unmodified_source_filename {
347	my $self = shift (@_);
348
349	return $self->{'terse_source_path'};
350	}
351
352	# returns the source_filename with whatever rename_method was given
353	sub get_source_filename {
354	my $self = shift (@_);
355
356	return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
357	}
358
359
360
361	# returns converted filename if available else returns source filename
362	sub get_filename_for_hashing {
363	my $self = shift (@_);
364
365	my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
366
367	if (!defined $filename) {
368	my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
369	# if NULPlug processed file, then don't give a filename
370	if (defined $plugin_name && $plugin_name eq "NULPlug") {
371	$filename = undef;
372	} else { # returns the URL encoded source filename!
373	$filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
374	}
375	}
376
377	if (!&FileUtils::isFilenameAbsolute($filename)) {
378	$filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
379	}
380
381	return $filename;
382	}
383
384	sub set_doc_type {
385	my $self = shift (@_);
386	my ($doc_type) = @_;
387
388	$self->set_metadata_element ($self->get_top_section(),
389	"gsdldoctype",
390	$doc_type);
391	}
392
393	# returns the gsdldoctype as it was provided
394	# the default of "indexed_doc" is used if no document
395	# type was provided
396	sub get_doc_type {
397	my $self = shift (@_);
398
399	my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
400	return $doc_type if (defined $doc_type);
401	return "indexed_doc";
402	}
403
404
405	# look up the reference to the a particular section
406	sub _lookup_section {
407	my $self = shift (@_);
408	my ($section) = @_;
409
410	my ($num);
411	my $sectionref = $self;
412
413	while (defined $section && $section ne "") {
414
415	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
416
417	$num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
418
419	$section = "" unless defined $section;
420
421
422	if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
423	$sectionref = $sectionref->{'subsections'}->{$num};
424	} else {
425	return undef;
426	}
427	}
428
429	return $sectionref;
430	}
431
432	# calculate OID by hashing the contents of the document
433	sub _calc_OID {
434	my $self = shift (@_);
435	my ($filename) = @_;
436
437
438	my $osexe = &util::get_os_exe();
439
440	my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
441	$ENV{'GSDLOS'},"hashfile$osexe");
442
443	&util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
444	# The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
445
446	# A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
447	# (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
448	# E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
449
450	my $result = "NULL";
451
452
453	if (-e "$hashfile_exe") {
454	# $result = `\"$hashfile_exe\" \"$filename\"`;
455	# $result = `hashfile$osexe \"$filename\" 2>&1`;
456	$result = `hashfile$osexe \"$filename\"`;
457
458	($result) = $result =~ /:\s*([0-9a-f]+)/i;
459	} else {
460	print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
461	}
462	return "HASH$result";
463	}
464
465	# methods dealing with OID, not groups of them.
466
467	# if $OID is not provided one is calculated
468	sub set_OID {
469	my $self = shift (@_);
470	my ($OID) = @_;
471
472	my $use_hash_oid = 0;
473	# if an OID wasn't provided calculate one
474	if (!defined $OID) {
475	$OID = "NULL";
476	if ($self->{'OIDtype'} =~ /^hash/) {
477	$use_hash_oid = 1;
478	} elsif ($self->{'OIDtype'} eq "incremental") {
479	$OID = "D" . $OIDcount;
480	$OIDcount ++;
481	} elsif ($self->{'OIDtype'} eq "filename") {
482	my $filename = $self->get_source_filename();
483	$OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
484	$OID = &util::tidy_up_oid($OID);
485	} elsif ($self->{'OIDtype'} eq "full_filename") {
486	my $source_filename = $self->get_source_filename();
487	my $dirsep = &util::get_os_dirsep();
488
489	$source_filename =~ s/^import$dirsep//;
490	$source_filename =~ s/$dirsep/-/g;
491	$source_filename =~ s/\./_/g;
492
493	$OID = $source_filename;
494	$OID = &util::tidy_up_oid($OID);
495	} elsif ($self->{'OIDtype'} eq "dirname") {
496	my $filename = $self->get_source_filename();
497	if (defined($filename)) { # && -e $filename) {
498	# get the immediate parent directory
499	$OID = &File::Basename::dirname($filename);
500	if (defined $OID) {
501	$OID = &File::Basename::basename($OID);
502	$OID = &util::tidy_up_oid($OID);
503	} else {
504	print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
505	$use_hash_oid = 1;
506	}
507	} else {
508	print STDERR "Failed to find a filename, generating hash id\n";
509	$use_hash_oid = 1;
510	}
511
512	} elsif ($self->{'OIDtype'} eq "assigned") {
513	my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
514	if (defined $identifier && $identifier ne "") {
515	$OID = $identifier;
516	$OID = &util::tidy_up_oid($OID);
517	} else {
518	# need a hash id
519	print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
520	$use_hash_oid = 1;
521	}
522
523	} else {
524	$use_hash_oid = 1;
525	}
526
527	if ($use_hash_oid) {
528	my $hash_on_file = 1;
529	my $hash_on_ga_xml = 0;
530
531	if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
532	$hash_on_file = 0;
533	$hash_on_ga_xml = 1;
534	}
535
536	if ($self->{'OIDtype'} eq "hash_on_full_filename") {
537	$hash_on_file = 0;
538	$hash_on_ga_xml = 0;
539
540	my $source_filename = $self->get_source_filename();
541	my $dirsep = &util::get_os_dirsep();
542
543	$source_filename =~ s/^import$dirsep//;
544	$source_filename =~ s/$dirsep/-/g;
545	$source_filename =~ s/\./_/g;
546
547	# If the filename is very short then (handled naively)
548	# this can cause conjestion in the hash-values
549	# computed, leading documents sharing the same leading
550	# Hex values in the computed has.
551	#
552	# The solution taken here is to replace the name of
553	# the file name a sufficient number of times (up to
554	# the character limit defined in 'rep_limit' and
555	# make that the content that is hashed on
556
557	# *** Think twice before changing the following value
558	# as it will break backward compatability of computed
559	# document HASH values
560
561	my $rep_limit = 256;
562	my $hash_content = undef;
563
564	if (length($source_filename)<$rep_limit) {
565	my $rep_string = "$source_filename\|";
566	my $rs_len = length($rep_string);
567
568	my $clone_times = int(($rep_limit-1)/$rs_len) +1;
569
570	$hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
571	}
572	else {
573	$hash_content = $source_filename;
574	}
575
576	my $filename = &util::get_tmp_filename();
577	if (!open (OUTFILE, ">:utf8", $filename)) {
578	print STDERR "doc::set_OID could not write to $filename\n";
579	} else {
580	print OUTFILE $hash_content;
581	close (OUTFILE);
582	}
583	$OID = $self->_calc_OID ($filename);
584
585	&FileUtils::removeFiles ($filename);
586	}
587
588	if ($hash_on_file) {
589	# "hash" OID - feed file to hashfile.exe
590	my $filename = $self->get_filename_for_hashing();
591
592	# -z: don't want to hash on the file if it is zero size
593	if (defined($filename) && -e $filename && !-z $filename) {
594	$OID = $self->_calc_OID ($filename);
595	} else {
596	$hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
597	}
598	}
599
600	if ($hash_on_ga_xml) {
601	# In addition being asked to explicity calculate the has based on the GA file,
602	# can also end up coming into this block is doing 'hash_on_file' but the file
603	# itself is of zero bytes (as could be the case with 'doc.nul' file
604
605	my $filename = &util::get_tmp_filename();
606	if (!open (OUTFILE, ">:utf8", $filename)) {
607	print STDERR "doc::set_OID could not write to $filename\n";
608	} else {
609	my $doc_text = &docprint::get_section_xml($self);
610	print OUTFILE $doc_text;
611	close (OUTFILE);
612	}
613	$OID = $self->_calc_OID ($filename);
614	&FileUtils::removeFiles($filename);
615	}
616	}
617	}
618	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
619	}
620
621	# this uses hashdoc (embedded c thingy) which is faster but still
622	# needs a little work to be sufficiently stable
623	sub ___set_OID {
624	my $self = shift (@_);
625	my ($OID) = @_;
626
627	# if an OID wasn't provided then calculate hash value based on document
628	if (!defined $OID)
629	{
630	my $hash_text = &docprint::get_section_xml($self);
631	my $hash_len = length($hash_text);
632
633	$OID = &hashdoc::buffer($hash_text,$hash_len);
634	}
635
636	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
637	}
638
639	# returns the OID for this document
640	sub get_OID {
641	my $self = shift (@_);
642	my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
643	return $OID if (defined $OID);
644	return "NULL";
645	}
646
647	sub delete_OID {
648	my $self = shift (@_);
649
650	$self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
651	}
652
653
654	# methods for manipulating section names
655
656	# returns the name of the top-most section (the top
657	# level of the document
658	sub get_top_section {
659	my $self = shift (@_);
660
661	return "";
662	}
663
664	# returns a section
665	sub get_parent_section {
666	my $self = shift (@_);
667	my ($section) = @_;
668
669	$section =~ s/(^\|\.)\d+$//;
670
671	return $section;
672	}
673
674	# returns the first child section (or the end child
675	# if there isn't any)
676	sub get_begin_child {
677	my $self = shift (@_);
678	my ($section) = @_;
679
680	my $section_ptr = $self->_lookup_section($section);
681	return "" unless defined $section_ptr;
682
683	if (defined $section_ptr->{'subsection_order'}->[0]) {
684	return "$section.$section_ptr->{'subsection_order'}->[0]";
685	}
686
687	return $self->get_end_child ($section);
688	}
689
690	# returns the next child of a parent section
691	sub get_next_child {
692	my $self = shift (@_);
693	my ($section) = @_;
694
695	my $parent_section = $self->get_parent_section($section);
696	my $parent_section_ptr = $self->_lookup_section($parent_section);
697	return undef unless defined $parent_section_ptr;
698
699	my ($section_num) = $section =~ /(\d+)$/;
700	return undef unless defined $section_num;
701
702	my $i = 0;
703	my $section_order = $parent_section_ptr->{'subsection_order'};
704	while ($i < scalar(@$section_order)) {
705	last if $section_order->[$i] eq $section_num;
706	$i++;
707	}
708
709	$i++; # the next child
710	if ($i < scalar(@$section_order)) {
711	return $section_order->[$i] if $parent_section eq "";
712	return "$parent_section.$section_order->[$i]";
713	}
714
715	# no more sections in this level
716	return undef;
717	}
718
719	# returns a reference to a list of children
720	sub get_children {
721	my $self = shift (@_);
722	my ($section) = @_;
723
724	my $section_ptr = $self->_lookup_section($section);
725	return [] unless defined $section_ptr;
726
727	my @children = @{$section_ptr->{'subsection_order'}};
728
729	map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
730	return \@children;
731	}
732
733	# returns the child section one past the last one (which
734	# is coded as "0")
735	sub get_end_child {
736	my $self = shift (@_);
737	my ($section) = @_;
738
739	return $section . ".0" unless $section eq "";
740	return "0";
741	}
742
743	# returns the next section in book order
744	sub get_next_section {
745	my $self = shift (@_);
746	my ($section) = @_;
747
748	return undef unless defined $section;
749
750	my $section_ptr = $self->_lookup_section($section);
751	return undef unless defined $section_ptr;
752
753	# first try to find first child
754	if (defined $section_ptr->{'subsection_order'}->[0]) {
755	return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
756	return "$section.$section_ptr->{'subsection_order'}->[0]";
757	}
758
759	do {
760	# try to find sibling
761	my $next_child = $self->get_next_child ($section);
762	return $next_child if (defined $next_child);
763
764	# move up one level
765	$section = $self->get_parent_section ($section);
766	} while $section =~ /\d/;
767
768	return undef;
769	}
770
771	sub is_leaf_section {
772	my $self = shift (@_);
773	my ($section) = @_;
774
775	my $section_ptr = $self->_lookup_section($section);
776	return 1 unless defined $section_ptr;
777
778	return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
779	}
780
781	# methods for dealing with sections
782
783	# returns the name of the inserted section
784	sub insert_section {
785	my $self = shift (@_);
786	my ($before_section) = @_;
787
788	# get the child to insert before and its parent section
789	my $parent_section = "";
790	my $before_child = "0";
791	my @before_section = split (/\./, $before_section);
792	if (scalar(@before_section) > 0) {
793	$before_child = pop (@before_section);
794	$parent_section = join (".", @before_section);
795	}
796
797	my $parent_section_ptr = $self->_lookup_section($parent_section);
798	if (!defined $parent_section_ptr) {
799	print STDERR "doc::insert_section couldn't find parent section " .
800	"$parent_section\n";
801	return;
802	}
803
804	# get the next section number
805	my $section_num = $parent_section_ptr->{'next_subsection'}++;
806
807	my $i = 0;
808	while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
809	$parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
810	$i++;
811	}
812
813	# insert the section number into the order list
814	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
815
816	# add this section to the parent section
817	my $section_ptr = {'subsection_order'=>[],
818	'next_subsection'=>1,
819	'subsections'=>{},
820	'metadata'=>[],
821	'text'=>""};
822	$parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
823
824	# work out the full section number
825	my $section = $parent_section;
826	$section .= "." unless $section eq "";
827	$section .= $section_num;
828
829	return $section;
830	}
831
832	# creates a pre-named section
833	sub create_named_section {
834	my $self = shift (@_);
835	my ($mastersection) = @_;
836
837	my ($num);
838	my $section = $mastersection;
839	my $sectionref = $self;
840
841	while ($section ne "") {
842	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
843	$num =~ s/^0+(\d)/$1/; # remove leading 0s
844	$section = "" unless defined $section;
845
846	if (defined $num) {
847	if (!defined $sectionref->{'subsections'}->{$num}) {
848	push (@{$sectionref->{'subsection_order'}}, $num);
849	$sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
850	'next_subsection'=>1,
851	'subsections'=>{},
852	'metadata'=>[],
853	'text'=>""};
854	if ($num >= $sectionref->{'next_subsection'}) {
855	$sectionref->{'next_subsection'} = $num + 1;
856	}
857	}
858	$sectionref = $sectionref->{'subsections'}->{$num};
859
860	} else {
861	print STDERR "doc::create_named_section couldn't create section ";
862	print STDERR "$mastersection\n";
863	last;
864	}
865	}
866	}
867
868	# returns a reference to a list of subsections
869	sub list_subsections {
870	my $self = shift (@_);
871	my ($section) = @_;
872
873	my $section_ptr = $self->_lookup_section ($section);
874	if (!defined $section_ptr) {
875	print STDERR "doc::list_subsections couldn't find section $section\n";
876	return [];
877	}
878
879	return [@{$section_ptr->{'subsection_order'}}];
880	}
881
882	sub delete_section {
883	my $self = shift (@_);
884	my ($section) = @_;
885
886	# my $section_ptr = {'subsection_order'=>[],
887	# 'next_subsection'=>1,
888	# 'subsections'=>{},
889	# 'metadata'=>[],
890	# 'text'=>""};
891
892	# if this is the top section reset everything
893	if ($section eq "") {
894	$self->{'subsection_order'} = [];
895	$self->{'subsections'} = {};
896	$self->{'metadata'} = [];
897	$self->{'text'} = "";
898	return;
899	}
900
901	# find the parent of the section to delete
902	my $parent_section = "";
903	my $child = "0";
904	my @section = split (/\./, $section);
905	if (scalar(@section) > 0) {
906	$child = pop (@section);
907	$parent_section = join (".", @section);
908	}
909
910	my $parent_section_ptr = $self->_lookup_section($parent_section);
911	if (!defined $parent_section_ptr) {
912	print STDERR "doc::delete_section couldn't find parent section " .
913	"$parent_section\n";
914	return;
915	}
916
917	# remove this section from the subsection_order list
918	my $i = 0;
919	while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
920	if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
921	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
922	last;
923	}
924	$i++;
925	}
926
927	# remove this section from the subsection hash
928	if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
929	undef $parent_section_ptr->{'subsections'}->{$child};
930	}
931	}
932
933	#--
934	# methods for dealing with metadata
935
936	# set_metadata_element and get_metadata_element are for metadata
937	# which should only have one value. add_meta_data and get_metadata
938	# are for metadata which can have more than one value.
939
940	# returns the first metadata value which matches field
941
942	# This version of get metadata element works much like the one above,
943	# except it allows for the namespace portion of a metadata element to
944	# be ignored, thus if you are searching for dc.Title, the first piece
945	# of matching metadata ending with the name Title (once any namespace
946	# is removed) would be returned.
947	# 28-11-2003 John Thompson
948	sub get_metadata_element {
949	my $self = shift (@_);
950	my ($section, $field, $ignore_namespace) = @_;
951	my ($data);
952
953	$ignore_namespace = 0 unless defined $ignore_namespace;
954
955	my $section_ptr = $self->_lookup_section($section);
956	if (!defined $section_ptr) {
957	print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
958	return;
959	}
960
961	# Remove any namespace if we are being told to ignore them
962	if($ignore_namespace) {
963	$field =~ s/^.\.//; #$field =~ s/^\w\.//;
964	}
965
966	foreach $data (@{$section_ptr->{'metadata'}}) {
967
968	my $data_name = $data->[0];
969
970	# Remove any namespace if we are being told to ignore them
971	if($ignore_namespace) {
972	$data_name =~ s/^.\.//; #$data_name =~ s/^\w\.//;
973	}
974	# we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
975	$data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
976
977	return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
978	}
979
980	return undef; # was not found
981	}
982
983	# returns a list of the form [value1, value2, ...]
984	sub get_metadata {
985	my $self = shift (@_);
986	my ($section, $field, $ignore_namespace) = @_;
987	my ($data);
988
989	$ignore_namespace = 0 unless defined $ignore_namespace;
990
991	my $section_ptr = $self->_lookup_section($section);
992	if (!defined $section_ptr) {
993	print STDERR "doc::get_metadata couldn't find section ",
994	$section, "\n";
995	return;
996	}
997
998	# Remove any namespace if we are being told to ignore them
999	if($ignore_namespace) {
1000	$field =~ s/^.*\.//;
1001	}
1002
1003	my @metadata = ();
1004	foreach $data (@{$section_ptr->{'metadata'}}) {
1005
1006	my $data_name = $data->[0];
1007
1008	# Remove any namespace if we are being told to ignore them
1009	if($ignore_namespace) {
1010	$data_name =~ s/^.*\.//;
1011	}
1012	# we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
1013	$data_name =~ s/^ex\.([^.]+)$/$1/;
1014
1015	push (@metadata, $data->[1]) if ($data_name eq $field);
1016	}
1017
1018	return \@metadata;
1019	}
1020
1021	sub get_metadata_hashmap {
1022	my $self = shift (@_);
1023	my ($section, $opt_namespace) = @_;
1024
1025	my $section_ptr = $self->_lookup_section($section);
1026	if (!defined $section_ptr) {
1027	print STDERR "doc::get_metadata couldn't find section ",
1028	$section, "\n";
1029	return;
1030	}
1031
1032	my $metadata_hashmap = {};
1033	foreach my $data (@{$section_ptr->{'metadata'}}) {
1034	my $metaname = $data->[0];
1035
1036	if ((!defined $opt_namespace) \|\| ($metaname =~ m/^$opt_namespace\./)) {
1037	if (!defined $metadata_hashmap->{$metaname}) {
1038	$metadata_hashmap->{$metaname} = [];
1039	}
1040	my $metaval_list = $metadata_hashmap->{$metaname};
1041	push(@$metaval_list, $data->[1]);
1042	}
1043	}
1044
1045	return $metadata_hashmap;
1046	}
1047
1048	# returns a list of the form [[field,value],[field,value],...]
1049	sub get_all_metadata {
1050	my $self = shift (@_);
1051	my ($section) = @_;
1052
1053	my $section_ptr = $self->_lookup_section($section);
1054	if (!defined $section_ptr) {
1055	print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
1056	return;
1057	}
1058
1059	return $section_ptr->{'metadata'};
1060	}
1061
1062	# $value is optional
1063	sub delete_metadata {
1064	my $self = shift (@_);
1065	my ($section, $field, $value) = @_;
1066
1067	my $section_ptr = $self->_lookup_section($section);
1068	if (!defined $section_ptr) {
1069	print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
1070	return;
1071	}
1072
1073	my $i = 0;
1074	while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1075	if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1076	(!defined $value \|\| $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1077	splice (@{$section_ptr->{'metadata'}}, $i, 1);
1078	} else {
1079	$i++;
1080	}
1081	}
1082	}
1083
1084	sub delete_all_metadata {
1085	my $self = shift (@_);
1086	my ($section) = @_;
1087
1088	my $section_ptr = $self->_lookup_section($section);
1089	if (!defined $section_ptr) {
1090	print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1091	return;
1092	}
1093
1094	$section_ptr->{'metadata'} = [];
1095	}
1096
1097	sub set_metadata_element {
1098	my $self = shift (@_);
1099	my ($section, $field, $value) = @_;
1100
1101	$self->set_utf8_metadata_element ($section, $field,
1102	&unicode::ascii2utf8(\$value));
1103	}
1104
1105	# set_utf8_metadata_element assumes the text has already been
1106	# converted to the UTF-8 encoding.
1107	sub set_utf8_metadata_element {
1108	my $self = shift (@_);
1109	my ($section, $field, $value) = @_;
1110
1111	$self->delete_metadata ($section, $field);
1112	$self->add_utf8_metadata ($section, $field, $value);
1113	}
1114
1115
1116	# add_metadata assumes the text is in (extended) ascii form. For
1117	# text which has already been converted to the UTF-8 format use
1118	# add_utf8_metadata.
1119	sub add_metadata {
1120	my $self = shift (@_);
1121	my ($section, $field, $value) = @_;
1122
1123	$self->add_utf8_metadata ($section, $field,
1124	&unicode::ascii2utf8(\$value));
1125	}
1126
1127	sub add_utf8_metadata {
1128	my $self = shift (@_);
1129	my ($section, $field, $value) = @_;
1130
1131	# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1132	# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1133	# print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1134
1135	my $section_ptr = $self->_lookup_section($section);
1136	if (!defined $section_ptr) {
1137	print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1138	return;
1139	}
1140	if (!defined $value) {
1141	print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1142	return;
1143	}
1144	if (!defined $field) {
1145	print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1146	return;
1147	}
1148
1149	#print STDERR "###$field=$value\n";
1150
1151	# For now, supress this check. Given that text data read in is now
1152	# Unicode aware, then the following block of code can (ironically enough)
1153	# cause our unicode compliant string to be re-encoded (leading to
1154	# a double-encoded UTF-8 string, which we definitely don't want!).
1155
1156
1157	# double check that the value is utf-8
1158	# if (!&unicode::check_is_utf8($value)) {
1159	# print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1160	# &unicode::ensure_utf8(\$value);
1161	# print STDERR " Tried converting to utf8: $value\n";
1162	# }
1163
1164	#If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1165	if ($field =~ m/^(.+\.)?Latitude$/ \|\| $field =~ m/^(.+\.)?Longitude$/)
1166	{
1167	my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1168	if (defined $mdprefix) {
1169	# Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1170	push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1171	}
1172
1173	my $direction;
1174	# TODO does the following work, with eq latitude? the above code implies can be xxxxx.Latitude. should use metaname??
1175	if($value =~ m/^-/)
1176	{
1177	$direction = ($field eq "Latitude") ? "S" : "W";
1178	}
1179	else
1180	{
1181	$direction = ($field eq "Latitude") ? "N" : "E";
1182	}
1183
1184	my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1185	if(defined $beforeDec && defined $afterDec)
1186	{
1187	my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1188	push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1189
1190	for(my $i = 2; $i <= 4; $i++)
1191	{
1192	if(length($afterDec) >= $i)
1193	{
1194	push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1195	}
1196	}
1197
1198	# what is this???? completely does nothing...
1199	#Only add the metadata if it has not already been added
1200	my $metaMap = $self->get_metadata_hashmap($section); # metaMap not used and called function has no apparent side-effects,
1201	# but this line appears important for ensuring uniqueness of (Latitude, value) meta
1202	# in the section. Also for LatShort, Longitude, LngShort.
1203	}
1204
1205	# if we have both Latitude and Longitude, then add coordinate metadata (not sure why, but someone else added it)
1206	my $latitude;
1207	my $longitude;
1208	if($field =~ m/^(.+\.)?Longitude$/) {
1209	# if we are dealing with Longitude meta, and we have already seen Latitude, then add Coordinate metadata
1210	$latitude = $self->get_metadata_element ($section, "Latitude");
1211	if (defined $latitude) {
1212	$longitude = $value;
1213	}
1214	} else {
1215	# we are dealing with Latitude - if we have already seen Longitude, then add both as Coordinate metadata
1216	$longitude = $self->get_metadata_element ($section, "Longitude");
1217	if (defined $longitude) {
1218	$latitude = $value;
1219	}
1220	}
1221	if (defined $latitude && defined $longitude) {
1222	##rint STDERR "adding coords for $section, $latitude, $longitude\n";
1223	$self->processCoordinate($section, $latitude, $longitude);
1224	} else {
1225	##rint STDERR "lat or long not defined, not adding coords for $section\n";
1226	}
1227
1228	}
1229
1230	elsif($field eq "GPS.mapOverlay") { # then the $value is a JSON string
1231
1232	# In order to allow searching map-data-enriched documents by map shape descriptions,
1233	# and to run rawquery searches for other docs by proximity based on their map data,
1234	# need to store the shape descriptions and Coordinate info for shapes into the text index.
1235	# We add the description for each shape in the mapoverlay into the text index as GPSMapOverlayLabel
1236	# And we add Coordinate (CD) and CoordShort (CS) info for each shape in the mapoverlay in the format (Lat, Lng) as ("37S339 175E342")
1237	# where the digits before the N/S/E/W direction represents the whole number. And the digits after the direction are the
1238	# decimal places which can vary between 0 and 2 to 4 digits.
1239
1240	# However, we only want to process GPS.mapOverlay during buildcol and only in the text indexing passes (e.g. sidx and didx for lucene)
1241	# and certainly never during the infodb pass of buildcol. The latter can end up duplicating Coordinate/CoordShort/GPSMapOverlayLabel
1242	# when rebuilding with the online doc editor, as that runs incremental-rebuild which then calls basebuilder::reconstruct_doc_objs_metadata()
1243	# on all docs NOT being incrementally rebuilt. That call would get meta from the infodb and use it to reconstruct the doc objects of docs
1244	# NOT being incrementally built. If the Coord and Label meta were written to the infodb, they would then be loaded back in memory when the
1245	# collection is incrementally rebuilt for those docs that don't need incremental processing. Then this function would once again add the
1246	# same meta into the infodb, thus duplicating what goes into the infodb. Hence, don't do all the following if doc::processor_mode =~ "infodb",
1247	# or anything other than a text mode.
1248
1249	# Note that for incremental rebuilding, the text pass can be called textreindex for instance (and infodb pass can be incinfodb).
1250	# So don't check for exact string match
1251
1252	if($doc::cmd_line_mode eq "buildcol" && $doc::processor_mode =~ m/^text/) { # currently known text processor_modes:
1253	# text, textreindex, possibly textdelete (see ArchivesInfPlugin.pm::read() for last 2).
1254	# OR: ..."buildcol" && $doc::processor_mode !~ m/infodb$/) # if dummy pass important
1255
1256	###print STDERR "GPS.mapOverlay before decoding, val = " . $value . "\n";
1257
1258	# TODO: html decode?
1259	$value =~ s@[@[@g;
1260	$value =~ s@]@]@g;
1261	$value =~ s@"@"@g;
1262	###print STDERR "GPS.mapOverlay after decoding, val = " . $value . "\n";
1263
1264	my $json_array = decode_json $value;
1265
1266	foreach my $shape (@$json_array) {
1267
1268	# Put each available shape description/label into this section's metadata with GPSMapOverlayLabel as metaname.
1269	# Just as for Coordinate meta, don't need to know which shape a label belongs too. This is just so each label
1270	# will be indexed, and therefore can be searched.
1271
1272	my $description = $shape->{"description"};
1273	if($description) {
1274	push (@{$section_ptr->{'metadata'}}, ["GPSMapOverlayLabel", $description]);
1275	###print STDERR "@@@@############################################ Just added description meta: " . $description . "\n";
1276	}
1277
1278	my $type = $shape->{"type"};
1279	###print STDERR "Shape type : " . $type . "\n";
1280
1281	if($type eq "circle") {
1282	###print STDERR "Found a circle:\n" . &printShape($json, $shape);
1283
1284	# work out bounding box
1285	# want the inverse of this useful page:
1286	# https://stackoverflow.com/questions/639695/how-to-convert-latitude-or-longitude-to-meters
1287	# https://www.geodatasource.com/developers/javascript
1288
1289	# Dr Bainbridge wants us to use: https://gis.stackexchange.com/questions/5821/calculating-latitude-longitude-x-miles-from-point
1290	# Since for N,E,S,W the following works out the same as above link, we're using the rule of thumb in the following:
1291	# https://gis.stackexchange.com/questions/2951/algorithm-for-offsetting-a-latitude-longitude-by-some-amount-of-meters
1292	# which states
1293	# "If your displacements aren't too great (less than a few kilometers) and you're not right at the poles,
1294	# use the quick and dirty estimate that 111,111 meters (111.111 km) in the y direction is 1 degree
1295	# (of latitude) and 111,111 * cos(latitude) meters in the x direction is 1 degree (of longitude)."
1296	my $centre_lat = $shape->{"center"}->{"lat"};
1297	my $centre_lng = $shape->{"center"}->{"lng"};
1298	my $radius = $shape->{"radius"}; # in metres!
1299
1300	###print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
1301
1302	my $lat_north = $centre_lat + ($radius/111111);
1303	my $lat_south = $centre_lat - ($radius/111111);
1304
1305	###print STDERR "### lat_north: $lat_north\n";
1306	###print STDERR "### lat_south: $lat_south\n";
1307
1308	# our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
1309	# all expect their angle to be in radians. So need to convert from degree to radians before we can take the cos of it.
1310	my $centre_lat_radians = $self->degreesToRadians($centre_lat);
1311	my $cos_in_radians = cos($centre_lat_radians);
1312	###print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
1313	my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
1314	my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
1315	###print STDERR "### lng_east $lng_east\n";
1316	###print STDERR "### lng_west $lng_west\n";
1317
1318	my $cos_lat = cos($centre_lat);
1319	###print STDERR "cos $centre_lat is $cos_lat\n";
1320
1321	$self->processCoordinate($section, $lat_north, $lng_east);
1322	$self->processCoordinate($section, $lat_south, $lng_east);
1323	$self->processCoordinate($section, $lat_south, $lng_west);
1324	$self->processCoordinate($section, $lat_north, $lng_west);
1325
1326	}
1327	elsif ($type eq "marker") {
1328	###print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
1329	###print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
1330	$self->processCoordinate($section, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});
1331	}
1332	elsif ($type eq "polyline" \|\| $type eq "polygon") {
1333	my $path_array = $shape->{"path"};
1334	foreach my $position (@$path_array) {
1335	$self->processCoordinate($section, $position->{"lat"}, $position->{"lng"});
1336	}
1337	}
1338	elsif ($type eq "rectangle") {
1339
1340	my $bounds = $shape->{"bounds"};
1341	$self->processCoordinate($section, $bounds->{"north"}, $bounds->{"east"});
1342	$self->processCoordinate($section, $bounds->{"south"}, $bounds->{"east"});
1343	$self->processCoordinate($section, $bounds->{"south"}, $bounds->{"west"});
1344	$self->processCoordinate($section, $bounds->{"north"}, $bounds->{"west"});
1345	}
1346
1347	} # end for on each shape in GPS.mapOverlay
1348	} # end if(buildcol and text pass)
1349	} # end GPS.mapOverlay meta
1350
1351	push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1352	}
1353
1354	# https://en.wikipedia.org/wiki/Radian
1355	sub degreesToRadians
1356	{
1357	my $self = shift (@_);
1358	my ($degrees) = @_;
1359
1360	return $degrees * pi /180; # returns radians
1361	}
1362
1363	sub radiansToDegrees
1364	{
1365	my $self = shift (@_);
1366	my ($radians) = @_;
1367
1368	return $radians * 180 / pi; # returns degrees
1369	}
1370
1371	# Call as:
1372	# my $json = JSON->new->allow_nonref;
1373	# &printAllShapes($json, $json_array);
1374	sub printAllShapes {
1375	my ($json, $json_array) = @_;
1376
1377
1378	#my $pretty_print_shape = $json->pretty->encode( $json_array->[0] );
1379	foreach my $shape (@$json_array) {
1380	my $pretty_print_shape = $json->pretty->encode( $shape );
1381	print STDERR "Shape: $pretty_print_shape\n";
1382	}
1383
1384	}
1385
1386	# For the (lat, lng) coordinate given,
1387	# attaches Coordinate and multiple CoordShort (different precision level) metadata to the doc object
1388	sub processCoordinate {
1389	my $self = shift (@_);
1390	my ($section, $latitude, $longitude) = @_;
1391
1392	my $section_ptr = $self->_lookup_section($section);
1393
1394	my $lat_direction = ($latitude =~ m/^-/) ? "S" : "N";
1395	my $lng_direction = ($longitude =~ m/^-/) ? "W" : "E";
1396
1397	# have to store (lat, lng) in pairs, when there are so many coords to store
1398	#push (@{$section_ptr->{'metadata'}}, ["Latitude", $latitude]);
1399	#push (@{$section_ptr->{'metadata'}}, ["Longitude", $longitude]);
1400
1401	push (@{$section_ptr->{'metadata'}}, ["Coordinate", "$latitude $longitude"]); # "$latitude$lat_direction $longitude$lng_direction"
1402
1403	my ($latBeforeDec, $latAfterDec);
1404	my ($lngBeforeDec, $lngAfterDec);
1405
1406	if($latitude !~ m/\./) {
1407	$latBeforeDec = $latitude;
1408	$latAfterDec = "";
1409	} else {
1410	($latBeforeDec, $latAfterDec) = ($latitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1411	}
1412	if($longitude !~ m/\./) {
1413	$lngBeforeDec = $longitude;
1414	$lngAfterDec = "";
1415	} else {
1416	($lngBeforeDec, $lngAfterDec) = ($longitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1417	}
1418
1419
1420	my $name = "CoordShort";
1421	push (@{$section_ptr->{'metadata'}}, [$name, "$latBeforeDec$lat_direction $lngBeforeDec$lng_direction"]);
1422
1423	for(my $i = 2; $i <= 4; $i++)
1424	{
1425	my $latDecPlaces = (length($latAfterDec) >= $i) ? substr($latAfterDec, 0, $i) : "";
1426	my $lngDecPlaces = (length($lngAfterDec) >= $i) ? substr($lngAfterDec, 0, $i) : "";
1427
1428	push (@{$section_ptr->{'metadata'}}, [$name,
1429	$latBeforeDec . $lat_direction. $latDecPlaces . " " . $lngBeforeDec . $lng_direction. $lngDecPlaces]);
1430
1431	}
1432
1433	#Only add the metadata if it has not already been added
1434	#my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no apparent side-effects.)
1435	}
1436
1437
1438	# methods for dealing with text
1439
1440	# returns the text for a section
1441	sub get_text {
1442	my $self = shift (@_);
1443	my ($section) = @_;
1444
1445	my $section_ptr = $self->_lookup_section($section);
1446	if (!defined $section_ptr) {
1447	print STDERR "doc::get_text couldn't find section " .
1448	"$section\n";
1449	return "";
1450	}
1451
1452	return $section_ptr->{'text'};
1453	}
1454
1455	# returns the (utf-8 encoded) length of the text for a section
1456	sub get_text_length {
1457	my $self = shift (@_);
1458	my ($section) = @_;
1459
1460	my $section_ptr = $self->_lookup_section($section);
1461	if (!defined $section_ptr) {
1462	print STDERR "doc::get_text_length couldn't find section " .
1463	"$section\n";
1464	return 0;
1465	}
1466
1467	return length ($section_ptr->{'text'});
1468	}
1469
1470	# returns the total length for all the sections
1471	sub get_total_text_length {
1472	my $self = shift (@_);
1473
1474	my $section = $self->get_top_section();
1475	my $length = 0;
1476	while (defined $section) {
1477	$length += $self->get_text_length($section);
1478	$section = $self->get_next_section($section);
1479	}
1480	return $length;
1481	}
1482
1483	sub delete_text {
1484	my $self = shift (@_);
1485	my ($section) = @_;
1486
1487	my $section_ptr = $self->_lookup_section($section);
1488	if (!defined $section_ptr) {
1489	print STDERR "doc::delete_text couldn't find section " .
1490	"$section\n";
1491	return;
1492	}
1493
1494	$section_ptr->{'text'} = "";
1495	}
1496
1497	# add_text assumes the text is in (extended) ascii form. For
1498	# text which has been already converted to the UTF-8 format
1499	# use add_utf8_text.
1500	sub add_text {
1501	my $self = shift (@_);
1502	my ($section, $text) = @_;
1503
1504	# convert the text to UTF-8 encoded unicode characters
1505	# and add the text
1506	$self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1507	}
1508
1509
1510	# add_utf8_text assumes the text to be added has already
1511	# been converted to the UTF-8 encoding. For ascii text use
1512	# add_text
1513	# Pass by value version (internally calls pass by ref version
1514	# to avoid code duplication)
1515	sub add_utf8_text {
1516	my $self = shift (@_);
1517	my ($section, $text) = @_;
1518
1519	$self->add_utf8_textref($section, \$text);
1520	}
1521
1522	# Pass by reference version, used by GreenstoneSQLPlugin for fulltext
1523	sub add_utf8_textref {
1524	my $self = shift (@_);
1525	my ($section, $text_ref) = @_;
1526
1527	my $section_ptr = $self->_lookup_section($section);
1528	if (!defined $section_ptr) {
1529	print STDERR "doc::add_utf8_textref couldn't find section " .
1530	"$section\n";
1531	return;
1532	}
1533
1534	$section_ptr->{'text'} .= $$text_ref;
1535	}
1536
1537	# returns the Source meta, which is the utf8 filename generated.
1538	# Added a separate method here for convenience
1539	sub get_source {
1540	my $self = shift (@_);
1541	return $self->get_metadata_element ($self->get_top_section(), "Source");
1542	}
1543
1544	# returns the SourceFile meta, which is the url reference to the URL-encoded
1545	# version of Source (the utf8 filename). Added a separate method here for convenience
1546	sub get_sourcefile {
1547	my $self = shift (@_);
1548	return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1549	}
1550
1551	# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1552	# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1553	sub get_assocfile_from_sourcefile {
1554	my $self = shift (@_);
1555
1556	# get the SourceFile meta, which is a URL to a file on the filesystem
1557	my $top_section = $self->get_top_section();
1558	my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1559
1560	# get the actual filename as it exists on the filesystem which this url refers to
1561	$source_file = &unicode::url_to_filename($source_file);
1562	my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1563	return $assocfilename;
1564	}
1565
1566	# methods for dealing with associated files
1567
1568	# a file is associated with a document, NOT a section.
1569	# if section is defined it is noted in the data structure
1570	# only so that files associated from a particular section
1571	# may be removed later (using delete_section_assoc_files)
1572	sub associate_file {
1573	my $self = shift (@_);
1574	my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1575	$mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1576
1577	# remove all associated files with the same name
1578	$self->delete_assoc_file ($assoc_filename);
1579
1580	# Too harsh a requirement
1581	# Definitely get HTML docs, for example, with some missing
1582	# support files
1583	# if (!&util::fd_exists($real_filename)) {
1584	# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1585	# exit -1;
1586	# }
1587
1588	# print STDERR "**** is the following a UTF8 rep of real filename?\n $real_filename\n";
1589	# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1590	## my $utf8_filename = Encode::encode("utf8",$filename);
1591
1592	push (@{$self->{'associated_files'}},
1593	[$real_filename, $assoc_filename, $mime_type, $section]);
1594	}
1595
1596	# returns a list of associated files in the form
1597	# [[real_filename, assoc_filename, mimetype], ...]
1598	sub get_assoc_files {
1599	my $self = shift (@_);
1600
1601	return $self->{'associated_files'};
1602	}
1603
1604	# the following two methods used to keep track of original associated files
1605	# for incremental building. eg a txt file used by an item file does not end
1606	# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1607	# file for incremental build
1608	sub associate_source_file {
1609	my $self = shift (@_);
1610	my ($full_filename) = @_;
1611
1612	push (@{$self->{'source_assoc_files'}}, $full_filename);
1613
1614	}
1615
1616	sub get_source_assoc_files {
1617	my $self = shift (@_);
1618
1619	return $self->{'source_assoc_files'};
1620
1621
1622	}
1623	sub metadata_file {
1624	my $self = shift (@_);
1625	my ($real_filename, $filename) = @_;
1626
1627	push (@{$self->{'metadata_files'}},
1628	[$real_filename, $filename]);
1629	}
1630
1631	# used for writing out the archiveinf-doc info database, to list all the metadata files
1632	sub get_meta_files {
1633	my $self = shift (@_);
1634
1635	return $self->{'metadata_files'};
1636	}
1637
1638	sub delete_section_assoc_files {
1639	my $self = shift (@_);
1640	my ($section) = @_;
1641
1642	my $i=0;
1643	while ($i < scalar (@{$self->{'associated_files'}})) {
1644	if (defined $self->{'associated_files'}->[$i]->[3] &&
1645	$self->{'associated_files'}->[$i]->[3] eq $section) {
1646	splice (@{$self->{'associated_files'}}, $i, 1);
1647	} else {
1648	$i++;
1649	}
1650	}
1651	}
1652
1653	sub delete_assoc_file {
1654	my $self = shift (@_);
1655	my ($assoc_filename) = @_;
1656
1657	my $i=0;
1658	while ($i < scalar (@{$self->{'associated_files'}})) {
1659	if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1660	splice (@{$self->{'associated_files'}}, $i, 1);
1661	} else {
1662	$i++;
1663	}
1664	}
1665	}
1666
1667	sub reset_nextsection_ptr {
1668	my $self = shift (@_);
1669	my ($section) = @_;
1670
1671	my $section_ptr = $self->_lookup_section($section);
1672	$section_ptr->{'next_subsection'} = 1;
1673	}
1674
1675	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: