Context Navigation

source: main/trunk/greenstone2/perllib/doc.pm@ 34394

Last change on this file since 34394 was 34394, checked in by ak19, 4 years ago
Bugfix 1 for GLI metadata slowdown: selecting multiple Gathererd files in GLI became very slow. Kathy and Dr Bainbridge had tracked this down to code I had added to support non basic ASCII filenames in GLI, which was making an expensive win operating system function call on Windows for each selected file, launching a Java Process for each. The speed of selecting multiple files is now back to being almost as fast as in 3.09. Tested on Windows and linux. Had to treat windows as a special case because I can't get the code modifications to work on Linux: the perl code stores a hex-encoded string for the filename that GLI now uses when OS is Windows and compares against the hex encoded name of a file selected. But on linux the hex encoded value generated by perl is not the same as that which java generates and after trying repeatedly, I'e not been able to succeed to get it to work. So the code behaves as before for Linux.
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 52.8 KB

Rev	Line
[537]	1	###########################################################################
	2	#
	3	# doc.pm --
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
[8894]	10	# This program is free software; you can redistr te it and/or modify
[537]	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
[1241]	26	# base class to hold documents
[4]	27
	28	package doc;
[3834]	29	eval {require bytes};
[4]	30
[832]	31	BEGIN {
[1241]	32	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
	33	unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
[832]	34	}
	35
[15894]	36	use strict;
[1241]	37	use unicode;
	38	use util;
[27306]	39	use FileUtils;
[1241]	40	use ghtml;
[8220]	41	use File::stat;
[1241]	42	##use hashdoc;
[13172]	43	use docprint;
[33125]	44	use JSON;
[1241]	45
[33125]	46	# We just need pi from the Trig lib
[33293]	47	# Import constants pi2, pip2, pip4 (2*pi, pi/2, pi/4).
[33416]	48	# But doing
	49	# use Math::Trig ':pi';
	50	# resulted in errors on 32 bit linux VM with perl v8 of "bareword pi not allowed while strict subs in use"
	51	# So, now doing just
	52	use Math::Trig;
[33125]	53
[4]	54	# the document type may be indexed_doc, nonindexed_doc, or
	55	# classification
	56
[18528]	57	our $OIDcount = 0;
[2327]	58
[33171]	59
	60	# Declare the Package (level) Variable that will tell us whether we're in the import or buildcol phase.
	61	# This var will be set by import.pl, buildcol.pl and export.pl to the then appropriate value.
	62	# https://perlmaven.com/package-variables-and-lexical-variables-in-perl
	63	# But: https://stackoverflow.com/questions/7542120/perl-how-to-make-variables-from-requiring-script-available-in-required-script
	64	# https://stackoverflow.com/questions/33230015/how-do-i-use-a-variable-in-another-perl-script
	65	#$doc::cmd_line_mode = undef;
	66	# Decided to follow the usage of $OIDcount above to declare the necessary package-level variables 'globally' accessible
	67	# actually accessible only for those packages that import this one (with 'use doc')
[33302]	68	my $cmd_line_mode = undef;
[33171]	69
[33302]	70	# processor_mode keeps track of which buildcol pass we're at: dummy, text (sidx/didx passes) or infodb
	71	my $processor_mode = undef;
	72
[20412]	73	# rename_method can be 'url', 'none', 'base64'
[4]	74	sub new {
	75	my $class = shift (@_);
[18319]	76	my ($source_filename, $doc_type, $rename_method) = @_;
[1374]	77
[13770]	78
[1241]	79	my $self = bless {'associated_files'=>[],
	80	'subsection_order'=>[],
	81	'next_subsection'=>1,
	82	'subsections'=>{},
	83	'metadata'=>[],
[2327]	84	'text'=>"",
	85	'OIDtype'=>"hash"}, $class;
[4]	86
[10217]	87	# used to set lastmodified here, but this can screw up the HASH ids, so
	88	# the docsave processor now calls set_lastmodified
[13770]	89
[23362]	90	$self->set_source_path($source_filename);
[10217]	91
[7929]	92	if (defined $source_filename) {
[15874]	93	$source_filename = &util::filename_within_collection($source_filename);
[18508]	94	print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
[18319]	95	$self->set_source_filename ($source_filename, $rename_method);
[7929]	96	}
	97
[1374]	98	$self->set_doc_type ($doc_type) if defined $doc_type;
[485]	99
[4]	100	return $self;
	101	}
[23362]	102
	103
	104	sub set_source_path
	105	{
	106	my $self = shift @_;
	107	my ($source_filename) = @_;
	108
	109	if (defined $source_filename) {
	110	# On Windows the source_filename can be in terse DOS format
	111	# e.g. test~1.txt
	112
	113	$self->{'terse_source_path'} = $source_filename;
	114
[27306]	115	# Use the FileUtil library methods as they are aware of more special
	116	# cases such as HDFS [jmt12]
[27393]	117	if (&FileUtils::fileExists($source_filename))
	118	{
[23362]	119	# See if we can do better for Windows with a filename
	120	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
	121	require Win32;
	122	$self->{'source_path'} = Win32::GetLongPathName($source_filename);
	123	}
	124	else {
	125	# For Unix-based systems, there is no difference between the two
	126	$self->{'source_path'} = $source_filename;
[34394]	127	}
	128
	129	# On Windows, the code above has ensured source_path is the Win long (full) path name.
	130	# To help GLI associate metadata with an easily calculated and accurate representation of
	131	# filenames, we now store the Win long path name, hex encoded.
	132	# We're not using this field on Linux, as I can't get the hex encodings generated to match
	133	# what GLI Java code generates. But for symmetry we store this field on Unix too, but we need
	134	# to hex-encode source_path on Unix too, or it may not be UTF-8 and doc.xml will be invalid
	135	my $hexencodedlongsourcepath = &unicode::debug_unicode_string($self->{'source_path'});
	136	$self->set_utf8_metadata_element ($self->get_top_section(), "gsdlfullsourcepath", $hexencodedlongsourcepath);
	137
[23362]	138	}
	139	else {
	140	print STDERR "Warning: In doc::set_source_path(), file\n";
	141	print STDERR " $source_filename\n";
	142	print STDERR " does not exist\n";
	143
	144	# (default) Set it to whatever we were given
	145	$self->{'source_path'} = $source_filename;
	146	}
	147	}
	148	else {
	149	# Previous code for setting source_path allowed for
	150	# it to be undefined, so continue this practice
	151	$self->{'terse_source_path'} = undef;
	152	$self->{'source_path'} = undef;
	153	}
	154	}
	155
	156
	157	sub get_source_path
	158	{
	159	my $self = shift @_;
	160
	161	return $self->{'terse_source_path'};
	162	}
	163
[10217]	164	# set lastmodified for OAI purposes, added by GRB, moved by kjdon
[23939]	165	sub set_oailastmodified {
	166	my $self = shift (@_);
	167
	168	my $source_path = $self->{'terse_source_path'};
	169
	170	if (defined $source_path && (-e $source_path)) {
	171	my $current_time = time;
	172
	173	my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
	174	$wday, $yday, $isdst) = localtime($current_time);
	175
	176	my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
	177
	178	$self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
	179	$self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
	180	}
	181	}
	182
	183	# no longer used for OAI purposes, since lastmodified is not what we want as the
	184	# Datestamp of a document. This doc metadata may be useful for general purposes.
[10217]	185	sub set_lastmodified {
	186	my $self = shift (@_);
[4]	187
[23362]	188	my $source_path = $self->{'terse_source_path'};
[13770]	189
	190	if (defined $source_path && (-e $source_path)) {
	191
	192	my $file_stat = stat($source_path);
[10217]	193	my $mtime = $file_stat->mtime;
[21862]	194	my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
	195	$wday, $yday, $isdst) = localtime($mtime);
	196
	197	my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
	198
[23923]	199	$self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
[21862]	200	$self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
[10217]	201	}
	202	}
	203
[1241]	204	# clone the $self object
	205	sub duplicate {
	206	my $self = shift (@_);
	207
	208	my $newobj = {};
	209
[8716]	210	foreach my $k (keys %$self) {
[1241]	211	$newobj->{$k} = &clone ($self->{$k});
	212	}
	213
	214	bless $newobj, ref($self);
	215	return $newobj;
	216	}
	217
	218	sub clone {
	219	my ($from) = @_;
	220	my $type = ref ($from);
	221
	222	if ($type eq "HASH") {
	223	my $to = {};
[8716]	224	foreach my $key (keys %$from) {
[1241]	225	$to->{$key} = &clone ($from->{$key});
	226	}
	227	return $to;
	228	} elsif ($type eq "ARRAY") {
	229	my $to = [];
[8716]	230	foreach my $v (@$from) {
[1241]	231	push (@$to, &clone ($v));
	232	}
	233	return $to;
	234	} else {
	235	return $from;
	236	}
	237	}
	238
[2327]	239	sub set_OIDtype {
	240	my $self = shift (@_);
[12268]	241	my ($type, $metadata) = @_;
[1241]	242
[26536]	243	if (defined $type && $type =~ /^(hash\|hash_on_file\|hash_on_ga_xml\|hash_on_full_filename\|incremental\|filename\|dirname\|full_filename\|assigned)$/) {
[2327]	244	$self->{'OIDtype'} = $type;
	245	} else {
	246	$self->{'OIDtype'} = "hash";
	247	}
[16792]	248
[12268]	249	if ($type =~ /^assigned$/) {
	250	if (defined $metadata) {
	251	$self->{'OIDmetadata'} = $metadata;
	252	} else {
	253	$self->{'OIDmetadata'} = "dc.Identifier";
	254	}
	255	}
[2327]	256	}
	257
[20412]	258	# rename_method can be 'url', 'none', 'base64'
[1241]	259	sub set_source_filename {
	260	my $self = shift (@_);
[18319]	261	my ($source_filename, $rename_method) = @_;
[1241]	262
[16578]	263	# Since the gsdlsourcefilename element goes into the doc.xml it has
	264	# to be utf8. However, it should also represent the source filename
	265	# (in the import directory) which may not be utf8 at all.
[18319]	266	# For instance, if this meta element (gsdlsourcefilename) will be used
	267	# by other applications that parse doc.xml in order to locate
	268	# gsdlsourcefilename. Therefore, the solution is to URLencode or base64
	269	# encode the real filename as this is a binary-to-text encoding meaning
	270	# that the resulting string is ASCII (utf8). Decoding will give the original.
[16578]	271
[18319]	272	# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
[33756]	273	## print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename\n";
[18319]	274
[16670]	275	# URLencode just the gsdl_source_filename, not the directory. Then prepend dir
[23278]	276	$source_filename = $self->encode_filename($source_filename, $rename_method);
	277	# my ($srcfilename,$dirname,$suffix)
	278	# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
[16792]	279	# print STDERR "-> $srcfilename -> ";
[23278]	280	# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
[27306]	281	# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
[16792]	282	# print STDERR "$source_filename\n";
[33756]	283
	284	## print STDERR "AFTER URL/base64 encoding gsdlsourcefilename: $source_filename\n";
	285
[16578]	286	$self->set_utf8_metadata_element ($self->get_top_section(),
[1241]	287	"gsdlsourcefilename",
	288	$source_filename);
[33756]	289
	290	# If we set the file renaming method, at least GLI can decode gsdlsourcefilename to get the
	291	# original filename (relative path to file) back for GLI to notice what ex Meta is associated with it
	292	$self->set_utf8_metadata_element($self->get_top_section(),
	293	"gsdlsourcefilerenamemethod",
	294	$rename_method);
[1241]	295	}
	296
[23278]	297	sub encode_filename {
	298	my $self = shift (@_);
	299	my ($source_filename, $rename_method) = @_;
	300
	301	my ($srcfilename,$dirname,$suffix)
	302	= &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
	303	# print STDERR "-> $srcfilename -> ";
	304	$srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
[33756]	305
	306	# encode any subdirs of "import" also, but not import itself
	307	my $sep = ($ENV{'GSDLOS'} =~ /^windows$/i) ? "\\" : "/";
	308	my @dirs = ($sep eq "\\") ? split(/[\\\/]+/, $dirname) : split(/\//, $dirname);
	309
[33757]	310	my $dirpath = "";
	311
	312	# Don't encode the first folder ("import" or "tmp"): GLI's DocXMLFile.java looks for literal "import"
[34105]	313	# or "tmp" before it knows what the file rename method is that is to be used decode the rest of gsdlsourcefilename.
	314	if(scalar (@dirs) >= 1) {
	315	$dirpath = shift(@dirs);
[33757]	316	}
[33756]	317	foreach my $subdir (@dirs) {
[34105]	318	#print STDERR "@@@@ Found subdir: $subdir\n";
[33756]	319
[33757]	320	$subdir = &util::rename_file($subdir, $rename_method);
	321	#print STDERR "@@@@ encoded subdir: $subdir\n";
	322
[33756]	323	$dirpath = &FileUtils::filenameConcatenate($dirpath, $subdir);
	324	}
	325	if($dirpath ne "") {
	326	$dirname = $dirpath;
	327	}
	328
[27306]	329	$source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
[23278]	330
	331	return $source_filename;
	332	}
	333
[7569]	334	sub set_converted_filename {
	335	my $self = shift (@_);
	336	my ($converted_filename) = @_;
	337
[16578]	338	# we know the converted filename is utf8
	339	$self->set_utf8_metadata_element ($self->get_top_section(),
[7569]	340	"gsdlconvertedfilename",
	341	$converted_filename);
	342	}
	343
[1241]	344	# returns the source_filename as it was provided
[19829]	345	sub get_unmodified_source_filename {
	346	my $self = shift (@_);
	347
[23362]	348	return $self->{'terse_source_path'};
[19829]	349	}
	350
	351	# returns the source_filename with whatever rename_method was given
[1241]	352	sub get_source_filename {
	353	my $self = shift (@_);
	354
	355	return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
	356	}
	357
[19829]	358
	359
[7569]	360	# returns converted filename if available else returns source filename
	361	sub get_filename_for_hashing {
	362	my $self = shift (@_);
	363
	364	my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
	365
	366	if (!defined $filename) {
[11097]	367	my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
[10980]	368	# if NULPlug processed file, then don't give a filename
[11097]	369	if (defined $plugin_name && $plugin_name eq "NULPlug") {
[10980]	370	$filename = undef;
[16670]	371	} else { # returns the URL encoded source filename!
[10980]	372	$filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
	373	}
[7569]	374	}
[23485]	375
[27306]	376	if (!&FileUtils::isFilenameAbsolute($filename)) {
	377	$filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
[23485]	378	}
	379
[7569]	380	return $filename;
	381	}
	382
[1241]	383	sub set_doc_type {
	384	my $self = shift (@_);
	385	my ($doc_type) = @_;
	386
	387	$self->set_metadata_element ($self->get_top_section(),
	388	"gsdldoctype",
	389	$doc_type);
	390	}
	391
[10217]	392	# returns the gsdldoctype as it was provided
[1241]	393	# the default of "indexed_doc" is used if no document
	394	# type was provided
	395	sub get_doc_type {
	396	my $self = shift (@_);
	397
	398	my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
	399	return $doc_type if (defined $doc_type);
	400	return "indexed_doc";
	401	}
	402
	403
	404	# look up the reference to the a particular section
	405	sub _lookup_section {
	406	my $self = shift (@_);
	407	my ($section) = @_;
	408
	409	my ($num);
	410	my $sectionref = $self;
	411
	412	while (defined $section && $section ne "") {
[12327]	413
[1241]	414	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
[12327]	415
	416	$num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
	417
[1241]	418	$section = "" unless defined $section;
	419
[12327]	420
[1241]	421	if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
	422	$sectionref = $sectionref->{'subsections'}->{$num};
	423	} else {
	424	return undef;
	425	}
	426	}
	427
	428	return $sectionref;
	429	}
	430
[2327]	431	# calculate OID by hashing the contents of the document
[1241]	432	sub _calc_OID {
	433	my $self = shift (@_);
	434	my ($filename) = @_;
	435
[22855]	436
[1241]	437	my $osexe = &util::get_os_exe();
	438
[27306]	439	my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
[1241]	440	$ENV{'GSDLOS'},"hashfile$osexe");
[8504]	441
[28560]	442	&util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
	443	# The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
	444
[28575]	445	# A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
	446	# (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
	447	# E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
	448
[1241]	449	my $result = "NULL";
[22855]	450
[1679]	451
[1241]	452	if (-e "$hashfile_exe") {
[1679]	453	# $result = `\"$hashfile_exe\" \"$filename\"`;
[21862]	454	# $result = `hashfile$osexe \"$filename\" 2>&1`;
[1679]	455	$result = `hashfile$osexe \"$filename\"`;
[21862]	456
[1241]	457	($result) = $result =~ /:\s*([0-9a-f]+)/i;
	458	} else {
	459	print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
	460	}
	461	return "HASH$result";
	462	}
	463
	464	# methods dealing with OID, not groups of them.
	465
[2327]	466	# if $OID is not provided one is calculated
[1241]	467	sub set_OID {
	468	my $self = shift (@_);
	469	my ($OID) = @_;
[8504]	470
[8797]	471	my $use_hash_oid = 0;
[17057]	472	# if an OID wasn't provided calculate one
[1241]	473	if (!defined $OID) {
	474	$OID = "NULL";
[17025]	475	if ($self->{'OIDtype'} =~ /^hash/) {
[8797]	476	$use_hash_oid = 1;
	477	} elsif ($self->{'OIDtype'} eq "incremental") {
[2327]	478	$OID = "D" . $OIDcount;
	479	$OIDcount ++;
[26221]	480	} elsif ($self->{'OIDtype'} eq "filename") {
	481	my $filename = $self->get_source_filename();
	482	$OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
	483	$OID = &util::tidy_up_oid($OID);
[26536]	484	} elsif ($self->{'OIDtype'} eq "full_filename") {
	485	my $source_filename = $self->get_source_filename();
	486	my $dirsep = &util::get_os_dirsep();
	487
	488	$source_filename =~ s/^import$dirsep//;
	489	$source_filename =~ s/$dirsep/-/g;
	490	$source_filename =~ s/\./_/g;
	491
	492	$OID = $source_filename;
	493	$OID = &util::tidy_up_oid($OID);
[8716]	494	} elsif ($self->{'OIDtype'} eq "dirname") {
	495	my $filename = $self->get_source_filename();
[8797]	496	if (defined($filename)) { # && -e $filename) {
[27350]	497	# get the immediate parent directory
[8716]	498	$OID = &File::Basename::dirname($filename);
	499	if (defined $OID) {
[27350]	500	$OID = &File::Basename::basename($OID);
[19617]	501	$OID = &util::tidy_up_oid($OID);
[8716]	502	} else {
[8797]	503	print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
	504	$use_hash_oid = 1;
[8716]	505	}
	506	} else {
[27350]	507	print STDERR "Failed to find a filename, generating hash id\n";
[8797]	508	$use_hash_oid = 1;
[8716]	509	}
[2327]	510
[8797]	511	} elsif ($self->{'OIDtype'} eq "assigned") {
[12268]	512	my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
[8797]	513	if (defined $identifier && $identifier ne "") {
[17025]	514	$OID = $identifier;
[19617]	515	$OID = &util::tidy_up_oid($OID);
[8797]	516	} else {
	517	# need a hash id
[12268]	518	print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
[8797]	519	$use_hash_oid = 1;
	520	}
	521
[2327]	522	} else {
[8797]	523	$use_hash_oid = 1;
	524	}
	525
	526	if ($use_hash_oid) {
[23562]	527	my $hash_on_file = 1;
[26536]	528	my $hash_on_ga_xml = 0;
	529
[17025]	530	if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
	531	$hash_on_file = 0;
[26536]	532	$hash_on_ga_xml = 1;
[17025]	533	}
[26536]	534
	535	if ($self->{'OIDtype'} eq "hash_on_full_filename") {
	536	$hash_on_file = 0;
	537	$hash_on_ga_xml = 0;
	538
	539	my $source_filename = $self->get_source_filename();
	540	my $dirsep = &util::get_os_dirsep();
	541
	542	$source_filename =~ s/^import$dirsep//;
	543	$source_filename =~ s/$dirsep/-/g;
	544	$source_filename =~ s/\./_/g;
	545
	546	# If the filename is very short then (handled naively)
	547	# this can cause conjestion in the hash-values
	548	# computed, leading documents sharing the same leading
	549	# Hex values in the computed has.
	550	#
	551	# The solution taken here is to replace the name of
	552	# the file name a sufficient number of times (up to
	553	# the character limit defined in 'rep_limit' and
	554	# make that the content that is hashed on
	555
	556	# *** Think twice before changing the following value
	557	# as it will break backward compatability of computed
	558	# document HASH values
	559
	560	my $rep_limit = 256;
	561	my $hash_content = undef;
	562
	563	if (length($source_filename)<$rep_limit) {
	564	my $rep_string = "$source_filename\|";
	565	my $rs_len = length($rep_string);
	566
	567	my $clone_times = int(($rep_limit-1)/$rs_len) +1;
	568
	569	$hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
	570	}
	571	else {
	572	$hash_content = $source_filename;
	573	}
	574
	575	my $filename = &util::get_tmp_filename();
	576	if (!open (OUTFILE, ">:utf8", $filename)) {
	577	print STDERR "doc::set_OID could not write to $filename\n";
	578	} else {
	579	print OUTFILE $hash_content;
	580	close (OUTFILE);
	581	}
	582	$OID = $self->_calc_OID ($filename);
	583
[27538]	584	&FileUtils::removeFiles ($filename);
[26536]	585	}
	586
[17025]	587	if ($hash_on_file) {
	588	# "hash" OID - feed file to hashfile.exe
	589	my $filename = $self->get_filename_for_hashing();
[23562]	590
[17025]	591	# -z: don't want to hash on the file if it is zero size
	592	if (defined($filename) && -e $filename && !-z $filename) {
	593	$OID = $self->_calc_OID ($filename);
	594	} else {
[26536]	595	$hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
[17025]	596	}
	597	}
[26536]	598
	599	if ($hash_on_ga_xml) {
	600	# In addition being asked to explicity calculate the has based on the GA file,
	601	# can also end up coming into this block is doing 'hash_on_file' but the file
	602	# itself is of zero bytes (as could be the case with 'doc.nul' file
	603
[17025]	604	my $filename = &util::get_tmp_filename();
[22855]	605	if (!open (OUTFILE, ">:utf8", $filename)) {
[2327]	606	print STDERR "doc::set_OID could not write to $filename\n";
	607	} else {
[32512]	608	my $doc_text = &docprint::get_section_xml($self);
[13172]	609	print OUTFILE $doc_text;
[2327]	610	close (OUTFILE);
	611	}
	612	$OID = $self->_calc_OID ($filename);
[27393]	613	&FileUtils::removeFiles($filename);
[1374]	614	}
[1241]	615	}
	616	}
	617	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
	618	}
	619
	620	# this uses hashdoc (embedded c thingy) which is faster but still
[32535]	621	# needs a little work to be sufficiently stable
[1241]	622	sub ___set_OID {
	623	my $self = shift (@_);
	624	my ($OID) = @_;
	625
	626	# if an OID wasn't provided then calculate hash value based on document
	627	if (!defined $OID)
	628	{
[32512]	629	my $hash_text = &docprint::get_section_xml($self);
[1241]	630	my $hash_len = length($hash_text);
	631
	632	$OID = &hashdoc::buffer($hash_text,$hash_len);
	633	}
	634
	635	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
	636	}
	637
	638	# returns the OID for this document
	639	sub get_OID {
	640	my $self = shift (@_);
	641	my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
	642	return $OID if (defined $OID);
	643	return "NULL";
	644	}
	645
	646	sub delete_OID {
	647	my $self = shift (@_);
	648
	649	$self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
	650	}
	651
	652
	653	# methods for manipulating section names
	654
	655	# returns the name of the top-most section (the top
	656	# level of the document
	657	sub get_top_section {
	658	my $self = shift (@_);
	659
	660	return "";
	661	}
	662
	663	# returns a section
	664	sub get_parent_section {
	665	my $self = shift (@_);
	666	my ($section) = @_;
	667
	668	$section =~ s/(^\|\.)\d+$//;
	669
	670	return $section;
	671	}
	672
	673	# returns the first child section (or the end child
	674	# if there isn't any)
	675	sub get_begin_child {
	676	my $self = shift (@_);
	677	my ($section) = @_;
	678
	679	my $section_ptr = $self->_lookup_section($section);
	680	return "" unless defined $section_ptr;
	681
	682	if (defined $section_ptr->{'subsection_order'}->[0]) {
	683	return "$section.$section_ptr->{'subsection_order'}->[0]";
	684	}
	685
	686	return $self->get_end_child ($section);
	687	}
	688
	689	# returns the next child of a parent section
	690	sub get_next_child {
	691	my $self = shift (@_);
	692	my ($section) = @_;
	693
	694	my $parent_section = $self->get_parent_section($section);
	695	my $parent_section_ptr = $self->_lookup_section($parent_section);
	696	return undef unless defined $parent_section_ptr;
	697
	698	my ($section_num) = $section =~ /(\d+)$/;
	699	return undef unless defined $section_num;
	700
	701	my $i = 0;
	702	my $section_order = $parent_section_ptr->{'subsection_order'};
	703	while ($i < scalar(@$section_order)) {
	704	last if $section_order->[$i] eq $section_num;
	705	$i++;
	706	}
	707
	708	$i++; # the next child
	709	if ($i < scalar(@$section_order)) {
	710	return $section_order->[$i] if $parent_section eq "";
	711	return "$parent_section.$section_order->[$i]";
	712	}
	713
	714	# no more sections in this level
	715	return undef;
	716	}
	717
	718	# returns a reference to a list of children
	719	sub get_children {
	720	my $self = shift (@_);
	721	my ($section) = @_;
	722
	723	my $section_ptr = $self->_lookup_section($section);
	724	return [] unless defined $section_ptr;
	725
	726	my @children = @{$section_ptr->{'subsection_order'}};
	727
	728	map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
	729	return \@children;
	730	}
	731
	732	# returns the child section one past the last one (which
	733	# is coded as "0")
	734	sub get_end_child {
	735	my $self = shift (@_);
	736	my ($section) = @_;
	737
	738	return $section . ".0" unless $section eq "";
	739	return "0";
	740	}
	741
	742	# returns the next section in book order
	743	sub get_next_section {
	744	my $self = shift (@_);
	745	my ($section) = @_;
	746
	747	return undef unless defined $section;
	748
	749	my $section_ptr = $self->_lookup_section($section);
	750	return undef unless defined $section_ptr;
	751
	752	# first try to find first child
	753	if (defined $section_ptr->{'subsection_order'}->[0]) {
	754	return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
	755	return "$section.$section_ptr->{'subsection_order'}->[0]";
	756	}
	757
	758	do {
	759	# try to find sibling
	760	my $next_child = $self->get_next_child ($section);
	761	return $next_child if (defined $next_child);
	762
	763	# move up one level
	764	$section = $self->get_parent_section ($section);
	765	} while $section =~ /\d/;
	766
	767	return undef;
	768	}
	769
	770	sub is_leaf_section {
	771	my $self = shift (@_);
	772	my ($section) = @_;
	773
	774	my $section_ptr = $self->_lookup_section($section);
	775	return 1 unless defined $section_ptr;
	776
	777	return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
	778	}
	779
	780	# methods for dealing with sections
	781
	782	# returns the name of the inserted section
	783	sub insert_section {
	784	my $self = shift (@_);
	785	my ($before_section) = @_;
	786
	787	# get the child to insert before and its parent section
	788	my $parent_section = "";
	789	my $before_child = "0";
	790	my @before_section = split (/\./, $before_section);
	791	if (scalar(@before_section) > 0) {
	792	$before_child = pop (@before_section);
	793	$parent_section = join (".", @before_section);
	794	}
	795
	796	my $parent_section_ptr = $self->_lookup_section($parent_section);
	797	if (!defined $parent_section_ptr) {
	798	print STDERR "doc::insert_section couldn't find parent section " .
	799	"$parent_section\n";
	800	return;
	801	}
	802
	803	# get the next section number
	804	my $section_num = $parent_section_ptr->{'next_subsection'}++;
	805
	806	my $i = 0;
	807	while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
	808	$parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
	809	$i++;
	810	}
	811
	812	# insert the section number into the order list
	813	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
	814
	815	# add this section to the parent section
	816	my $section_ptr = {'subsection_order'=>[],
	817	'next_subsection'=>1,
	818	'subsections'=>{},
	819	'metadata'=>[],
	820	'text'=>""};
	821	$parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
	822
	823	# work out the full section number
	824	my $section = $parent_section;
	825	$section .= "." unless $section eq "";
	826	$section .= $section_num;
	827
	828	return $section;
	829	}
	830
	831	# creates a pre-named section
	832	sub create_named_section {
	833	my $self = shift (@_);
	834	my ($mastersection) = @_;
	835
	836	my ($num);
	837	my $section = $mastersection;
	838	my $sectionref = $self;
	839
	840	while ($section ne "") {
	841	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
	842	$num =~ s/^0+(\d)/$1/; # remove leading 0s
	843	$section = "" unless defined $section;
	844
	845	if (defined $num) {
	846	if (!defined $sectionref->{'subsections'}->{$num}) {
	847	push (@{$sectionref->{'subsection_order'}}, $num);
	848	$sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
	849	'next_subsection'=>1,
	850	'subsections'=>{},
	851	'metadata'=>[],
	852	'text'=>""};
	853	if ($num >= $sectionref->{'next_subsection'}) {
	854	$sectionref->{'next_subsection'} = $num + 1;
	855	}
	856	}
	857	$sectionref = $sectionref->{'subsections'}->{$num};
	858
	859	} else {
	860	print STDERR "doc::create_named_section couldn't create section ";
	861	print STDERR "$mastersection\n";
	862	last;
	863	}
	864	}
	865	}
	866
	867	# returns a reference to a list of subsections
	868	sub list_subsections {
	869	my $self = shift (@_);
	870	my ($section) = @_;
	871
	872	my $section_ptr = $self->_lookup_section ($section);
	873	if (!defined $section_ptr) {
	874	print STDERR "doc::list_subsections couldn't find section $section\n";
	875	return [];
	876	}
	877
	878	return [@{$section_ptr->{'subsection_order'}}];
	879	}
	880
	881	sub delete_section {
	882	my $self = shift (@_);
	883	my ($section) = @_;
	884
	885	# my $section_ptr = {'subsection_order'=>[],
	886	# 'next_subsection'=>1,
	887	# 'subsections'=>{},
	888	# 'metadata'=>[],
	889	# 'text'=>""};
	890
	891	# if this is the top section reset everything
	892	if ($section eq "") {
	893	$self->{'subsection_order'} = [];
	894	$self->{'subsections'} = {};
	895	$self->{'metadata'} = [];
	896	$self->{'text'} = "";
	897	return;
	898	}
	899
	900	# find the parent of the section to delete
	901	my $parent_section = "";
	902	my $child = "0";
	903	my @section = split (/\./, $section);
	904	if (scalar(@section) > 0) {
	905	$child = pop (@section);
	906	$parent_section = join (".", @section);
	907	}
	908
	909	my $parent_section_ptr = $self->_lookup_section($parent_section);
	910	if (!defined $parent_section_ptr) {
	911	print STDERR "doc::delete_section couldn't find parent section " .
	912	"$parent_section\n";
	913	return;
	914	}
	915
	916	# remove this section from the subsection_order list
	917	my $i = 0;
	918	while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
	919	if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
	920	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
	921	last;
	922	}
	923	$i++;
	924	}
	925
	926	# remove this section from the subsection hash
	927	if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
	928	undef $parent_section_ptr->{'subsections'}->{$child};
	929	}
	930	}
	931
	932	#--
[4]	933	# methods for dealing with metadata
	934
	935	# set_metadata_element and get_metadata_element are for metadata
	936	# which should only have one value. add_meta_data and get_metadata
	937	# are for metadata which can have more than one value.
	938
[1241]	939	# returns the first metadata value which matches field
[6111]	940
	941	# This version of get metadata element works much like the one above,
	942	# except it allows for the namespace portion of a metadata element to
	943	# be ignored, thus if you are searching for dc.Title, the first piece
	944	# of matching metadata ending with the name Title (once any namespace
	945	# is removed) would be returned.
	946	# 28-11-2003 John Thompson
[1241]	947	sub get_metadata_element {
	948	my $self = shift (@_);
[6111]	949	my ($section, $field, $ignore_namespace) = @_;
[1241]	950	my ($data);
	951
[6111]	952	$ignore_namespace = 0 unless defined $ignore_namespace;
	953
[1241]	954	my $section_ptr = $self->_lookup_section($section);
	955	if (!defined $section_ptr) {
[8716]	956	print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
[1241]	957	return;
	958	}
	959
[24404]	960	# Remove any namespace if we are being told to ignore them
[6111]	961	if($ignore_namespace) {
[24404]	962	$field =~ s/^.\.//; #$field =~ s/^\w\.//;
[6111]	963	}
	964
[1241]	965	foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]	966
	967	my $data_name = $data->[0];
[14966]	968
[24404]	969	# Remove any namespace if we are being told to ignore them
[6111]	970	if($ignore_namespace) {
[24404]	971	$data_name =~ s/^.\.//; #$data_name =~ s/^\w\.//;
[6111]	972	}
[24404]	973	# we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
	974	$data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
	975
[6111]	976	return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
[1241]	977	}
	978
	979	return undef; # was not found
	980	}
	981
	982	# returns a list of the form [value1, value2, ...]
	983	sub get_metadata {
	984	my $self = shift (@_);
[6111]	985	my ($section, $field, $ignore_namespace) = @_;
[1241]	986	my ($data);
	987
[6111]	988	$ignore_namespace = 0 unless defined $ignore_namespace;
	989
[1241]	990	my $section_ptr = $self->_lookup_section($section);
	991	if (!defined $section_ptr) {
[8716]	992	print STDERR "doc::get_metadata couldn't find section ",
	993	$section, "\n";
[1241]	994	return;
	995	}
	996
[24404]	997	# Remove any namespace if we are being told to ignore them
[6111]	998	if($ignore_namespace) {
[24404]	999	$field =~ s/^.*\.//;
[6111]	1000	}
	1001
[1241]	1002	my @metadata = ();
	1003	foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]	1004
	1005	my $data_name = $data->[0];
[24404]	1006
	1007	# Remove any namespace if we are being told to ignore them
[6111]	1008	if($ignore_namespace) {
[24404]	1009	$data_name =~ s/^.*\.//;
	1010	}
	1011	# we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
	1012	$data_name =~ s/^ex\.([^.]+)$/$1/;
	1013
[6111]	1014	push (@metadata, $data->[1]) if ($data_name eq $field);
[1241]	1015	}
[9241]	1016
[1241]	1017	return \@metadata;
	1018	}
	1019
[23827]	1020	sub get_metadata_hashmap {
	1021	my $self = shift (@_);
	1022	my ($section, $opt_namespace) = @_;
	1023
	1024	my $section_ptr = $self->_lookup_section($section);
	1025	if (!defined $section_ptr) {
	1026	print STDERR "doc::get_metadata couldn't find section ",
	1027	$section, "\n";
	1028	return;
	1029	}
	1030
	1031	my $metadata_hashmap = {};
	1032	foreach my $data (@{$section_ptr->{'metadata'}}) {
	1033	my $metaname = $data->[0];
	1034
	1035	if ((!defined $opt_namespace) \|\| ($metaname =~ m/^$opt_namespace\./)) {
	1036	if (!defined $metadata_hashmap->{$metaname}) {
	1037	$metadata_hashmap->{$metaname} = [];
	1038	}
	1039	my $metaval_list = $metadata_hashmap->{$metaname};
	1040	push(@$metaval_list, $data->[1]);
	1041	}
	1042	}
	1043
	1044	return $metadata_hashmap;
	1045	}
	1046
[1241]	1047	# returns a list of the form [[field,value],[field,value],...]
	1048	sub get_all_metadata {
	1049	my $self = shift (@_);
	1050	my ($section) = @_;
	1051
	1052	my $section_ptr = $self->_lookup_section($section);
	1053	if (!defined $section_ptr) {
[8716]	1054	print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
[1241]	1055	return;
	1056	}
	1057
	1058	return $section_ptr->{'metadata'};
	1059	}
	1060
	1061	# $value is optional
	1062	sub delete_metadata {
	1063	my $self = shift (@_);
	1064	my ($section, $field, $value) = @_;
	1065
	1066	my $section_ptr = $self->_lookup_section($section);
	1067	if (!defined $section_ptr) {
[23562]	1068	print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
[1241]	1069	return;
	1070	}
	1071
	1072	my $i = 0;
	1073	while ($i < scalar (@{$section_ptr->{'metadata'}})) {
	1074	if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
	1075	(!defined $value \|\| $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
	1076	splice (@{$section_ptr->{'metadata'}}, $i, 1);
	1077	} else {
	1078	$i++;
	1079	}
	1080	}
	1081	}
	1082
	1083	sub delete_all_metadata {
	1084	my $self = shift (@_);
	1085	my ($section) = @_;
	1086
	1087	my $section_ptr = $self->_lookup_section($section);
	1088	if (!defined $section_ptr) {
[8716]	1089	print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
[1241]	1090	return;
	1091	}
	1092
	1093	$section_ptr->{'metadata'} = [];
	1094	}
	1095
[4]	1096	sub set_metadata_element {
	1097	my $self = shift (@_);
	1098	my ($section, $field, $value) = @_;
	1099
[97]	1100	$self->set_utf8_metadata_element ($section, $field,
[1870]	1101	&unicode::ascii2utf8(\$value));
[73]	1102	}
	1103
	1104	# set_utf8_metadata_element assumes the text has already been
	1105	# converted to the UTF-8 encoding.
	1106	sub set_utf8_metadata_element {
	1107	my $self = shift (@_);
	1108	my ($section, $field, $value) = @_;
[25557]	1109
[4]	1110	$self->delete_metadata ($section, $field);
[73]	1111	$self->add_utf8_metadata ($section, $field, $value);
[4]	1112	}
	1113
	1114
[73]	1115	# add_metadata assumes the text is in (extended) ascii form. For
[8220]	1116	# text which has already been converted to the UTF-8 format use
[73]	1117	# add_utf8_metadata.
[4]	1118	sub add_metadata {
	1119	my $self = shift (@_);
	1120	my ($section, $field, $value) = @_;
[25557]	1121
[97]	1122	$self->add_utf8_metadata ($section, $field,
[1870]	1123	&unicode::ascii2utf8(\$value));
[73]	1124	}
	1125
	1126	sub add_utf8_metadata {
	1127	my $self = shift (@_);
	1128	my ($section, $field, $value) = @_;
[25557]	1129
[33171]	1130	# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
[25557]	1131	# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
	1132	# print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
[73]	1133
[4]	1134	my $section_ptr = $self->_lookup_section($section);
	1135	if (!defined $section_ptr) {
[8716]	1136	print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
[4]	1137	return;
	1138	}
[1732]	1139	if (!defined $value) {
	1140	print STDERR "doc::add_utf8_metadata undefined value for $field\n";
	1141	return;
	1142	}
	1143	if (!defined $field) {
	1144	print STDERR "doc::add_utf8_metadata undefined metadata type \n";
	1145	return;
	1146	}
[4]	1147
[8894]	1148	#print STDERR "###$field=$value\n";
[22950]	1149
	1150	# For now, supress this check. Given that text data read in is now
	1151	# Unicode aware, then the following block of code can (ironically enough)
	1152	# cause our unicode compliant string to be re-encoded (leading to
	1153	# a double-encoded UTF-8 string, which we definitely don't want!).
	1154
	1155
[7798]	1156	# double check that the value is utf-8
[25557]	1157	# if (!&unicode::check_is_utf8($value)) {
	1158	# print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
	1159	# &unicode::ensure_utf8(\$value);
	1160	# print STDERR " Tried converting to utf8: $value\n";
	1161	# }
[7798]	1162
[25557]	1163	#If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
[26850]	1164	if ($field =~ m/^(.+\.)?Latitude$/ \|\| $field =~ m/^(.+\.)?Longitude$/)
[25557]	1165	{
[26850]	1166	my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
	1167	if (defined $mdprefix) {
	1168	# Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
	1169	push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
	1170	}
	1171
[25557]	1172	my $direction;
[34276]	1173	# TODO does the following work, with eq latitude? the above code implies can be xxxxx.Latitude. should use metaname??
[25557]	1174	if($value =~ m/^-/)
	1175	{
	1176	$direction = ($field eq "Latitude") ? "S" : "W";
	1177	}
	1178	else
	1179	{
	1180	$direction = ($field eq "Latitude") ? "N" : "E";
	1181	}
	1182
	1183	my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
	1184	if(defined $beforeDec && defined $afterDec)
	1185	{
	1186	my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
	1187	push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
	1188
	1189	for(my $i = 2; $i <= 4; $i++)
	1190	{
	1191	if(length($afterDec) >= $i)
	1192	{
	1193	push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
	1194	}
	1195	}
[34276]	1196
	1197	# what is this???? completely does nothing...
[25557]	1198	#Only add the metadata if it has not already been added
[33217]	1199	my $metaMap = $self->get_metadata_hashmap($section); # metaMap not used and called function has no apparent side-effects,
	1200	# but this line appears important for ensuring uniqueness of (Latitude, value) meta
	1201	# in the section. Also for LatShort, Longitude, LngShort.
[25557]	1202	}
[34276]	1203
	1204	# if we have both Latitude and Longitude, then add coordinate metadata (not sure why, but someone else added it)
	1205	my $latitude;
	1206	my $longitude;
[33126]	1207	if($field =~ m/^(.+\.)?Longitude$/) {
[34276]	1208	# if we are dealing with Longitude meta, and we have already seen Latitude, then add Coordinate metadata
	1209	$latitude = $self->get_metadata_element ($section, "Latitude");
	1210	if (defined $latitude) {
	1211	$longitude = $value;
	1212	}
	1213	} else {
	1214	# we are dealing with Latitude - if we have already seen Longitude, then add both as Coordinate metadata
	1215	$longitude = $self->get_metadata_element ($section, "Longitude");
	1216	if (defined $longitude) {
	1217	$latitude = $value;
	1218	}
[33126]	1219	}
[34276]	1220	if (defined $latitude && defined $longitude) {
	1221	##rint STDERR "adding coords for $section, $latitude, $longitude\n";
	1222	$self->processCoordinate($section, $latitude, $longitude);
	1223	} else {
	1224	##rint STDERR "lat or long not defined, not adding coords for $section\n";
	1225	}
	1226
[25557]	1227	}
[33302]	1228
	1229	elsif($field eq "GPS.mapOverlay") { # then the $value is a JSON string
[25557]	1230
[33313]	1231	# In order to allow searching map-data-enriched documents by map shape descriptions,
[33302]	1232	# and to run rawquery searches for other docs by proximity based on their map data,
	1233	# need to store the shape descriptions and Coordinate info for shapes into the text index.
	1234	# We add the description for each shape in the mapoverlay into the text index as GPSMapOverlayLabel
	1235	# And we add Coordinate (CD) and CoordShort (CS) info for each shape in the mapoverlay in the format (Lat, Lng) as ("37S339 175E342")
	1236	# where the digits before the N/S/E/W direction represents the whole number. And the digits after the direction are the
[33313]	1237	# decimal places which can vary between 0 and 2 to 4 digits.
[33302]	1238
[33313]	1239	# However, we only want to process GPS.mapOverlay during buildcol and only in the text indexing passes (e.g. sidx and didx for lucene)
	1240	# and certainly never during the infodb pass of buildcol. The latter can end up duplicating Coordinate/CoordShort/GPSMapOverlayLabel
	1241	# when rebuilding with the online doc editor, as that runs incremental-rebuild which then calls basebuilder::reconstruct_doc_objs_metadata()
[33302]	1242	# on all docs NOT being incrementally rebuilt. That call would get meta from the infodb and use it to reconstruct the doc objects of docs
[33313]	1243	# NOT being incrementally built. If the Coord and Label meta were written to the infodb, they would then be loaded back in memory when the
	1244	# collection is incrementally rebuilt for those docs that don't need incremental processing. Then this function would once again add the
	1245	# same meta into the infodb, thus duplicating what goes into the infodb. Hence, don't do all the following if doc::processor_mode =~ "infodb",
	1246	# or anything other than a text mode.
[33217]	1247
[33302]	1248	# Note that for incremental rebuilding, the text pass can be called textreindex for instance (and infodb pass can be incinfodb).
	1249	# So don't check for exact string match
	1250
[33312]	1251	if($doc::cmd_line_mode eq "buildcol" && $doc::processor_mode =~ m/^text/) { # currently known text processor_modes:
	1252	# text, textreindex, possibly textdelete (see ArchivesInfPlugin.pm::read() for last 2).
	1253	# OR: ..."buildcol" && $doc::processor_mode !~ m/infodb$/) # if dummy pass important
[33171]	1254
[33302]	1255	###print STDERR "GPS.mapOverlay before decoding, val = " . $value . "\n";
[33125]	1256
[33302]	1257	# TODO: html decode?
[33125]	1258	$value =~ s@[@[@g;
	1259	$value =~ s@]@]@g;
	1260	$value =~ s@"@"@g;
[33302]	1261	###print STDERR "GPS.mapOverlay after decoding, val = " . $value . "\n";
[33125]	1262
[33293]	1263	my $json_array = decode_json $value;
[33302]	1264
	1265	foreach my $shape (@$json_array) {
[33125]	1266
[33302]	1267	# Put each available shape description/label into this section's metadata with GPSMapOverlayLabel as metaname.
	1268	# Just as for Coordinate meta, don't need to know which shape a label belongs too. This is just so each label
	1269	# will be indexed, and therefore can be searched.
	1270
	1271	my $description = $shape->{"description"};
	1272	if($description) {
	1273	push (@{$section_ptr->{'metadata'}}, ["GPSMapOverlayLabel", $description]);
	1274	###print STDERR "@@@@############################################ Just added description meta: " . $description . "\n";
	1275	}
	1276
[33293]	1277	my $type = $shape->{"type"};
[33302]	1278	###print STDERR "Shape type : " . $type . "\n";
[33293]	1279
	1280	if($type eq "circle") {
[33302]	1281	###print STDERR "Found a circle:\n" . &printShape($json, $shape);
[33293]	1282
	1283	# work out bounding box
	1284	# want the inverse of this useful page:
	1285	# https://stackoverflow.com/questions/639695/how-to-convert-latitude-or-longitude-to-meters
	1286	# https://www.geodatasource.com/developers/javascript
[33125]	1287
[33293]	1288	# Dr Bainbridge wants us to use: https://gis.stackexchange.com/questions/5821/calculating-latitude-longitude-x-miles-from-point
	1289	# Since for N,E,S,W the following works out the same as above link, we're using the rule of thumb in the following:
	1290	# https://gis.stackexchange.com/questions/2951/algorithm-for-offsetting-a-latitude-longitude-by-some-amount-of-meters
	1291	# which states
	1292	# "If your displacements aren't too great (less than a few kilometers) and you're not right at the poles,
	1293	# use the quick and dirty estimate that 111,111 meters (111.111 km) in the y direction is 1 degree
	1294	# (of latitude) and 111,111 * cos(latitude) meters in the x direction is 1 degree (of longitude)."
	1295	my $centre_lat = $shape->{"center"}->{"lat"};
	1296	my $centre_lng = $shape->{"center"}->{"lng"};
	1297	my $radius = $shape->{"radius"}; # in metres!
	1298
[33302]	1299	###print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
[33293]	1300
	1301	my $lat_north = $centre_lat + ($radius/111111);
	1302	my $lat_south = $centre_lat - ($radius/111111);
	1303
[33302]	1304	###print STDERR "### lat_north: $lat_north\n";
	1305	###print STDERR "### lat_south: $lat_south\n";
[33125]	1306
[33293]	1307	# our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
	1308	# all expect their angle to be in radians. So need to convert from degree to radians before we can take the cos of it.
	1309	my $centre_lat_radians = $self->degreesToRadians($centre_lat);
	1310	my $cos_in_radians = cos($centre_lat_radians);
[33302]	1311	###print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
[33293]	1312	my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
	1313	my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
[33302]	1314	###print STDERR "### lng_east $lng_east\n";
	1315	###print STDERR "### lng_west $lng_west\n";
[33293]	1316
	1317	my $cos_lat = cos($centre_lat);
[33302]	1318	###print STDERR "cos $centre_lat is $cos_lat\n";
[33293]	1319
	1320	$self->processCoordinate($section, $lat_north, $lng_east);
	1321	$self->processCoordinate($section, $lat_south, $lng_east);
	1322	$self->processCoordinate($section, $lat_south, $lng_west);
	1323	$self->processCoordinate($section, $lat_north, $lng_west);
	1324
	1325	}
	1326	elsif ($type eq "marker") {
[33302]	1327	###print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
	1328	###print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
[33293]	1329	$self->processCoordinate($section, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});
	1330	}
	1331	elsif ($type eq "polyline" \|\| $type eq "polygon") {
	1332	my $path_array = $shape->{"path"};
	1333	foreach my $position (@$path_array) {
	1334	$self->processCoordinate($section, $position->{"lat"}, $position->{"lng"});
	1335	}
	1336	}
	1337	elsif ($type eq "rectangle") {
	1338
	1339	my $bounds = $shape->{"bounds"};
	1340	$self->processCoordinate($section, $bounds->{"north"}, $bounds->{"east"});
	1341	$self->processCoordinate($section, $bounds->{"south"}, $bounds->{"east"});
	1342	$self->processCoordinate($section, $bounds->{"south"}, $bounds->{"west"});
	1343	$self->processCoordinate($section, $bounds->{"north"}, $bounds->{"west"});
	1344	}
	1345
[33125]	1346	} # end for on each shape in GPS.mapOverlay
[33302]	1347	} # end if(buildcol and text pass)
[33125]	1348	} # end GPS.mapOverlay meta
	1349
[4]	1350	push (@{$section_ptr->{'metadata'}}, [$field, $value]);
	1351	}
	1352
[33125]	1353	# https://en.wikipedia.org/wiki/Radian
	1354	sub degreesToRadians
	1355	{
	1356	my $self = shift (@_);
	1357	my ($degrees) = @_;
[4]	1358
[33125]	1359	return $degrees * pi /180; # returns radians
	1360	}
	1361
	1362	sub radiansToDegrees
	1363	{
	1364	my $self = shift (@_);
	1365	my ($radians) = @_;
	1366
	1367	return $radians * 180 / pi; # returns degrees
	1368	}
[33293]	1369
[33302]	1370	# Call as:
	1371	# my $json = JSON->new->allow_nonref;
	1372	# &printAllShapes($json, $json_array);
[33293]	1373	sub printAllShapes {
	1374	my ($json, $json_array) = @_;
	1375
	1376
	1377	#my $pretty_print_shape = $json->pretty->encode( $json_array->[0] );
	1378	foreach my $shape (@$json_array) {
	1379	my $pretty_print_shape = $json->pretty->encode( $shape );
	1380	print STDERR "Shape: $pretty_print_shape\n";
	1381	}
	1382
[33185]	1383	}
[33125]	1384
[33217]	1385	# For the (lat, lng) coordinate given,
	1386	# attaches Coordinate and multiple CoordShort (different precision level) metadata to the doc object
[33125]	1387	sub processCoordinate {
	1388	my $self = shift (@_);
[33217]	1389	my ($section, $latitude, $longitude) = @_;
[33125]	1390
[33128]	1391	my $section_ptr = $self->_lookup_section($section);
	1392
[33125]	1393	my $lat_direction = ($latitude =~ m/^-/) ? "S" : "N";
	1394	my $lng_direction = ($longitude =~ m/^-/) ? "W" : "E";
	1395
	1396	# have to store (lat, lng) in pairs, when there are so many coords to store
	1397	#push (@{$section_ptr->{'metadata'}}, ["Latitude", $latitude]);
	1398	#push (@{$section_ptr->{'metadata'}}, ["Longitude", $longitude]);
	1399
	1400	push (@{$section_ptr->{'metadata'}}, ["Coordinate", "$latitude $longitude"]); # "$latitude$lat_direction $longitude$lng_direction"
	1401
	1402	my ($latBeforeDec, $latAfterDec);
	1403	my ($lngBeforeDec, $lngAfterDec);
	1404
	1405	if($latitude !~ m/\./) {
	1406	$latBeforeDec = $latitude;
	1407	$latAfterDec = "";
	1408	} else {
	1409	($latBeforeDec, $latAfterDec) = ($latitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
	1410	}
	1411	if($longitude !~ m/\./) {
	1412	$lngBeforeDec = $longitude;
	1413	$lngAfterDec = "";
	1414	} else {
	1415	($lngBeforeDec, $lngAfterDec) = ($longitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
	1416	}
	1417
	1418
[33217]	1419	my $name = "CoordShort";
	1420	push (@{$section_ptr->{'metadata'}}, [$name, "$latBeforeDec$lat_direction $lngBeforeDec$lng_direction"]);
	1421
[33185]	1422	for(my $i = 2; $i <= 4; $i++)
	1423	{
	1424	my $latDecPlaces = (length($latAfterDec) >= $i) ? substr($latAfterDec, 0, $i) : "";
	1425	my $lngDecPlaces = (length($lngAfterDec) >= $i) ? substr($lngAfterDec, 0, $i) : "";
	1426
[33217]	1427	push (@{$section_ptr->{'metadata'}}, [$name,
	1428	$latBeforeDec . $lat_direction. $latDecPlaces . " " . $lngBeforeDec . $lng_direction. $lngDecPlaces]);
[33185]	1429
	1430	}
	1431
	1432	#Only add the metadata if it has not already been added
[33217]	1433	#my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no apparent side-effects.)
[33125]	1434	}
	1435
	1436
[4]	1437	# methods for dealing with text
	1438
[1241]	1439	# returns the text for a section
	1440	sub get_text {
	1441	my $self = shift (@_);
	1442	my ($section) = @_;
	1443
	1444	my $section_ptr = $self->_lookup_section($section);
	1445	if (!defined $section_ptr) {
	1446	print STDERR "doc::get_text couldn't find section " .
	1447	"$section\n";
	1448	return "";
	1449	}
	1450
	1451	return $section_ptr->{'text'};
	1452	}
	1453
	1454	# returns the (utf-8 encoded) length of the text for a section
	1455	sub get_text_length {
	1456	my $self = shift (@_);
	1457	my ($section) = @_;
	1458
	1459	my $section_ptr = $self->_lookup_section($section);
	1460	if (!defined $section_ptr) {
	1461	print STDERR "doc::get_text_length couldn't find section " .
	1462	"$section\n";
	1463	return 0;
	1464	}
	1465
	1466	return length ($section_ptr->{'text'});
	1467	}
	1468
[23131]	1469	# returns the total length for all the sections
	1470	sub get_total_text_length {
	1471	my $self = shift (@_);
	1472
	1473	my $section = $self->get_top_section();
	1474	my $length = 0;
	1475	while (defined $section) {
	1476	$length += $self->get_text_length($section);
	1477	$section = $self->get_next_section($section);
	1478	}
	1479	return $length;
	1480	}
	1481
[1241]	1482	sub delete_text {
	1483	my $self = shift (@_);
	1484	my ($section) = @_;
	1485
	1486	my $section_ptr = $self->_lookup_section($section);
	1487	if (!defined $section_ptr) {
	1488	print STDERR "doc::delete_text couldn't find section " .
	1489	"$section\n";
	1490	return;
	1491	}
	1492
	1493	$section_ptr->{'text'} = "";
	1494	}
	1495
[73]	1496	# add_text assumes the text is in (extended) ascii form. For
	1497	# text which has been already converted to the UTF-8 format
	1498	# use add_utf8_text.
[4]	1499	sub add_text {
	1500	my $self = shift (@_);
	1501	my ($section, $text) = @_;
	1502
[73]	1503	# convert the text to UTF-8 encoded unicode characters
	1504	# and add the text
[1870]	1505	$self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
[73]	1506	}
	1507
	1508
	1509	# add_utf8_text assumes the text to be added has already
	1510	# been converted to the UTF-8 encoding. For ascii text use
	1511	# add_text
[32590]	1512	# Pass by value version (internally calls pass by ref version
	1513	# to avoid code duplication)
[73]	1514	sub add_utf8_text {
	1515	my $self = shift (@_);
	1516	my ($section, $text) = @_;
[32590]	1517
	1518	$self->add_utf8_textref($section, \$text);
	1519	}
[73]	1520
[32590]	1521	# Pass by reference version, used by GreenstoneSQLPlugin for fulltext
	1522	sub add_utf8_textref {
	1523	my $self = shift (@_);
	1524	my ($section, $text_ref) = @_;
	1525
[4]	1526	my $section_ptr = $self->_lookup_section($section);
	1527	if (!defined $section_ptr) {
[32590]	1528	print STDERR "doc::add_utf8_textref couldn't find section " .
[4]	1529	"$section\n";
	1530	return;
	1531	}
	1532
[32590]	1533	$section_ptr->{'text'} .= $$text_ref;
[4]	1534	}
	1535
[16950]	1536	# returns the Source meta, which is the utf8 filename generated.
	1537	# Added a separate method here for convenience
	1538	sub get_source {
[16924]	1539	my $self = shift (@_);
[16950]	1540	return $self->get_metadata_element ($self->get_top_section(), "Source");
	1541	}
	1542
	1543	# returns the SourceFile meta, which is the url reference to the URL-encoded
	1544	# version of Source (the utf8 filename). Added a separate method here for convenience
	1545	sub get_sourcefile {
	1546	my $self = shift (@_);
	1547	return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
	1548	}
	1549
	1550	# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
	1551	# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
	1552	sub get_assocfile_from_sourcefile {
	1553	my $self = shift (@_);
[16924]	1554
	1555	# get the SourceFile meta, which is a URL to a file on the filesystem
	1556	my $top_section = $self->get_top_section();
	1557	my $source_file = $self->get_metadata_element($top_section, "SourceFile");
[4]	1558
[16924]	1559	# get the actual filename as it exists on the filesystem which this url refers to
[16928]	1560	$source_file = &unicode::url_to_filename($source_file);
[16924]	1561	my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
	1562	return $assocfilename;
	1563	}
	1564
[1241]	1565	# methods for dealing with associated files
	1566
	1567	# a file is associated with a document, NOT a section.
	1568	# if section is defined it is noted in the data structure
	1569	# only so that files associated from a particular section
	1570	# may be removed later (using delete_section_assoc_files)
	1571	sub associate_file {
	1572	my $self = shift (@_);
	1573	my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
	1574	$mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
	1575
	1576	# remove all associated files with the same name
	1577	$self->delete_assoc_file ($assoc_filename);
[23413]	1578
	1579	# Too harsh a requirement
	1580	# Definitely get HTML docs, for example, with some missing
	1581	# support files
	1582	# if (!&util::fd_exists($real_filename)) {
	1583	# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
	1584	# exit -1;
	1585	# }
	1586
[23387]	1587	# print STDERR "**** is the following a UTF8 rep of real filename?\n $real_filename\n";
	1588	# print STDERR "****##### so, ensure it is before storing?!?!?\n";
	1589	## my $utf8_filename = Encode::encode("utf8",$filename);
	1590
[1241]	1591	push (@{$self->{'associated_files'}},
	1592	[$real_filename, $assoc_filename, $mime_type, $section]);
	1593	}
	1594
	1595	# returns a list of associated files in the form
	1596	# [[real_filename, assoc_filename, mimetype], ...]
	1597	sub get_assoc_files {
	1598	my $self = shift (@_);
	1599
	1600	return $self->{'associated_files'};
	1601	}
	1602
[20775]	1603	# the following two methods used to keep track of original associated files
	1604	# for incremental building. eg a txt file used by an item file does not end
	1605	# up as an assoc file for the doc.xml, but it needs to be recorded as a source
	1606	# file for incremental build
	1607	sub associate_source_file {
	1608	my $self = shift (@_);
	1609	my ($full_filename) = @_;
[19494]	1610
[20775]	1611	push (@{$self->{'source_assoc_files'}}, $full_filename);
	1612
	1613	}
	1614
	1615	sub get_source_assoc_files {
	1616	my $self = shift (@_);
	1617
	1618	return $self->{'source_assoc_files'};
	1619
	1620
	1621	}
[19494]	1622	sub metadata_file {
	1623	my $self = shift (@_);
	1624	my ($real_filename, $filename) = @_;
	1625
	1626	push (@{$self->{'metadata_files'}},
	1627	[$real_filename, $filename]);
	1628	}
	1629
[21566]	1630	# used for writing out the archiveinf-doc info database, to list all the metadata files
[19494]	1631	sub get_meta_files {
	1632	my $self = shift (@_);
	1633
	1634	return $self->{'metadata_files'};
	1635	}
	1636
[1241]	1637	sub delete_section_assoc_files {
	1638	my $self = shift (@_);
	1639	my ($section) = @_;
	1640
	1641	my $i=0;
	1642	while ($i < scalar (@{$self->{'associated_files'}})) {
	1643	if (defined $self->{'associated_files'}->[$i]->[3] &&
	1644	$self->{'associated_files'}->[$i]->[3] eq $section) {
	1645	splice (@{$self->{'associated_files'}}, $i, 1);
	1646	} else {
	1647	$i++;
	1648	}
	1649	}
	1650	}
	1651
	1652	sub delete_assoc_file {
	1653	my $self = shift (@_);
	1654	my ($assoc_filename) = @_;
	1655
	1656	my $i=0;
	1657	while ($i < scalar (@{$self->{'associated_files'}})) {
	1658	if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
	1659	splice (@{$self->{'associated_files'}}, $i, 1);
	1660	} else {
	1661	$i++;
	1662	}
	1663	}
	1664	}
	1665
	1666	sub reset_nextsection_ptr {
	1667	my $self = shift (@_);
	1668	my ($section) = @_;
	1669
	1670	my $section_ptr = $self->_lookup_section($section);
	1671	$section_ptr->{'next_subsection'} = 1;
	1672	}
	1673
[4]	1674	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: