Context Navigation

source: main/trunk/greenstone2/perllib/doc.pm@ 33313

Last change on this file since 33313 was 33313, checked in by ak19, 5 years ago
Minor. Changes to comment
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 50.1 KB

Rev	Line
[537]	1	###########################################################################
	2	#
	3	# doc.pm --
	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
[8894]	10	# This program is free software; you can redistr te it and/or modify
[537]	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
	25
[1241]	26	# base class to hold documents
[4]	27
	28	package doc;
[3834]	29	eval {require bytes};
[4]	30
[832]	31	BEGIN {
[1241]	32	die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
	33	unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
[832]	34	}
	35
[15894]	36	use strict;
[1241]	37	use unicode;
	38	use util;
[27306]	39	use FileUtils;
[1241]	40	use ghtml;
[8220]	41	use File::stat;
[1241]	42	##use hashdoc;
[13172]	43	use docprint;
[33125]	44	use JSON;
[1241]	45
[33125]	46	# We just need pi from the Trig lib
[33293]	47	# Import constants pi2, pip2, pip4 (2*pi, pi/2, pi/4).
[33125]	48	use Math::Trig ':pi';
	49
[4]	50	# the document type may be indexed_doc, nonindexed_doc, or
	51	# classification
	52
[18528]	53	our $OIDcount = 0;
[2327]	54
[33171]	55
	56	# Declare the Package (level) Variable that will tell us whether we're in the import or buildcol phase.
	57	# This var will be set by import.pl, buildcol.pl and export.pl to the then appropriate value.
	58	# https://perlmaven.com/package-variables-and-lexical-variables-in-perl
	59	# But: https://stackoverflow.com/questions/7542120/perl-how-to-make-variables-from-requiring-script-available-in-required-script
	60	# https://stackoverflow.com/questions/33230015/how-do-i-use-a-variable-in-another-perl-script
	61	#$doc::cmd_line_mode = undef;
	62	# Decided to follow the usage of $OIDcount above to declare the necessary package-level variables 'globally' accessible
	63	# actually accessible only for those packages that import this one (with 'use doc')
[33302]	64	my $cmd_line_mode = undef;
[33171]	65
[33302]	66	# processor_mode keeps track of which buildcol pass we're at: dummy, text (sidx/didx passes) or infodb
	67	my $processor_mode = undef;
	68
[20412]	69	# rename_method can be 'url', 'none', 'base64'
[4]	70	sub new {
	71	my $class = shift (@_);
[18319]	72	my ($source_filename, $doc_type, $rename_method) = @_;
[1374]	73
[13770]	74
[1241]	75	my $self = bless {'associated_files'=>[],
	76	'subsection_order'=>[],
	77	'next_subsection'=>1,
	78	'subsections'=>{},
	79	'metadata'=>[],
[2327]	80	'text'=>"",
	81	'OIDtype'=>"hash"}, $class;
[4]	82
[10217]	83	# used to set lastmodified here, but this can screw up the HASH ids, so
	84	# the docsave processor now calls set_lastmodified
[13770]	85
[23362]	86	$self->set_source_path($source_filename);
[10217]	87
[7929]	88	if (defined $source_filename) {
[15874]	89	$source_filename = &util::filename_within_collection($source_filename);
[18508]	90	print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
[18319]	91	$self->set_source_filename ($source_filename, $rename_method);
[7929]	92	}
	93
[1374]	94	$self->set_doc_type ($doc_type) if defined $doc_type;
[485]	95
[4]	96	return $self;
	97	}
[23362]	98
	99
	100	sub set_source_path
	101	{
	102	my $self = shift @_;
	103	my ($source_filename) = @_;
	104
	105	if (defined $source_filename) {
	106	# On Windows the source_filename can be in terse DOS format
	107	# e.g. test~1.txt
	108
	109	$self->{'terse_source_path'} = $source_filename;
	110
[27306]	111	# Use the FileUtil library methods as they are aware of more special
	112	# cases such as HDFS [jmt12]
[27393]	113	if (&FileUtils::fileExists($source_filename))
	114	{
[23362]	115	# See if we can do better for Windows with a filename
	116	if ($ENV{'GSDLOS'} =~ /^windows$/i) {
	117	require Win32;
	118	$self->{'source_path'} = Win32::GetLongPathName($source_filename);
	119	}
	120	else {
	121	# For Unix-based systems, there is no difference between the two
	122	$self->{'source_path'} = $source_filename;
	123	}
	124	}
	125	else {
	126	print STDERR "Warning: In doc::set_source_path(), file\n";
	127	print STDERR " $source_filename\n";
	128	print STDERR " does not exist\n";
	129
	130	# (default) Set it to whatever we were given
	131	$self->{'source_path'} = $source_filename;
	132	}
	133	}
	134	else {
	135	# Previous code for setting source_path allowed for
	136	# it to be undefined, so continue this practice
	137	$self->{'terse_source_path'} = undef;
	138	$self->{'source_path'} = undef;
	139	}
	140	}
	141
	142
	143	sub get_source_path
	144	{
	145	my $self = shift @_;
	146
	147	return $self->{'terse_source_path'};
	148	}
	149
[10217]	150	# set lastmodified for OAI purposes, added by GRB, moved by kjdon
[23939]	151	sub set_oailastmodified {
	152	my $self = shift (@_);
	153
	154	my $source_path = $self->{'terse_source_path'};
	155
	156	if (defined $source_path && (-e $source_path)) {
	157	my $current_time = time;
	158
	159	my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
	160	$wday, $yday, $isdst) = localtime($current_time);
	161
	162	my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
	163
	164	$self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
	165	$self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
	166	}
	167	}
	168
	169	# no longer used for OAI purposes, since lastmodified is not what we want as the
	170	# Datestamp of a document. This doc metadata may be useful for general purposes.
[10217]	171	sub set_lastmodified {
	172	my $self = shift (@_);
[4]	173
[23362]	174	my $source_path = $self->{'terse_source_path'};
[13770]	175
	176	if (defined $source_path && (-e $source_path)) {
	177
	178	my $file_stat = stat($source_path);
[10217]	179	my $mtime = $file_stat->mtime;
[21862]	180	my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
	181	$wday, $yday, $isdst) = localtime($mtime);
	182
	183	my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
	184
[23923]	185	$self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
[21862]	186	$self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
[10217]	187	}
	188	}
	189
[1241]	190	# clone the $self object
	191	sub duplicate {
	192	my $self = shift (@_);
	193
	194	my $newobj = {};
	195
[8716]	196	foreach my $k (keys %$self) {
[1241]	197	$newobj->{$k} = &clone ($self->{$k});
	198	}
	199
	200	bless $newobj, ref($self);
	201	return $newobj;
	202	}
	203
	204	sub clone {
	205	my ($from) = @_;
	206	my $type = ref ($from);
	207
	208	if ($type eq "HASH") {
	209	my $to = {};
[8716]	210	foreach my $key (keys %$from) {
[1241]	211	$to->{$key} = &clone ($from->{$key});
	212	}
	213	return $to;
	214	} elsif ($type eq "ARRAY") {
	215	my $to = [];
[8716]	216	foreach my $v (@$from) {
[1241]	217	push (@$to, &clone ($v));
	218	}
	219	return $to;
	220	} else {
	221	return $from;
	222	}
	223	}
	224
[2327]	225	sub set_OIDtype {
	226	my $self = shift (@_);
[12268]	227	my ($type, $metadata) = @_;
[1241]	228
[26536]	229	if (defined $type && $type =~ /^(hash\|hash_on_file\|hash_on_ga_xml\|hash_on_full_filename\|incremental\|filename\|dirname\|full_filename\|assigned)$/) {
[2327]	230	$self->{'OIDtype'} = $type;
	231	} else {
	232	$self->{'OIDtype'} = "hash";
	233	}
[16792]	234
[12268]	235	if ($type =~ /^assigned$/) {
	236	if (defined $metadata) {
	237	$self->{'OIDmetadata'} = $metadata;
	238	} else {
	239	$self->{'OIDmetadata'} = "dc.Identifier";
	240	}
	241	}
[2327]	242	}
	243
[20412]	244	# rename_method can be 'url', 'none', 'base64'
[1241]	245	sub set_source_filename {
	246	my $self = shift (@_);
[18319]	247	my ($source_filename, $rename_method) = @_;
[1241]	248
[16578]	249	# Since the gsdlsourcefilename element goes into the doc.xml it has
	250	# to be utf8. However, it should also represent the source filename
	251	# (in the import directory) which may not be utf8 at all.
[18319]	252	# For instance, if this meta element (gsdlsourcefilename) will be used
	253	# by other applications that parse doc.xml in order to locate
	254	# gsdlsourcefilename. Therefore, the solution is to URLencode or base64
	255	# encode the real filename as this is a binary-to-text encoding meaning
	256	# that the resulting string is ASCII (utf8). Decoding will give the original.
[16578]	257
[18319]	258	# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
	259
[16670]	260	# URLencode just the gsdl_source_filename, not the directory. Then prepend dir
[23278]	261	$source_filename = $self->encode_filename($source_filename, $rename_method);
	262	# my ($srcfilename,$dirname,$suffix)
	263	# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
[16792]	264	# print STDERR "-> $srcfilename -> ";
[23278]	265	# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
[27306]	266	# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
[16792]	267	# print STDERR "$source_filename\n";
[16670]	268
[16578]	269	$self->set_utf8_metadata_element ($self->get_top_section(),
[1241]	270	"gsdlsourcefilename",
	271	$source_filename);
	272	}
	273
[23278]	274	sub encode_filename {
	275	my $self = shift (@_);
	276	my ($source_filename, $rename_method) = @_;
	277
	278	my ($srcfilename,$dirname,$suffix)
	279	= &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
	280	# print STDERR "-> $srcfilename -> ";
	281	$srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
[27306]	282	$source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
[23278]	283
	284	return $source_filename;
	285	}
	286
[7569]	287	sub set_converted_filename {
	288	my $self = shift (@_);
	289	my ($converted_filename) = @_;
	290
[16578]	291	# we know the converted filename is utf8
	292	$self->set_utf8_metadata_element ($self->get_top_section(),
[7569]	293	"gsdlconvertedfilename",
	294	$converted_filename);
	295	}
	296
[1241]	297	# returns the source_filename as it was provided
[19829]	298	sub get_unmodified_source_filename {
	299	my $self = shift (@_);
	300
[23362]	301	return $self->{'terse_source_path'};
[19829]	302	}
	303
	304	# returns the source_filename with whatever rename_method was given
[1241]	305	sub get_source_filename {
	306	my $self = shift (@_);
	307
	308	return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
	309	}
	310
[19829]	311
	312
[7569]	313	# returns converted filename if available else returns source filename
	314	sub get_filename_for_hashing {
	315	my $self = shift (@_);
	316
	317	my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
	318
	319	if (!defined $filename) {
[11097]	320	my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
[10980]	321	# if NULPlug processed file, then don't give a filename
[11097]	322	if (defined $plugin_name && $plugin_name eq "NULPlug") {
[10980]	323	$filename = undef;
[16670]	324	} else { # returns the URL encoded source filename!
[10980]	325	$filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
	326	}
[7569]	327	}
[23485]	328
[27306]	329	if (!&FileUtils::isFilenameAbsolute($filename)) {
	330	$filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
[23485]	331	}
	332
[7569]	333	return $filename;
	334	}
	335
[1241]	336	sub set_doc_type {
	337	my $self = shift (@_);
	338	my ($doc_type) = @_;
	339
	340	$self->set_metadata_element ($self->get_top_section(),
	341	"gsdldoctype",
	342	$doc_type);
	343	}
	344
[10217]	345	# returns the gsdldoctype as it was provided
[1241]	346	# the default of "indexed_doc" is used if no document
	347	# type was provided
	348	sub get_doc_type {
	349	my $self = shift (@_);
	350
	351	my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
	352	return $doc_type if (defined $doc_type);
	353	return "indexed_doc";
	354	}
	355
	356
	357	# look up the reference to the a particular section
	358	sub _lookup_section {
	359	my $self = shift (@_);
	360	my ($section) = @_;
	361
	362	my ($num);
	363	my $sectionref = $self;
	364
	365	while (defined $section && $section ne "") {
[12327]	366
[1241]	367	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
[12327]	368
	369	$num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
	370
[1241]	371	$section = "" unless defined $section;
	372
[12327]	373
[1241]	374	if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
	375	$sectionref = $sectionref->{'subsections'}->{$num};
	376	} else {
	377	return undef;
	378	}
	379	}
	380
	381	return $sectionref;
	382	}
	383
[2327]	384	# calculate OID by hashing the contents of the document
[1241]	385	sub _calc_OID {
	386	my $self = shift (@_);
	387	my ($filename) = @_;
	388
[22855]	389
[1241]	390	my $osexe = &util::get_os_exe();
	391
[27306]	392	my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
[1241]	393	$ENV{'GSDLOS'},"hashfile$osexe");
[8504]	394
[28560]	395	&util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
	396	# The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
	397
[28575]	398	# A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
	399	# (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
	400	# E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
	401
[1241]	402	my $result = "NULL";
[22855]	403
[1679]	404
[1241]	405	if (-e "$hashfile_exe") {
[1679]	406	# $result = `\"$hashfile_exe\" \"$filename\"`;
[21862]	407	# $result = `hashfile$osexe \"$filename\" 2>&1`;
[1679]	408	$result = `hashfile$osexe \"$filename\"`;
[21862]	409
[1241]	410	($result) = $result =~ /:\s*([0-9a-f]+)/i;
	411	} else {
	412	print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
	413	}
	414	return "HASH$result";
	415	}
	416
	417	# methods dealing with OID, not groups of them.
	418
[2327]	419	# if $OID is not provided one is calculated
[1241]	420	sub set_OID {
	421	my $self = shift (@_);
	422	my ($OID) = @_;
[8504]	423
[8797]	424	my $use_hash_oid = 0;
[17057]	425	# if an OID wasn't provided calculate one
[1241]	426	if (!defined $OID) {
	427	$OID = "NULL";
[17025]	428	if ($self->{'OIDtype'} =~ /^hash/) {
[8797]	429	$use_hash_oid = 1;
	430	} elsif ($self->{'OIDtype'} eq "incremental") {
[2327]	431	$OID = "D" . $OIDcount;
	432	$OIDcount ++;
[26221]	433	} elsif ($self->{'OIDtype'} eq "filename") {
	434	my $filename = $self->get_source_filename();
	435	$OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
	436	$OID = &util::tidy_up_oid($OID);
[26536]	437	} elsif ($self->{'OIDtype'} eq "full_filename") {
	438	my $source_filename = $self->get_source_filename();
	439	my $dirsep = &util::get_os_dirsep();
	440
	441	$source_filename =~ s/^import$dirsep//;
	442	$source_filename =~ s/$dirsep/-/g;
	443	$source_filename =~ s/\./_/g;
	444
	445	$OID = $source_filename;
	446	$OID = &util::tidy_up_oid($OID);
[8716]	447	} elsif ($self->{'OIDtype'} eq "dirname") {
	448	my $filename = $self->get_source_filename();
[8797]	449	if (defined($filename)) { # && -e $filename) {
[27350]	450	# get the immediate parent directory
[8716]	451	$OID = &File::Basename::dirname($filename);
	452	if (defined $OID) {
[27350]	453	$OID = &File::Basename::basename($OID);
[19617]	454	$OID = &util::tidy_up_oid($OID);
[8716]	455	} else {
[8797]	456	print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
	457	$use_hash_oid = 1;
[8716]	458	}
	459	} else {
[27350]	460	print STDERR "Failed to find a filename, generating hash id\n";
[8797]	461	$use_hash_oid = 1;
[8716]	462	}
[2327]	463
[8797]	464	} elsif ($self->{'OIDtype'} eq "assigned") {
[12268]	465	my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
[8797]	466	if (defined $identifier && $identifier ne "") {
[17025]	467	$OID = $identifier;
[19617]	468	$OID = &util::tidy_up_oid($OID);
[8797]	469	} else {
	470	# need a hash id
[12268]	471	print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
[8797]	472	$use_hash_oid = 1;
	473	}
	474
[2327]	475	} else {
[8797]	476	$use_hash_oid = 1;
	477	}
	478
	479	if ($use_hash_oid) {
[23562]	480	my $hash_on_file = 1;
[26536]	481	my $hash_on_ga_xml = 0;
	482
[17025]	483	if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
	484	$hash_on_file = 0;
[26536]	485	$hash_on_ga_xml = 1;
[17025]	486	}
[26536]	487
	488	if ($self->{'OIDtype'} eq "hash_on_full_filename") {
	489	$hash_on_file = 0;
	490	$hash_on_ga_xml = 0;
	491
	492	my $source_filename = $self->get_source_filename();
	493	my $dirsep = &util::get_os_dirsep();
	494
	495	$source_filename =~ s/^import$dirsep//;
	496	$source_filename =~ s/$dirsep/-/g;
	497	$source_filename =~ s/\./_/g;
	498
	499	# If the filename is very short then (handled naively)
	500	# this can cause conjestion in the hash-values
	501	# computed, leading documents sharing the same leading
	502	# Hex values in the computed has.
	503	#
	504	# The solution taken here is to replace the name of
	505	# the file name a sufficient number of times (up to
	506	# the character limit defined in 'rep_limit' and
	507	# make that the content that is hashed on
	508
	509	# *** Think twice before changing the following value
	510	# as it will break backward compatability of computed
	511	# document HASH values
	512
	513	my $rep_limit = 256;
	514	my $hash_content = undef;
	515
	516	if (length($source_filename)<$rep_limit) {
	517	my $rep_string = "$source_filename\|";
	518	my $rs_len = length($rep_string);
	519
	520	my $clone_times = int(($rep_limit-1)/$rs_len) +1;
	521
	522	$hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
	523	}
	524	else {
	525	$hash_content = $source_filename;
	526	}
	527
	528	my $filename = &util::get_tmp_filename();
	529	if (!open (OUTFILE, ">:utf8", $filename)) {
	530	print STDERR "doc::set_OID could not write to $filename\n";
	531	} else {
	532	print OUTFILE $hash_content;
	533	close (OUTFILE);
	534	}
	535	$OID = $self->_calc_OID ($filename);
	536
[27538]	537	&FileUtils::removeFiles ($filename);
[26536]	538	}
	539
[17025]	540	if ($hash_on_file) {
	541	# "hash" OID - feed file to hashfile.exe
	542	my $filename = $self->get_filename_for_hashing();
[23562]	543
[17025]	544	# -z: don't want to hash on the file if it is zero size
	545	if (defined($filename) && -e $filename && !-z $filename) {
	546	$OID = $self->_calc_OID ($filename);
	547	} else {
[26536]	548	$hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
[17025]	549	}
	550	}
[26536]	551
	552	if ($hash_on_ga_xml) {
	553	# In addition being asked to explicity calculate the has based on the GA file,
	554	# can also end up coming into this block is doing 'hash_on_file' but the file
	555	# itself is of zero bytes (as could be the case with 'doc.nul' file
	556
[17025]	557	my $filename = &util::get_tmp_filename();
[22855]	558	if (!open (OUTFILE, ">:utf8", $filename)) {
[2327]	559	print STDERR "doc::set_OID could not write to $filename\n";
	560	} else {
[32512]	561	my $doc_text = &docprint::get_section_xml($self);
[13172]	562	print OUTFILE $doc_text;
[2327]	563	close (OUTFILE);
	564	}
	565	$OID = $self->_calc_OID ($filename);
[27393]	566	&FileUtils::removeFiles($filename);
[1374]	567	}
[1241]	568	}
	569	}
	570	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
	571	}
	572
	573	# this uses hashdoc (embedded c thingy) which is faster but still
[32535]	574	# needs a little work to be sufficiently stable
[1241]	575	sub ___set_OID {
	576	my $self = shift (@_);
	577	my ($OID) = @_;
	578
	579	# if an OID wasn't provided then calculate hash value based on document
	580	if (!defined $OID)
	581	{
[32512]	582	my $hash_text = &docprint::get_section_xml($self);
[1241]	583	my $hash_len = length($hash_text);
	584
	585	$OID = &hashdoc::buffer($hash_text,$hash_len);
	586	}
	587
	588	$self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
	589	}
	590
	591	# returns the OID for this document
	592	sub get_OID {
	593	my $self = shift (@_);
	594	my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
	595	return $OID if (defined $OID);
	596	return "NULL";
	597	}
	598
	599	sub delete_OID {
	600	my $self = shift (@_);
	601
	602	$self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
	603	}
	604
	605
	606	# methods for manipulating section names
	607
	608	# returns the name of the top-most section (the top
	609	# level of the document
	610	sub get_top_section {
	611	my $self = shift (@_);
	612
	613	return "";
	614	}
	615
	616	# returns a section
	617	sub get_parent_section {
	618	my $self = shift (@_);
	619	my ($section) = @_;
	620
	621	$section =~ s/(^\|\.)\d+$//;
	622
	623	return $section;
	624	}
	625
	626	# returns the first child section (or the end child
	627	# if there isn't any)
	628	sub get_begin_child {
	629	my $self = shift (@_);
	630	my ($section) = @_;
	631
	632	my $section_ptr = $self->_lookup_section($section);
	633	return "" unless defined $section_ptr;
	634
	635	if (defined $section_ptr->{'subsection_order'}->[0]) {
	636	return "$section.$section_ptr->{'subsection_order'}->[0]";
	637	}
	638
	639	return $self->get_end_child ($section);
	640	}
	641
	642	# returns the next child of a parent section
	643	sub get_next_child {
	644	my $self = shift (@_);
	645	my ($section) = @_;
	646
	647	my $parent_section = $self->get_parent_section($section);
	648	my $parent_section_ptr = $self->_lookup_section($parent_section);
	649	return undef unless defined $parent_section_ptr;
	650
	651	my ($section_num) = $section =~ /(\d+)$/;
	652	return undef unless defined $section_num;
	653
	654	my $i = 0;
	655	my $section_order = $parent_section_ptr->{'subsection_order'};
	656	while ($i < scalar(@$section_order)) {
	657	last if $section_order->[$i] eq $section_num;
	658	$i++;
	659	}
	660
	661	$i++; # the next child
	662	if ($i < scalar(@$section_order)) {
	663	return $section_order->[$i] if $parent_section eq "";
	664	return "$parent_section.$section_order->[$i]";
	665	}
	666
	667	# no more sections in this level
	668	return undef;
	669	}
	670
	671	# returns a reference to a list of children
	672	sub get_children {
	673	my $self = shift (@_);
	674	my ($section) = @_;
	675
	676	my $section_ptr = $self->_lookup_section($section);
	677	return [] unless defined $section_ptr;
	678
	679	my @children = @{$section_ptr->{'subsection_order'}};
	680
	681	map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
	682	return \@children;
	683	}
	684
	685	# returns the child section one past the last one (which
	686	# is coded as "0")
	687	sub get_end_child {
	688	my $self = shift (@_);
	689	my ($section) = @_;
	690
	691	return $section . ".0" unless $section eq "";
	692	return "0";
	693	}
	694
	695	# returns the next section in book order
	696	sub get_next_section {
	697	my $self = shift (@_);
	698	my ($section) = @_;
	699
	700	return undef unless defined $section;
	701
	702	my $section_ptr = $self->_lookup_section($section);
	703	return undef unless defined $section_ptr;
	704
	705	# first try to find first child
	706	if (defined $section_ptr->{'subsection_order'}->[0]) {
	707	return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
	708	return "$section.$section_ptr->{'subsection_order'}->[0]";
	709	}
	710
	711	do {
	712	# try to find sibling
	713	my $next_child = $self->get_next_child ($section);
	714	return $next_child if (defined $next_child);
	715
	716	# move up one level
	717	$section = $self->get_parent_section ($section);
	718	} while $section =~ /\d/;
	719
	720	return undef;
	721	}
	722
	723	sub is_leaf_section {
	724	my $self = shift (@_);
	725	my ($section) = @_;
	726
	727	my $section_ptr = $self->_lookup_section($section);
	728	return 1 unless defined $section_ptr;
	729
	730	return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
	731	}
	732
	733	# methods for dealing with sections
	734
	735	# returns the name of the inserted section
	736	sub insert_section {
	737	my $self = shift (@_);
	738	my ($before_section) = @_;
	739
	740	# get the child to insert before and its parent section
	741	my $parent_section = "";
	742	my $before_child = "0";
	743	my @before_section = split (/\./, $before_section);
	744	if (scalar(@before_section) > 0) {
	745	$before_child = pop (@before_section);
	746	$parent_section = join (".", @before_section);
	747	}
	748
	749	my $parent_section_ptr = $self->_lookup_section($parent_section);
	750	if (!defined $parent_section_ptr) {
	751	print STDERR "doc::insert_section couldn't find parent section " .
	752	"$parent_section\n";
	753	return;
	754	}
	755
	756	# get the next section number
	757	my $section_num = $parent_section_ptr->{'next_subsection'}++;
	758
	759	my $i = 0;
	760	while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
	761	$parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
	762	$i++;
	763	}
	764
	765	# insert the section number into the order list
	766	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
	767
	768	# add this section to the parent section
	769	my $section_ptr = {'subsection_order'=>[],
	770	'next_subsection'=>1,
	771	'subsections'=>{},
	772	'metadata'=>[],
	773	'text'=>""};
	774	$parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
	775
	776	# work out the full section number
	777	my $section = $parent_section;
	778	$section .= "." unless $section eq "";
	779	$section .= $section_num;
	780
	781	return $section;
	782	}
	783
	784	# creates a pre-named section
	785	sub create_named_section {
	786	my $self = shift (@_);
	787	my ($mastersection) = @_;
	788
	789	my ($num);
	790	my $section = $mastersection;
	791	my $sectionref = $self;
	792
	793	while ($section ne "") {
	794	($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
	795	$num =~ s/^0+(\d)/$1/; # remove leading 0s
	796	$section = "" unless defined $section;
	797
	798	if (defined $num) {
	799	if (!defined $sectionref->{'subsections'}->{$num}) {
	800	push (@{$sectionref->{'subsection_order'}}, $num);
	801	$sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
	802	'next_subsection'=>1,
	803	'subsections'=>{},
	804	'metadata'=>[],
	805	'text'=>""};
	806	if ($num >= $sectionref->{'next_subsection'}) {
	807	$sectionref->{'next_subsection'} = $num + 1;
	808	}
	809	}
	810	$sectionref = $sectionref->{'subsections'}->{$num};
	811
	812	} else {
	813	print STDERR "doc::create_named_section couldn't create section ";
	814	print STDERR "$mastersection\n";
	815	last;
	816	}
	817	}
	818	}
	819
	820	# returns a reference to a list of subsections
	821	sub list_subsections {
	822	my $self = shift (@_);
	823	my ($section) = @_;
	824
	825	my $section_ptr = $self->_lookup_section ($section);
	826	if (!defined $section_ptr) {
	827	print STDERR "doc::list_subsections couldn't find section $section\n";
	828	return [];
	829	}
	830
	831	return [@{$section_ptr->{'subsection_order'}}];
	832	}
	833
	834	sub delete_section {
	835	my $self = shift (@_);
	836	my ($section) = @_;
	837
	838	# my $section_ptr = {'subsection_order'=>[],
	839	# 'next_subsection'=>1,
	840	# 'subsections'=>{},
	841	# 'metadata'=>[],
	842	# 'text'=>""};
	843
	844	# if this is the top section reset everything
	845	if ($section eq "") {
	846	$self->{'subsection_order'} = [];
	847	$self->{'subsections'} = {};
	848	$self->{'metadata'} = [];
	849	$self->{'text'} = "";
	850	return;
	851	}
	852
	853	# find the parent of the section to delete
	854	my $parent_section = "";
	855	my $child = "0";
	856	my @section = split (/\./, $section);
	857	if (scalar(@section) > 0) {
	858	$child = pop (@section);
	859	$parent_section = join (".", @section);
	860	}
	861
	862	my $parent_section_ptr = $self->_lookup_section($parent_section);
	863	if (!defined $parent_section_ptr) {
	864	print STDERR "doc::delete_section couldn't find parent section " .
	865	"$parent_section\n";
	866	return;
	867	}
	868
	869	# remove this section from the subsection_order list
	870	my $i = 0;
	871	while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
	872	if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
	873	splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
	874	last;
	875	}
	876	$i++;
	877	}
	878
	879	# remove this section from the subsection hash
	880	if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
	881	undef $parent_section_ptr->{'subsections'}->{$child};
	882	}
	883	}
	884
	885	#--
[4]	886	# methods for dealing with metadata
	887
	888	# set_metadata_element and get_metadata_element are for metadata
	889	# which should only have one value. add_meta_data and get_metadata
	890	# are for metadata which can have more than one value.
	891
[1241]	892	# returns the first metadata value which matches field
[6111]	893
	894	# This version of get metadata element works much like the one above,
	895	# except it allows for the namespace portion of a metadata element to
	896	# be ignored, thus if you are searching for dc.Title, the first piece
	897	# of matching metadata ending with the name Title (once any namespace
	898	# is removed) would be returned.
	899	# 28-11-2003 John Thompson
[1241]	900	sub get_metadata_element {
	901	my $self = shift (@_);
[6111]	902	my ($section, $field, $ignore_namespace) = @_;
[1241]	903	my ($data);
	904
[6111]	905	$ignore_namespace = 0 unless defined $ignore_namespace;
	906
[1241]	907	my $section_ptr = $self->_lookup_section($section);
	908	if (!defined $section_ptr) {
[8716]	909	print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
[1241]	910	return;
	911	}
	912
[24404]	913	# Remove any namespace if we are being told to ignore them
[6111]	914	if($ignore_namespace) {
[24404]	915	$field =~ s/^.\.//; #$field =~ s/^\w\.//;
[6111]	916	}
	917
[1241]	918	foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]	919
	920	my $data_name = $data->[0];
[14966]	921
[24404]	922	# Remove any namespace if we are being told to ignore them
[6111]	923	if($ignore_namespace) {
[24404]	924	$data_name =~ s/^.\.//; #$data_name =~ s/^\w\.//;
[6111]	925	}
[24404]	926	# we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
	927	$data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
	928
[6111]	929	return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
[1241]	930	}
	931
	932	return undef; # was not found
	933	}
	934
	935	# returns a list of the form [value1, value2, ...]
	936	sub get_metadata {
	937	my $self = shift (@_);
[6111]	938	my ($section, $field, $ignore_namespace) = @_;
[1241]	939	my ($data);
	940
[6111]	941	$ignore_namespace = 0 unless defined $ignore_namespace;
	942
[1241]	943	my $section_ptr = $self->_lookup_section($section);
	944	if (!defined $section_ptr) {
[8716]	945	print STDERR "doc::get_metadata couldn't find section ",
	946	$section, "\n";
[1241]	947	return;
	948	}
	949
[24404]	950	# Remove any namespace if we are being told to ignore them
[6111]	951	if($ignore_namespace) {
[24404]	952	$field =~ s/^.*\.//;
[6111]	953	}
	954
[1241]	955	my @metadata = ();
	956	foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]	957
	958	my $data_name = $data->[0];
[24404]	959
	960	# Remove any namespace if we are being told to ignore them
[6111]	961	if($ignore_namespace) {
[24404]	962	$data_name =~ s/^.*\.//;
	963	}
	964	# we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
	965	$data_name =~ s/^ex\.([^.]+)$/$1/;
	966
[6111]	967	push (@metadata, $data->[1]) if ($data_name eq $field);
[1241]	968	}
[9241]	969
[1241]	970	return \@metadata;
	971	}
	972
[23827]	973	sub get_metadata_hashmap {
	974	my $self = shift (@_);
	975	my ($section, $opt_namespace) = @_;
	976
	977	my $section_ptr = $self->_lookup_section($section);
	978	if (!defined $section_ptr) {
	979	print STDERR "doc::get_metadata couldn't find section ",
	980	$section, "\n";
	981	return;
	982	}
	983
	984	my $metadata_hashmap = {};
	985	foreach my $data (@{$section_ptr->{'metadata'}}) {
	986	my $metaname = $data->[0];
	987
	988	if ((!defined $opt_namespace) \|\| ($metaname =~ m/^$opt_namespace\./)) {
	989	if (!defined $metadata_hashmap->{$metaname}) {
	990	$metadata_hashmap->{$metaname} = [];
	991	}
	992	my $metaval_list = $metadata_hashmap->{$metaname};
	993	push(@$metaval_list, $data->[1]);
	994	}
	995	}
	996
	997	return $metadata_hashmap;
	998	}
	999
[1241]	1000	# returns a list of the form [[field,value],[field,value],...]
	1001	sub get_all_metadata {
	1002	my $self = shift (@_);
	1003	my ($section) = @_;
	1004
	1005	my $section_ptr = $self->_lookup_section($section);
	1006	if (!defined $section_ptr) {
[8716]	1007	print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
[1241]	1008	return;
	1009	}
	1010
	1011	return $section_ptr->{'metadata'};
	1012	}
	1013
	1014	# $value is optional
	1015	sub delete_metadata {
	1016	my $self = shift (@_);
	1017	my ($section, $field, $value) = @_;
	1018
	1019	my $section_ptr = $self->_lookup_section($section);
	1020	if (!defined $section_ptr) {
[23562]	1021	print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
[1241]	1022	return;
	1023	}
	1024
	1025	my $i = 0;
	1026	while ($i < scalar (@{$section_ptr->{'metadata'}})) {
	1027	if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
	1028	(!defined $value \|\| $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
	1029	splice (@{$section_ptr->{'metadata'}}, $i, 1);
	1030	} else {
	1031	$i++;
	1032	}
	1033	}
	1034	}
	1035
	1036	sub delete_all_metadata {
	1037	my $self = shift (@_);
	1038	my ($section) = @_;
	1039
	1040	my $section_ptr = $self->_lookup_section($section);
	1041	if (!defined $section_ptr) {
[8716]	1042	print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
[1241]	1043	return;
	1044	}
	1045
	1046	$section_ptr->{'metadata'} = [];
	1047	}
	1048
[4]	1049	sub set_metadata_element {
	1050	my $self = shift (@_);
	1051	my ($section, $field, $value) = @_;
	1052
[97]	1053	$self->set_utf8_metadata_element ($section, $field,
[1870]	1054	&unicode::ascii2utf8(\$value));
[73]	1055	}
	1056
	1057	# set_utf8_metadata_element assumes the text has already been
	1058	# converted to the UTF-8 encoding.
	1059	sub set_utf8_metadata_element {
	1060	my $self = shift (@_);
	1061	my ($section, $field, $value) = @_;
[25557]	1062
[4]	1063	$self->delete_metadata ($section, $field);
[73]	1064	$self->add_utf8_metadata ($section, $field, $value);
[4]	1065	}
	1066
	1067
[73]	1068	# add_metadata assumes the text is in (extended) ascii form. For
[8220]	1069	# text which has already been converted to the UTF-8 format use
[73]	1070	# add_utf8_metadata.
[4]	1071	sub add_metadata {
	1072	my $self = shift (@_);
	1073	my ($section, $field, $value) = @_;
[25557]	1074
[97]	1075	$self->add_utf8_metadata ($section, $field,
[1870]	1076	&unicode::ascii2utf8(\$value));
[73]	1077	}
	1078
	1079	sub add_utf8_metadata {
	1080	my $self = shift (@_);
	1081	my ($section, $field, $value) = @_;
[25557]	1082
[33171]	1083	# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
[25557]	1084	# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
	1085	# print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
[73]	1086
[4]	1087	my $section_ptr = $self->_lookup_section($section);
	1088	if (!defined $section_ptr) {
[8716]	1089	print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
[4]	1090	return;
	1091	}
[1732]	1092	if (!defined $value) {
	1093	print STDERR "doc::add_utf8_metadata undefined value for $field\n";
	1094	return;
	1095	}
	1096	if (!defined $field) {
	1097	print STDERR "doc::add_utf8_metadata undefined metadata type \n";
	1098	return;
	1099	}
[4]	1100
[8894]	1101	#print STDERR "###$field=$value\n";
[22950]	1102
	1103	# For now, supress this check. Given that text data read in is now
	1104	# Unicode aware, then the following block of code can (ironically enough)
	1105	# cause our unicode compliant string to be re-encoded (leading to
	1106	# a double-encoded UTF-8 string, which we definitely don't want!).
	1107
	1108
[7798]	1109	# double check that the value is utf-8
[25557]	1110	# if (!&unicode::check_is_utf8($value)) {
	1111	# print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
	1112	# &unicode::ensure_utf8(\$value);
	1113	# print STDERR " Tried converting to utf8: $value\n";
	1114	# }
[7798]	1115
[25557]	1116	#If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
[26850]	1117	if ($field =~ m/^(.+\.)?Latitude$/ \|\| $field =~ m/^(.+\.)?Longitude$/)
[25557]	1118	{
[26850]	1119	my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
	1120	if (defined $mdprefix) {
	1121	# Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
	1122	push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
	1123	}
	1124
[25557]	1125	my $direction;
	1126	if($value =~ m/^-/)
	1127	{
	1128	$direction = ($field eq "Latitude") ? "S" : "W";
	1129	}
	1130	else
	1131	{
	1132	$direction = ($field eq "Latitude") ? "N" : "E";
	1133	}
	1134
	1135	my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
	1136	if(defined $beforeDec && defined $afterDec)
	1137	{
	1138	my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
	1139	push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
	1140
	1141	for(my $i = 2; $i <= 4; $i++)
	1142	{
	1143	if(length($afterDec) >= $i)
	1144	{
	1145	push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
	1146	}
	1147	}
	1148
	1149	#Only add the metadata if it has not already been added
[33217]	1150	my $metaMap = $self->get_metadata_hashmap($section); # metaMap not used and called function has no apparent side-effects,
	1151	# but this line appears important for ensuring uniqueness of (Latitude, value) meta
	1152	# in the section. Also for LatShort, Longitude, LngShort.
[25557]	1153	}
[33126]	1154
	1155	if($field =~ m/^(.+\.)?Longitude$/) {
[33128]	1156	# if we are dealing with Longitude meta, we should 1. have Latitude meta too; 2. already have processed Latitude meta
[33126]	1157	# in that case, add both Lat and Lng of this section as a Coordinate meta
[33217]	1158	my $latitude = $self->get_metadata_element ($section, "Latitude");
	1159	my $longitude = $value;
	1160	# TODO: would like all Longitude meta placed together, followed by all Coordinate meta,
	1161	# but the following will add all such coord meta to the doc object and the end of this function will add Longitude meta
	1162	$self->processCoordinate($section, $latitude, $longitude);
[33126]	1163	}
[25557]	1164	}
[33302]	1165
	1166	elsif($field eq "GPS.mapOverlay") { # then the $value is a JSON string
[25557]	1167
[33313]	1168	# In order to allow searching map-data-enriched documents by map shape descriptions,
[33302]	1169	# and to run rawquery searches for other docs by proximity based on their map data,
	1170	# need to store the shape descriptions and Coordinate info for shapes into the text index.
	1171	# We add the description for each shape in the mapoverlay into the text index as GPSMapOverlayLabel
	1172	# And we add Coordinate (CD) and CoordShort (CS) info for each shape in the mapoverlay in the format (Lat, Lng) as ("37S339 175E342")
	1173	# where the digits before the N/S/E/W direction represents the whole number. And the digits after the direction are the
[33313]	1174	# decimal places which can vary between 0 and 2 to 4 digits.
[33302]	1175
[33313]	1176	# However, we only want to process GPS.mapOverlay during buildcol and only in the text indexing passes (e.g. sidx and didx for lucene)
	1177	# and certainly never during the infodb pass of buildcol. The latter can end up duplicating Coordinate/CoordShort/GPSMapOverlayLabel
	1178	# when rebuilding with the online doc editor, as that runs incremental-rebuild which then calls basebuilder::reconstruct_doc_objs_metadata()
[33302]	1179	# on all docs NOT being incrementally rebuilt. That call would get meta from the infodb and use it to reconstruct the doc objects of docs
[33313]	1180	# NOT being incrementally built. If the Coord and Label meta were written to the infodb, they would then be loaded back in memory when the
	1181	# collection is incrementally rebuilt for those docs that don't need incremental processing. Then this function would once again add the
	1182	# same meta into the infodb, thus duplicating what goes into the infodb. Hence, don't do all the following if doc::processor_mode =~ "infodb",
	1183	# or anything other than a text mode.
[33217]	1184
[33302]	1185	# Note that for incremental rebuilding, the text pass can be called textreindex for instance (and infodb pass can be incinfodb).
	1186	# So don't check for exact string match
	1187
[33312]	1188	if($doc::cmd_line_mode eq "buildcol" && $doc::processor_mode =~ m/^text/) { # currently known text processor_modes:
	1189	# text, textreindex, possibly textdelete (see ArchivesInfPlugin.pm::read() for last 2).
	1190	# OR: ..."buildcol" && $doc::processor_mode !~ m/infodb$/) # if dummy pass important
[33171]	1191
[33302]	1192	###print STDERR "GPS.mapOverlay before decoding, val = " . $value . "\n";
[33125]	1193
[33302]	1194	# TODO: html decode?
[33125]	1195	$value =~ s@[@[@g;
	1196	$value =~ s@]@]@g;
	1197	$value =~ s@"@"@g;
[33302]	1198	###print STDERR "GPS.mapOverlay after decoding, val = " . $value . "\n";
[33125]	1199
[33293]	1200	my $json_array = decode_json $value;
[33302]	1201
	1202	foreach my $shape (@$json_array) {
[33125]	1203
[33302]	1204	# Put each available shape description/label into this section's metadata with GPSMapOverlayLabel as metaname.
	1205	# Just as for Coordinate meta, don't need to know which shape a label belongs too. This is just so each label
	1206	# will be indexed, and therefore can be searched.
	1207
	1208	my $description = $shape->{"description"};
	1209	if($description) {
	1210	push (@{$section_ptr->{'metadata'}}, ["GPSMapOverlayLabel", $description]);
	1211	###print STDERR "@@@@############################################ Just added description meta: " . $description . "\n";
	1212	}
	1213
[33293]	1214	my $type = $shape->{"type"};
[33302]	1215	###print STDERR "Shape type : " . $type . "\n";
[33293]	1216
	1217	if($type eq "circle") {
[33302]	1218	###print STDERR "Found a circle:\n" . &printShape($json, $shape);
[33293]	1219
	1220	# work out bounding box
	1221	# want the inverse of this useful page:
	1222	# https://stackoverflow.com/questions/639695/how-to-convert-latitude-or-longitude-to-meters
	1223	# https://www.geodatasource.com/developers/javascript
[33125]	1224
[33293]	1225	# Dr Bainbridge wants us to use: https://gis.stackexchange.com/questions/5821/calculating-latitude-longitude-x-miles-from-point
	1226	# Since for N,E,S,W the following works out the same as above link, we're using the rule of thumb in the following:
	1227	# https://gis.stackexchange.com/questions/2951/algorithm-for-offsetting-a-latitude-longitude-by-some-amount-of-meters
	1228	# which states
	1229	# "If your displacements aren't too great (less than a few kilometers) and you're not right at the poles,
	1230	# use the quick and dirty estimate that 111,111 meters (111.111 km) in the y direction is 1 degree
	1231	# (of latitude) and 111,111 * cos(latitude) meters in the x direction is 1 degree (of longitude)."
	1232	my $centre_lat = $shape->{"center"}->{"lat"};
	1233	my $centre_lng = $shape->{"center"}->{"lng"};
	1234	my $radius = $shape->{"radius"}; # in metres!
	1235
[33302]	1236	###print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
[33293]	1237
	1238	my $lat_north = $centre_lat + ($radius/111111);
	1239	my $lat_south = $centre_lat - ($radius/111111);
	1240
[33302]	1241	###print STDERR "### lat_north: $lat_north\n";
	1242	###print STDERR "### lat_south: $lat_south\n";
[33125]	1243
[33293]	1244	# our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
	1245	# all expect their angle to be in radians. So need to convert from degree to radians before we can take the cos of it.
	1246	my $centre_lat_radians = $self->degreesToRadians($centre_lat);
	1247	my $cos_in_radians = cos($centre_lat_radians);
[33302]	1248	###print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
[33293]	1249	my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
	1250	my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
[33302]	1251	###print STDERR "### lng_east $lng_east\n";
	1252	###print STDERR "### lng_west $lng_west\n";
[33293]	1253
	1254	my $cos_lat = cos($centre_lat);
[33302]	1255	###print STDERR "cos $centre_lat is $cos_lat\n";
[33293]	1256
	1257	$self->processCoordinate($section, $lat_north, $lng_east);
	1258	$self->processCoordinate($section, $lat_south, $lng_east);
	1259	$self->processCoordinate($section, $lat_south, $lng_west);
	1260	$self->processCoordinate($section, $lat_north, $lng_west);
	1261
	1262	}
	1263	elsif ($type eq "marker") {
[33302]	1264	###print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
	1265	###print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
[33293]	1266	$self->processCoordinate($section, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});
	1267	}
	1268	elsif ($type eq "polyline" \|\| $type eq "polygon") {
	1269	my $path_array = $shape->{"path"};
	1270	foreach my $position (@$path_array) {
	1271	$self->processCoordinate($section, $position->{"lat"}, $position->{"lng"});
	1272	}
	1273	}
	1274	elsif ($type eq "rectangle") {
	1275
	1276	my $bounds = $shape->{"bounds"};
	1277	$self->processCoordinate($section, $bounds->{"north"}, $bounds->{"east"});
	1278	$self->processCoordinate($section, $bounds->{"south"}, $bounds->{"east"});
	1279	$self->processCoordinate($section, $bounds->{"south"}, $bounds->{"west"});
	1280	$self->processCoordinate($section, $bounds->{"north"}, $bounds->{"west"});
	1281	}
	1282
[33125]	1283	} # end for on each shape in GPS.mapOverlay
[33302]	1284	} # end if(buildcol and text pass)
[33125]	1285	} # end GPS.mapOverlay meta
	1286
[4]	1287	push (@{$section_ptr->{'metadata'}}, [$field, $value]);
	1288	}
	1289
[33125]	1290	# https://en.wikipedia.org/wiki/Radian
	1291	sub degreesToRadians
	1292	{
	1293	my $self = shift (@_);
	1294	my ($degrees) = @_;
[4]	1295
[33125]	1296	return $degrees * pi /180; # returns radians
	1297	}
	1298
	1299	sub radiansToDegrees
	1300	{
	1301	my $self = shift (@_);
	1302	my ($radians) = @_;
	1303
	1304	return $radians * 180 / pi; # returns degrees
	1305	}
[33293]	1306
[33302]	1307	# Call as:
	1308	# my $json = JSON->new->allow_nonref;
	1309	# &printAllShapes($json, $json_array);
[33293]	1310	sub printAllShapes {
	1311	my ($json, $json_array) = @_;
	1312
	1313
	1314	#my $pretty_print_shape = $json->pretty->encode( $json_array->[0] );
	1315	foreach my $shape (@$json_array) {
	1316	my $pretty_print_shape = $json->pretty->encode( $shape );
	1317	print STDERR "Shape: $pretty_print_shape\n";
	1318	}
	1319
[33185]	1320	}
[33125]	1321
[33217]	1322	# For the (lat, lng) coordinate given,
	1323	# attaches Coordinate and multiple CoordShort (different precision level) metadata to the doc object
[33125]	1324	sub processCoordinate {
	1325	my $self = shift (@_);
[33217]	1326	my ($section, $latitude, $longitude) = @_;
[33125]	1327
[33128]	1328	my $section_ptr = $self->_lookup_section($section);
	1329
[33125]	1330	my $lat_direction = ($latitude =~ m/^-/) ? "S" : "N";
	1331	my $lng_direction = ($longitude =~ m/^-/) ? "W" : "E";
	1332
	1333	# have to store (lat, lng) in pairs, when there are so many coords to store
	1334	#push (@{$section_ptr->{'metadata'}}, ["Latitude", $latitude]);
	1335	#push (@{$section_ptr->{'metadata'}}, ["Longitude", $longitude]);
	1336
	1337	push (@{$section_ptr->{'metadata'}}, ["Coordinate", "$latitude $longitude"]); # "$latitude$lat_direction $longitude$lng_direction"
	1338
	1339	my ($latBeforeDec, $latAfterDec);
	1340	my ($lngBeforeDec, $lngAfterDec);
	1341
	1342	if($latitude !~ m/\./) {
	1343	$latBeforeDec = $latitude;
	1344	$latAfterDec = "";
	1345	} else {
	1346	($latBeforeDec, $latAfterDec) = ($latitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
	1347	}
	1348	if($longitude !~ m/\./) {
	1349	$lngBeforeDec = $longitude;
	1350	$lngAfterDec = "";
	1351	} else {
	1352	($lngBeforeDec, $lngAfterDec) = ($longitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
	1353	}
	1354
	1355
[33217]	1356	my $name = "CoordShort";
	1357	push (@{$section_ptr->{'metadata'}}, [$name, "$latBeforeDec$lat_direction $lngBeforeDec$lng_direction"]);
	1358
[33185]	1359	for(my $i = 2; $i <= 4; $i++)
	1360	{
	1361	my $latDecPlaces = (length($latAfterDec) >= $i) ? substr($latAfterDec, 0, $i) : "";
	1362	my $lngDecPlaces = (length($lngAfterDec) >= $i) ? substr($lngAfterDec, 0, $i) : "";
	1363
[33217]	1364	push (@{$section_ptr->{'metadata'}}, [$name,
	1365	$latBeforeDec . $lat_direction. $latDecPlaces . " " . $lngBeforeDec . $lng_direction. $lngDecPlaces]);
[33185]	1366
	1367	}
	1368
	1369	#Only add the metadata if it has not already been added
[33217]	1370	#my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no apparent side-effects.)
[33125]	1371	}
	1372
	1373
[4]	1374	# methods for dealing with text
	1375
[1241]	1376	# returns the text for a section
	1377	sub get_text {
	1378	my $self = shift (@_);
	1379	my ($section) = @_;
	1380
	1381	my $section_ptr = $self->_lookup_section($section);
	1382	if (!defined $section_ptr) {
	1383	print STDERR "doc::get_text couldn't find section " .
	1384	"$section\n";
	1385	return "";
	1386	}
	1387
	1388	return $section_ptr->{'text'};
	1389	}
	1390
	1391	# returns the (utf-8 encoded) length of the text for a section
	1392	sub get_text_length {
	1393	my $self = shift (@_);
	1394	my ($section) = @_;
	1395
	1396	my $section_ptr = $self->_lookup_section($section);
	1397	if (!defined $section_ptr) {
	1398	print STDERR "doc::get_text_length couldn't find section " .
	1399	"$section\n";
	1400	return 0;
	1401	}
	1402
	1403	return length ($section_ptr->{'text'});
	1404	}
	1405
[23131]	1406	# returns the total length for all the sections
	1407	sub get_total_text_length {
	1408	my $self = shift (@_);
	1409
	1410	my $section = $self->get_top_section();
	1411	my $length = 0;
	1412	while (defined $section) {
	1413	$length += $self->get_text_length($section);
	1414	$section = $self->get_next_section($section);
	1415	}
	1416	return $length;
	1417	}
	1418
[1241]	1419	sub delete_text {
	1420	my $self = shift (@_);
	1421	my ($section) = @_;
	1422
	1423	my $section_ptr = $self->_lookup_section($section);
	1424	if (!defined $section_ptr) {
	1425	print STDERR "doc::delete_text couldn't find section " .
	1426	"$section\n";
	1427	return;
	1428	}
	1429
	1430	$section_ptr->{'text'} = "";
	1431	}
	1432
[73]	1433	# add_text assumes the text is in (extended) ascii form. For
	1434	# text which has been already converted to the UTF-8 format
	1435	# use add_utf8_text.
[4]	1436	sub add_text {
	1437	my $self = shift (@_);
	1438	my ($section, $text) = @_;
	1439
[73]	1440	# convert the text to UTF-8 encoded unicode characters
	1441	# and add the text
[1870]	1442	$self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
[73]	1443	}
	1444
	1445
	1446	# add_utf8_text assumes the text to be added has already
	1447	# been converted to the UTF-8 encoding. For ascii text use
	1448	# add_text
[32590]	1449	# Pass by value version (internally calls pass by ref version
	1450	# to avoid code duplication)
[73]	1451	sub add_utf8_text {
	1452	my $self = shift (@_);
	1453	my ($section, $text) = @_;
[32590]	1454
	1455	$self->add_utf8_textref($section, \$text);
	1456	}
[73]	1457
[32590]	1458	# Pass by reference version, used by GreenstoneSQLPlugin for fulltext
	1459	sub add_utf8_textref {
	1460	my $self = shift (@_);
	1461	my ($section, $text_ref) = @_;
	1462
[4]	1463	my $section_ptr = $self->_lookup_section($section);
	1464	if (!defined $section_ptr) {
[32590]	1465	print STDERR "doc::add_utf8_textref couldn't find section " .
[4]	1466	"$section\n";
	1467	return;
	1468	}
	1469
[32590]	1470	$section_ptr->{'text'} .= $$text_ref;
[4]	1471	}
	1472
[16950]	1473	# returns the Source meta, which is the utf8 filename generated.
	1474	# Added a separate method here for convenience
	1475	sub get_source {
[16924]	1476	my $self = shift (@_);
[16950]	1477	return $self->get_metadata_element ($self->get_top_section(), "Source");
	1478	}
	1479
	1480	# returns the SourceFile meta, which is the url reference to the URL-encoded
	1481	# version of Source (the utf8 filename). Added a separate method here for convenience
	1482	sub get_sourcefile {
	1483	my $self = shift (@_);
	1484	return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
	1485	}
	1486
	1487	# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
	1488	# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
	1489	sub get_assocfile_from_sourcefile {
	1490	my $self = shift (@_);
[16924]	1491
	1492	# get the SourceFile meta, which is a URL to a file on the filesystem
	1493	my $top_section = $self->get_top_section();
	1494	my $source_file = $self->get_metadata_element($top_section, "SourceFile");
[4]	1495
[16924]	1496	# get the actual filename as it exists on the filesystem which this url refers to
[16928]	1497	$source_file = &unicode::url_to_filename($source_file);
[16924]	1498	my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
	1499	return $assocfilename;
	1500	}
	1501
[1241]	1502	# methods for dealing with associated files
	1503
	1504	# a file is associated with a document, NOT a section.
	1505	# if section is defined it is noted in the data structure
	1506	# only so that files associated from a particular section
	1507	# may be removed later (using delete_section_assoc_files)
	1508	sub associate_file {
	1509	my $self = shift (@_);
	1510	my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
	1511	$mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
	1512
	1513	# remove all associated files with the same name
	1514	$self->delete_assoc_file ($assoc_filename);
[23413]	1515
	1516	# Too harsh a requirement
	1517	# Definitely get HTML docs, for example, with some missing
	1518	# support files
	1519	# if (!&util::fd_exists($real_filename)) {
	1520	# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
	1521	# exit -1;
	1522	# }
	1523
[23387]	1524	# print STDERR "**** is the following a UTF8 rep of real filename?\n $real_filename\n";
	1525	# print STDERR "****##### so, ensure it is before storing?!?!?\n";
	1526	## my $utf8_filename = Encode::encode("utf8",$filename);
	1527
[1241]	1528	push (@{$self->{'associated_files'}},
	1529	[$real_filename, $assoc_filename, $mime_type, $section]);
	1530	}
	1531
	1532	# returns a list of associated files in the form
	1533	# [[real_filename, assoc_filename, mimetype], ...]
	1534	sub get_assoc_files {
	1535	my $self = shift (@_);
	1536
	1537	return $self->{'associated_files'};
	1538	}
	1539
[20775]	1540	# the following two methods used to keep track of original associated files
	1541	# for incremental building. eg a txt file used by an item file does not end
	1542	# up as an assoc file for the doc.xml, but it needs to be recorded as a source
	1543	# file for incremental build
	1544	sub associate_source_file {
	1545	my $self = shift (@_);
	1546	my ($full_filename) = @_;
[19494]	1547
[20775]	1548	push (@{$self->{'source_assoc_files'}}, $full_filename);
	1549
	1550	}
	1551
	1552	sub get_source_assoc_files {
	1553	my $self = shift (@_);
	1554
	1555	return $self->{'source_assoc_files'};
	1556
	1557
	1558	}
[19494]	1559	sub metadata_file {
	1560	my $self = shift (@_);
	1561	my ($real_filename, $filename) = @_;
	1562
	1563	push (@{$self->{'metadata_files'}},
	1564	[$real_filename, $filename]);
	1565	}
	1566
[21566]	1567	# used for writing out the archiveinf-doc info database, to list all the metadata files
[19494]	1568	sub get_meta_files {
	1569	my $self = shift (@_);
	1570
	1571	return $self->{'metadata_files'};
	1572	}
	1573
[1241]	1574	sub delete_section_assoc_files {
	1575	my $self = shift (@_);
	1576	my ($section) = @_;
	1577
	1578	my $i=0;
	1579	while ($i < scalar (@{$self->{'associated_files'}})) {
	1580	if (defined $self->{'associated_files'}->[$i]->[3] &&
	1581	$self->{'associated_files'}->[$i]->[3] eq $section) {
	1582	splice (@{$self->{'associated_files'}}, $i, 1);
	1583	} else {
	1584	$i++;
	1585	}
	1586	}
	1587	}
	1588
	1589	sub delete_assoc_file {
	1590	my $self = shift (@_);
	1591	my ($assoc_filename) = @_;
	1592
	1593	my $i=0;
	1594	while ($i < scalar (@{$self->{'associated_files'}})) {
	1595	if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
	1596	splice (@{$self->{'associated_files'}}, $i, 1);
	1597	} else {
	1598	$i++;
	1599	}
	1600	}
	1601	}
	1602
	1603	sub reset_nextsection_ptr {
	1604	my $self = shift (@_);
	1605	my ($section) = @_;
	1606
	1607	my $section_ptr = $self->_lookup_section($section);
	1608	$section_ptr->{'next_subsection'} = 1;
	1609	}
	1610
[4]	1611	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: