source: main/trunk/greenstone2/perllib/doc.pm@ 33302

Last change on this file since 33302 was 33302, checked in by ak19, 5 years ago
  1. Adding GPSMapOverlayLabel extracted from GPS.mapOverlay meta to text indexes for searching, as with Coordinte and CoordShort. 2. Added a shortname for this index, ML for MapLabel. 3. On testing the indexing of the GPSMapOverlayLabel text, the old problem of increasingly duplicated Coordinate/CoordShort and now also GPSMapOverlayLabel meta in the infodb reappeared. Dr Bainbridge explained why this was (documented as comments in this commit) and fixed the problem by not processing GPS.mapOverlay meta into Coordinate and Label meta during the infodb pass (and dummy pass, so specifically specifically non-text passes) of buildcol. A natural consequence is that to check whether Coord and Label meta have been indexed, can no longer check the index/text/col.jdb but need to use Luke (if a lucene collection ) to check contents of index/sidx and index/didx. 4. An important change needed for the bugfix in 3 is reordering call to &classify::reconstruct_doc_objs_metadata() in basebuilder.pm to take place AFTER build_proc->set_mode(infodb) has taken place. 5. Changed cross-files global variables declared in doc.pm from our to my variables and tested this works.
  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 49.9 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use FileUtils;
40use ghtml;
41use File::stat;
42##use hashdoc;
43use docprint;
44use JSON;
45
46# We just need pi from the Trig lib
47# Import constants pi2, pip2, pip4 (2*pi, pi/2, pi/4).
48use Math::Trig ':pi';
49
50# the document type may be indexed_doc, nonindexed_doc, or
51# classification
52
53our $OIDcount = 0;
54
55
56# Declare the Package (level) Variable that will tell us whether we're in the import or buildcol phase.
57# This var will be set by import.pl, buildcol.pl and export.pl to the then appropriate value.
58# https://perlmaven.com/package-variables-and-lexical-variables-in-perl
59# But: https://stackoverflow.com/questions/7542120/perl-how-to-make-variables-from-requiring-script-available-in-required-script
60# https://stackoverflow.com/questions/33230015/how-do-i-use-a-variable-in-another-perl-script
61#$doc::cmd_line_mode = undef;
62# Decided to follow the usage of $OIDcount above to declare the necessary package-level variables 'globally' accessible
63# actually accessible only for those packages that import this one (with 'use doc')
64my $cmd_line_mode = undef;
65
66# processor_mode keeps track of which buildcol pass we're at: dummy, text (sidx/didx passes) or infodb
67my $processor_mode = undef;
68
69# rename_method can be 'url', 'none', 'base64'
70sub new {
71 my $class = shift (@_);
72 my ($source_filename, $doc_type, $rename_method) = @_;
73
74
75 my $self = bless {'associated_files'=>[],
76 'subsection_order'=>[],
77 'next_subsection'=>1,
78 'subsections'=>{},
79 'metadata'=>[],
80 'text'=>"",
81 'OIDtype'=>"hash"}, $class;
82
83 # used to set lastmodified here, but this can screw up the HASH ids, so
84 # the docsave processor now calls set_lastmodified
85
86 $self->set_source_path($source_filename);
87
88 if (defined $source_filename) {
89 $source_filename = &util::filename_within_collection($source_filename);
90 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
91 $self->set_source_filename ($source_filename, $rename_method);
92 }
93
94 $self->set_doc_type ($doc_type) if defined $doc_type;
95
96 return $self;
97}
98
99
100sub set_source_path
101{
102 my $self = shift @_;
103 my ($source_filename) = @_;
104
105 if (defined $source_filename) {
106 # On Windows the source_filename can be in terse DOS format
107 # e.g. test~1.txt
108
109 $self->{'terse_source_path'} = $source_filename;
110
111 # Use the FileUtil library methods as they are aware of more special
112 # cases such as HDFS [jmt12]
113 if (&FileUtils::fileExists($source_filename))
114 {
115 # See if we can do better for Windows with a filename
116 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
117 require Win32;
118 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
119 }
120 else {
121 # For Unix-based systems, there is no difference between the two
122 $self->{'source_path'} = $source_filename;
123 }
124 }
125 else {
126 print STDERR "Warning: In doc::set_source_path(), file\n";
127 print STDERR " $source_filename\n";
128 print STDERR " does not exist\n";
129
130 # (default) Set it to whatever we were given
131 $self->{'source_path'} = $source_filename;
132 }
133 }
134 else {
135 # Previous code for setting source_path allowed for
136 # it to be undefined, so continue this practice
137 $self->{'terse_source_path'} = undef;
138 $self->{'source_path'} = undef;
139 }
140}
141
142
143sub get_source_path
144{
145 my $self = shift @_;
146
147 return $self->{'terse_source_path'};
148}
149
150# set lastmodified for OAI purposes, added by GRB, moved by kjdon
151sub set_oailastmodified {
152 my $self = shift (@_);
153
154 my $source_path = $self->{'terse_source_path'};
155
156 if (defined $source_path && (-e $source_path)) {
157 my $current_time = time;
158
159 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
160 $wday, $yday, $isdst) = localtime($current_time);
161
162 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
163
164 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
165 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
166 }
167}
168
169# no longer used for OAI purposes, since lastmodified is not what we want as the
170# Datestamp of a document. This doc metadata may be useful for general purposes.
171sub set_lastmodified {
172 my $self = shift (@_);
173
174 my $source_path = $self->{'terse_source_path'};
175
176 if (defined $source_path && (-e $source_path)) {
177
178 my $file_stat = stat($source_path);
179 my $mtime = $file_stat->mtime;
180 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
181 $wday, $yday, $isdst) = localtime($mtime);
182
183 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
184
185 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
186 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
187 }
188}
189
190# clone the $self object
191sub duplicate {
192 my $self = shift (@_);
193
194 my $newobj = {};
195
196 foreach my $k (keys %$self) {
197 $newobj->{$k} = &clone ($self->{$k});
198 }
199
200 bless $newobj, ref($self);
201 return $newobj;
202}
203
204sub clone {
205 my ($from) = @_;
206 my $type = ref ($from);
207
208 if ($type eq "HASH") {
209 my $to = {};
210 foreach my $key (keys %$from) {
211 $to->{$key} = &clone ($from->{$key});
212 }
213 return $to;
214 } elsif ($type eq "ARRAY") {
215 my $to = [];
216 foreach my $v (@$from) {
217 push (@$to, &clone ($v));
218 }
219 return $to;
220 } else {
221 return $from;
222 }
223}
224
225sub set_OIDtype {
226 my $self = shift (@_);
227 my ($type, $metadata) = @_;
228
229 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
230 $self->{'OIDtype'} = $type;
231 } else {
232 $self->{'OIDtype'} = "hash";
233 }
234
235 if ($type =~ /^assigned$/) {
236 if (defined $metadata) {
237 $self->{'OIDmetadata'} = $metadata;
238 } else {
239 $self->{'OIDmetadata'} = "dc.Identifier";
240 }
241 }
242}
243
244# rename_method can be 'url', 'none', 'base64'
245sub set_source_filename {
246 my $self = shift (@_);
247 my ($source_filename, $rename_method) = @_;
248
249 # Since the gsdlsourcefilename element goes into the doc.xml it has
250 # to be utf8. However, it should also *represent* the source filename
251 # (in the import directory) which may not be utf8 at all.
252 # For instance, if this meta element (gsdlsourcefilename) will be used
253 # by other applications that parse doc.xml in order to locate
254 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
255 # encode the real filename as this is a binary-to-text encoding meaning
256 # that the resulting string is ASCII (utf8). Decoding will give the original.
257
258# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
259
260 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
261 $source_filename = $self->encode_filename($source_filename, $rename_method);
262# my ($srcfilename,$dirname,$suffix)
263# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
264# print STDERR "-> $srcfilename -> ";
265# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
266# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
267# print STDERR "$source_filename\n";
268
269 $self->set_utf8_metadata_element ($self->get_top_section(),
270 "gsdlsourcefilename",
271 $source_filename);
272}
273
274sub encode_filename {
275 my $self = shift (@_);
276 my ($source_filename, $rename_method) = @_;
277
278 my ($srcfilename,$dirname,$suffix)
279 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
280# print STDERR "-> $srcfilename -> ";
281 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
282 $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
283
284 return $source_filename;
285}
286
287sub set_converted_filename {
288 my $self = shift (@_);
289 my ($converted_filename) = @_;
290
291 # we know the converted filename is utf8
292 $self->set_utf8_metadata_element ($self->get_top_section(),
293 "gsdlconvertedfilename",
294 $converted_filename);
295}
296
297# returns the source_filename as it was provided
298sub get_unmodified_source_filename {
299 my $self = shift (@_);
300
301 return $self->{'terse_source_path'};
302}
303
304# returns the source_filename with whatever rename_method was given
305sub get_source_filename {
306 my $self = shift (@_);
307
308 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
309}
310
311
312
313# returns converted filename if available else returns source filename
314sub get_filename_for_hashing {
315 my $self = shift (@_);
316
317 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
318
319 if (!defined $filename) {
320 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
321 # if NULPlug processed file, then don't give a filename
322 if (defined $plugin_name && $plugin_name eq "NULPlug") {
323 $filename = undef;
324 } else { # returns the URL encoded source filename!
325 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
326 }
327 }
328
329 if (!&FileUtils::isFilenameAbsolute($filename)) {
330 $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
331 }
332
333 return $filename;
334}
335
336sub set_doc_type {
337 my $self = shift (@_);
338 my ($doc_type) = @_;
339
340 $self->set_metadata_element ($self->get_top_section(),
341 "gsdldoctype",
342 $doc_type);
343}
344
345# returns the gsdldoctype as it was provided
346# the default of "indexed_doc" is used if no document
347# type was provided
348sub get_doc_type {
349 my $self = shift (@_);
350
351 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
352 return $doc_type if (defined $doc_type);
353 return "indexed_doc";
354}
355
356
357# look up the reference to the a particular section
358sub _lookup_section {
359 my $self = shift (@_);
360 my ($section) = @_;
361
362 my ($num);
363 my $sectionref = $self;
364
365 while (defined $section && $section ne "") {
366
367 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
368
369 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
370
371 $section = "" unless defined $section;
372
373
374 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
375 $sectionref = $sectionref->{'subsections'}->{$num};
376 } else {
377 return undef;
378 }
379 }
380
381 return $sectionref;
382}
383
384# calculate OID by hashing the contents of the document
385sub _calc_OID {
386 my $self = shift (@_);
387 my ($filename) = @_;
388
389
390 my $osexe = &util::get_os_exe();
391
392 my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
393 $ENV{'GSDLOS'},"hashfile$osexe");
394
395 &util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
396 # The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
397
398 # A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
399 # (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
400 # E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
401
402 my $result = "NULL";
403
404
405 if (-e "$hashfile_exe") {
406# $result = `\"$hashfile_exe\" \"$filename\"`;
407# $result = `hashfile$osexe \"$filename\" 2>&1`;
408 $result = `hashfile$osexe \"$filename\"`;
409
410 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
411 } else {
412 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
413 }
414 return "HASH$result";
415}
416
417# methods dealing with OID, not groups of them.
418
419# if $OID is not provided one is calculated
420sub set_OID {
421 my $self = shift (@_);
422 my ($OID) = @_;
423
424 my $use_hash_oid = 0;
425 # if an OID wasn't provided calculate one
426 if (!defined $OID) {
427 $OID = "NULL";
428 if ($self->{'OIDtype'} =~ /^hash/) {
429 $use_hash_oid = 1;
430 } elsif ($self->{'OIDtype'} eq "incremental") {
431 $OID = "D" . $OIDcount;
432 $OIDcount ++;
433 } elsif ($self->{'OIDtype'} eq "filename") {
434 my $filename = $self->get_source_filename();
435 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
436 $OID = &util::tidy_up_oid($OID);
437 } elsif ($self->{'OIDtype'} eq "full_filename") {
438 my $source_filename = $self->get_source_filename();
439 my $dirsep = &util::get_os_dirsep();
440
441 $source_filename =~ s/^import$dirsep//;
442 $source_filename =~ s/$dirsep/-/g;
443 $source_filename =~ s/\./_/g;
444
445 $OID = $source_filename;
446 $OID = &util::tidy_up_oid($OID);
447 } elsif ($self->{'OIDtype'} eq "dirname") {
448 my $filename = $self->get_source_filename();
449 if (defined($filename)) { # && -e $filename) {
450 # get the immediate parent directory
451 $OID = &File::Basename::dirname($filename);
452 if (defined $OID) {
453 $OID = &File::Basename::basename($OID);
454 $OID = &util::tidy_up_oid($OID);
455 } else {
456 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
457 $use_hash_oid = 1;
458 }
459 } else {
460 print STDERR "Failed to find a filename, generating hash id\n";
461 $use_hash_oid = 1;
462 }
463
464 } elsif ($self->{'OIDtype'} eq "assigned") {
465 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
466 if (defined $identifier && $identifier ne "") {
467 $OID = $identifier;
468 $OID = &util::tidy_up_oid($OID);
469 } else {
470 # need a hash id
471 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
472 $use_hash_oid = 1;
473 }
474
475 } else {
476 $use_hash_oid = 1;
477 }
478
479 if ($use_hash_oid) {
480 my $hash_on_file = 1;
481 my $hash_on_ga_xml = 0;
482
483 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
484 $hash_on_file = 0;
485 $hash_on_ga_xml = 1;
486 }
487
488 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
489 $hash_on_file = 0;
490 $hash_on_ga_xml = 0;
491
492 my $source_filename = $self->get_source_filename();
493 my $dirsep = &util::get_os_dirsep();
494
495 $source_filename =~ s/^import$dirsep//;
496 $source_filename =~ s/$dirsep/-/g;
497 $source_filename =~ s/\./_/g;
498
499 # If the filename is very short then (handled naively)
500 # this can cause conjestion in the hash-values
501 # computed, leading documents sharing the same leading
502 # Hex values in the computed has.
503 #
504 # The solution taken here is to replace the name of
505 # the file name a sufficient number of times (up to
506 # the character limit defined in 'rep_limit' and
507 # make that the content that is hashed on
508
509 # *** Think twice before changing the following value
510 # as it will break backward compatability of computed
511 # document HASH values
512
513 my $rep_limit = 256;
514 my $hash_content = undef;
515
516 if (length($source_filename)<$rep_limit) {
517 my $rep_string = "$source_filename|";
518 my $rs_len = length($rep_string);
519
520 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
521
522 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
523 }
524 else {
525 $hash_content = $source_filename;
526 }
527
528 my $filename = &util::get_tmp_filename();
529 if (!open (OUTFILE, ">:utf8", $filename)) {
530 print STDERR "doc::set_OID could not write to $filename\n";
531 } else {
532 print OUTFILE $hash_content;
533 close (OUTFILE);
534 }
535 $OID = $self->_calc_OID ($filename);
536
537 &FileUtils::removeFiles ($filename);
538 }
539
540 if ($hash_on_file) {
541 # "hash" OID - feed file to hashfile.exe
542 my $filename = $self->get_filename_for_hashing();
543
544 # -z: don't want to hash on the file if it is zero size
545 if (defined($filename) && -e $filename && !-z $filename) {
546 $OID = $self->_calc_OID ($filename);
547 } else {
548 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
549 }
550 }
551
552 if ($hash_on_ga_xml) {
553 # In addition being asked to explicity calculate the has based on the GA file,
554 # can also end up coming into this block is doing 'hash_on_file' but the file
555 # itself is of zero bytes (as could be the case with 'doc.nul' file
556
557 my $filename = &util::get_tmp_filename();
558 if (!open (OUTFILE, ">:utf8", $filename)) {
559 print STDERR "doc::set_OID could not write to $filename\n";
560 } else {
561 my $doc_text = &docprint::get_section_xml($self);
562 print OUTFILE $doc_text;
563 close (OUTFILE);
564 }
565 $OID = $self->_calc_OID ($filename);
566 &FileUtils::removeFiles($filename);
567 }
568 }
569 }
570 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
571}
572
573# this uses hashdoc (embedded c thingy) which is faster but still
574# needs a little work to be sufficiently stable
575sub ___set_OID {
576 my $self = shift (@_);
577 my ($OID) = @_;
578
579 # if an OID wasn't provided then calculate hash value based on document
580 if (!defined $OID)
581 {
582 my $hash_text = &docprint::get_section_xml($self);
583 my $hash_len = length($hash_text);
584
585 $OID = &hashdoc::buffer($hash_text,$hash_len);
586 }
587
588 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
589}
590
591# returns the OID for this document
592sub get_OID {
593 my $self = shift (@_);
594 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
595 return $OID if (defined $OID);
596 return "NULL";
597}
598
599sub delete_OID {
600 my $self = shift (@_);
601
602 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
603}
604
605
606# methods for manipulating section names
607
608# returns the name of the top-most section (the top
609# level of the document
610sub get_top_section {
611 my $self = shift (@_);
612
613 return "";
614}
615
616# returns a section
617sub get_parent_section {
618 my $self = shift (@_);
619 my ($section) = @_;
620
621 $section =~ s/(^|\.)\d+$//;
622
623 return $section;
624}
625
626# returns the first child section (or the end child
627# if there isn't any)
628sub get_begin_child {
629 my $self = shift (@_);
630 my ($section) = @_;
631
632 my $section_ptr = $self->_lookup_section($section);
633 return "" unless defined $section_ptr;
634
635 if (defined $section_ptr->{'subsection_order'}->[0]) {
636 return "$section.$section_ptr->{'subsection_order'}->[0]";
637 }
638
639 return $self->get_end_child ($section);
640}
641
642# returns the next child of a parent section
643sub get_next_child {
644 my $self = shift (@_);
645 my ($section) = @_;
646
647 my $parent_section = $self->get_parent_section($section);
648 my $parent_section_ptr = $self->_lookup_section($parent_section);
649 return undef unless defined $parent_section_ptr;
650
651 my ($section_num) = $section =~ /(\d+)$/;
652 return undef unless defined $section_num;
653
654 my $i = 0;
655 my $section_order = $parent_section_ptr->{'subsection_order'};
656 while ($i < scalar(@$section_order)) {
657 last if $section_order->[$i] eq $section_num;
658 $i++;
659 }
660
661 $i++; # the next child
662 if ($i < scalar(@$section_order)) {
663 return $section_order->[$i] if $parent_section eq "";
664 return "$parent_section.$section_order->[$i]";
665 }
666
667 # no more sections in this level
668 return undef;
669}
670
671# returns a reference to a list of children
672sub get_children {
673 my $self = shift (@_);
674 my ($section) = @_;
675
676 my $section_ptr = $self->_lookup_section($section);
677 return [] unless defined $section_ptr;
678
679 my @children = @{$section_ptr->{'subsection_order'}};
680
681 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
682 return \@children;
683}
684
685# returns the child section one past the last one (which
686# is coded as "0")
687sub get_end_child {
688 my $self = shift (@_);
689 my ($section) = @_;
690
691 return $section . ".0" unless $section eq "";
692 return "0";
693}
694
695# returns the next section in book order
696sub get_next_section {
697 my $self = shift (@_);
698 my ($section) = @_;
699
700 return undef unless defined $section;
701
702 my $section_ptr = $self->_lookup_section($section);
703 return undef unless defined $section_ptr;
704
705 # first try to find first child
706 if (defined $section_ptr->{'subsection_order'}->[0]) {
707 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
708 return "$section.$section_ptr->{'subsection_order'}->[0]";
709 }
710
711 do {
712 # try to find sibling
713 my $next_child = $self->get_next_child ($section);
714 return $next_child if (defined $next_child);
715
716 # move up one level
717 $section = $self->get_parent_section ($section);
718 } while $section =~ /\d/;
719
720 return undef;
721}
722
723sub is_leaf_section {
724 my $self = shift (@_);
725 my ($section) = @_;
726
727 my $section_ptr = $self->_lookup_section($section);
728 return 1 unless defined $section_ptr;
729
730 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
731}
732
733# methods for dealing with sections
734
735# returns the name of the inserted section
736sub insert_section {
737 my $self = shift (@_);
738 my ($before_section) = @_;
739
740 # get the child to insert before and its parent section
741 my $parent_section = "";
742 my $before_child = "0";
743 my @before_section = split (/\./, $before_section);
744 if (scalar(@before_section) > 0) {
745 $before_child = pop (@before_section);
746 $parent_section = join (".", @before_section);
747 }
748
749 my $parent_section_ptr = $self->_lookup_section($parent_section);
750 if (!defined $parent_section_ptr) {
751 print STDERR "doc::insert_section couldn't find parent section " .
752 "$parent_section\n";
753 return;
754 }
755
756 # get the next section number
757 my $section_num = $parent_section_ptr->{'next_subsection'}++;
758
759 my $i = 0;
760 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
761 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
762 $i++;
763 }
764
765 # insert the section number into the order list
766 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
767
768 # add this section to the parent section
769 my $section_ptr = {'subsection_order'=>[],
770 'next_subsection'=>1,
771 'subsections'=>{},
772 'metadata'=>[],
773 'text'=>""};
774 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
775
776 # work out the full section number
777 my $section = $parent_section;
778 $section .= "." unless $section eq "";
779 $section .= $section_num;
780
781 return $section;
782}
783
784# creates a pre-named section
785sub create_named_section {
786 my $self = shift (@_);
787 my ($mastersection) = @_;
788
789 my ($num);
790 my $section = $mastersection;
791 my $sectionref = $self;
792
793 while ($section ne "") {
794 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
795 $num =~ s/^0+(\d)/$1/; # remove leading 0s
796 $section = "" unless defined $section;
797
798 if (defined $num) {
799 if (!defined $sectionref->{'subsections'}->{$num}) {
800 push (@{$sectionref->{'subsection_order'}}, $num);
801 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
802 'next_subsection'=>1,
803 'subsections'=>{},
804 'metadata'=>[],
805 'text'=>""};
806 if ($num >= $sectionref->{'next_subsection'}) {
807 $sectionref->{'next_subsection'} = $num + 1;
808 }
809 }
810 $sectionref = $sectionref->{'subsections'}->{$num};
811
812 } else {
813 print STDERR "doc::create_named_section couldn't create section ";
814 print STDERR "$mastersection\n";
815 last;
816 }
817 }
818}
819
820# returns a reference to a list of subsections
821sub list_subsections {
822 my $self = shift (@_);
823 my ($section) = @_;
824
825 my $section_ptr = $self->_lookup_section ($section);
826 if (!defined $section_ptr) {
827 print STDERR "doc::list_subsections couldn't find section $section\n";
828 return [];
829 }
830
831 return [@{$section_ptr->{'subsection_order'}}];
832}
833
834sub delete_section {
835 my $self = shift (@_);
836 my ($section) = @_;
837
838# my $section_ptr = {'subsection_order'=>[],
839# 'next_subsection'=>1,
840# 'subsections'=>{},
841# 'metadata'=>[],
842# 'text'=>""};
843
844 # if this is the top section reset everything
845 if ($section eq "") {
846 $self->{'subsection_order'} = [];
847 $self->{'subsections'} = {};
848 $self->{'metadata'} = [];
849 $self->{'text'} = "";
850 return;
851 }
852
853 # find the parent of the section to delete
854 my $parent_section = "";
855 my $child = "0";
856 my @section = split (/\./, $section);
857 if (scalar(@section) > 0) {
858 $child = pop (@section);
859 $parent_section = join (".", @section);
860 }
861
862 my $parent_section_ptr = $self->_lookup_section($parent_section);
863 if (!defined $parent_section_ptr) {
864 print STDERR "doc::delete_section couldn't find parent section " .
865 "$parent_section\n";
866 return;
867 }
868
869 # remove this section from the subsection_order list
870 my $i = 0;
871 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
872 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
873 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
874 last;
875 }
876 $i++;
877 }
878
879 # remove this section from the subsection hash
880 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
881 undef $parent_section_ptr->{'subsections'}->{$child};
882 }
883}
884
885#--
886# methods for dealing with metadata
887
888# set_metadata_element and get_metadata_element are for metadata
889# which should only have one value. add_meta_data and get_metadata
890# are for metadata which can have more than one value.
891
892# returns the first metadata value which matches field
893
894# This version of get metadata element works much like the one above,
895# except it allows for the namespace portion of a metadata element to
896# be ignored, thus if you are searching for dc.Title, the first piece
897# of matching metadata ending with the name Title (once any namespace
898# is removed) would be returned.
899# 28-11-2003 John Thompson
900sub get_metadata_element {
901 my $self = shift (@_);
902 my ($section, $field, $ignore_namespace) = @_;
903 my ($data);
904
905 $ignore_namespace = 0 unless defined $ignore_namespace;
906
907 my $section_ptr = $self->_lookup_section($section);
908 if (!defined $section_ptr) {
909 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
910 return;
911 }
912
913 # Remove any namespace if we are being told to ignore them
914 if($ignore_namespace) {
915 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
916 }
917
918 foreach $data (@{$section_ptr->{'metadata'}}) {
919
920 my $data_name = $data->[0];
921
922 # Remove any namespace if we are being told to ignore them
923 if($ignore_namespace) {
924 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
925 }
926 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
927 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
928
929 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
930 }
931
932 return undef; # was not found
933}
934
935# returns a list of the form [value1, value2, ...]
936sub get_metadata {
937 my $self = shift (@_);
938 my ($section, $field, $ignore_namespace) = @_;
939 my ($data);
940
941 $ignore_namespace = 0 unless defined $ignore_namespace;
942
943 my $section_ptr = $self->_lookup_section($section);
944 if (!defined $section_ptr) {
945 print STDERR "doc::get_metadata couldn't find section ",
946 $section, "\n";
947 return;
948 }
949
950 # Remove any namespace if we are being told to ignore them
951 if($ignore_namespace) {
952 $field =~ s/^.*\.//;
953 }
954
955 my @metadata = ();
956 foreach $data (@{$section_ptr->{'metadata'}}) {
957
958 my $data_name = $data->[0];
959
960 # Remove any namespace if we are being told to ignore them
961 if($ignore_namespace) {
962 $data_name =~ s/^.*\.//;
963 }
964 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
965 $data_name =~ s/^ex\.([^.]+)$/$1/;
966
967 push (@metadata, $data->[1]) if ($data_name eq $field);
968 }
969
970 return \@metadata;
971}
972
973sub get_metadata_hashmap {
974 my $self = shift (@_);
975 my ($section, $opt_namespace) = @_;
976
977 my $section_ptr = $self->_lookup_section($section);
978 if (!defined $section_ptr) {
979 print STDERR "doc::get_metadata couldn't find section ",
980 $section, "\n";
981 return;
982 }
983
984 my $metadata_hashmap = {};
985 foreach my $data (@{$section_ptr->{'metadata'}}) {
986 my $metaname = $data->[0];
987
988 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
989 if (!defined $metadata_hashmap->{$metaname}) {
990 $metadata_hashmap->{$metaname} = [];
991 }
992 my $metaval_list = $metadata_hashmap->{$metaname};
993 push(@$metaval_list, $data->[1]);
994 }
995 }
996
997 return $metadata_hashmap;
998}
999
1000# returns a list of the form [[field,value],[field,value],...]
1001sub get_all_metadata {
1002 my $self = shift (@_);
1003 my ($section) = @_;
1004
1005 my $section_ptr = $self->_lookup_section($section);
1006 if (!defined $section_ptr) {
1007 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
1008 return;
1009 }
1010
1011 return $section_ptr->{'metadata'};
1012}
1013
1014# $value is optional
1015sub delete_metadata {
1016 my $self = shift (@_);
1017 my ($section, $field, $value) = @_;
1018
1019 my $section_ptr = $self->_lookup_section($section);
1020 if (!defined $section_ptr) {
1021 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
1022 return;
1023 }
1024
1025 my $i = 0;
1026 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1027 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1028 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1029 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1030 } else {
1031 $i++;
1032 }
1033 }
1034}
1035
1036sub delete_all_metadata {
1037 my $self = shift (@_);
1038 my ($section) = @_;
1039
1040 my $section_ptr = $self->_lookup_section($section);
1041 if (!defined $section_ptr) {
1042 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1043 return;
1044 }
1045
1046 $section_ptr->{'metadata'} = [];
1047}
1048
1049sub set_metadata_element {
1050 my $self = shift (@_);
1051 my ($section, $field, $value) = @_;
1052
1053 $self->set_utf8_metadata_element ($section, $field,
1054 &unicode::ascii2utf8(\$value));
1055}
1056
1057# set_utf8_metadata_element assumes the text has already been
1058# converted to the UTF-8 encoding.
1059sub set_utf8_metadata_element {
1060 my $self = shift (@_);
1061 my ($section, $field, $value) = @_;
1062
1063 $self->delete_metadata ($section, $field);
1064 $self->add_utf8_metadata ($section, $field, $value);
1065}
1066
1067
1068# add_metadata assumes the text is in (extended) ascii form. For
1069# text which has already been converted to the UTF-8 format use
1070# add_utf8_metadata.
1071sub add_metadata {
1072 my $self = shift (@_);
1073 my ($section, $field, $value) = @_;
1074
1075 $self->add_utf8_metadata ($section, $field,
1076 &unicode::ascii2utf8(\$value));
1077}
1078
1079sub add_utf8_metadata {
1080 my $self = shift (@_);
1081 my ($section, $field, $value) = @_;
1082
1083 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1084 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1085 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1086
1087 my $section_ptr = $self->_lookup_section($section);
1088 if (!defined $section_ptr) {
1089 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1090 return;
1091 }
1092 if (!defined $value) {
1093 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1094 return;
1095 }
1096 if (!defined $field) {
1097 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1098 return;
1099 }
1100
1101 #print STDERR "###$field=$value\n";
1102
1103 # For now, supress this check. Given that text data read in is now
1104 # Unicode aware, then the following block of code can (ironically enough)
1105 # cause our unicode compliant string to be re-encoded (leading to
1106 # a double-encoded UTF-8 string, which we definitely don't want!).
1107
1108
1109 # double check that the value is utf-8
1110 # if (!&unicode::check_is_utf8($value)) {
1111 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1112 # &unicode::ensure_utf8(\$value);
1113 # print STDERR " Tried converting to utf8: $value\n";
1114 # }
1115
1116 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1117 if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1118 {
1119 my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1120 if (defined $mdprefix) {
1121 # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1122 push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1123 }
1124
1125 my $direction;
1126 if($value =~ m/^-/)
1127 {
1128 $direction = ($field eq "Latitude") ? "S" : "W";
1129 }
1130 else
1131 {
1132 $direction = ($field eq "Latitude") ? "N" : "E";
1133 }
1134
1135 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1136 if(defined $beforeDec && defined $afterDec)
1137 {
1138 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1139 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1140
1141 for(my $i = 2; $i <= 4; $i++)
1142 {
1143 if(length($afterDec) >= $i)
1144 {
1145 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1146 }
1147 }
1148
1149 #Only add the metadata if it has not already been added
1150 my $metaMap = $self->get_metadata_hashmap($section); # metaMap not used and called function has no apparent side-effects,
1151 # but this line appears important for ensuring uniqueness of (Latitude, value) meta
1152 # in the section. Also for LatShort, Longitude, LngShort.
1153 }
1154
1155 if($field =~ m/^(.+\.)?Longitude$/) {
1156 # if we are dealing with Longitude meta, we should 1. have Latitude meta too; 2. already have processed Latitude meta
1157 # in that case, add both Lat and Lng of this section as a Coordinate meta
1158 my $latitude = $self->get_metadata_element ($section, "Latitude");
1159 my $longitude = $value;
1160 # TODO: would like all Longitude meta placed together, followed by all Coordinate meta,
1161 # but the following will add all such coord meta to the doc object and the end of this function will add Longitude meta
1162 $self->processCoordinate($section, $latitude, $longitude);
1163 }
1164 }
1165
1166 elsif($field eq "GPS.mapOverlay") { # then the $value is a JSON string
1167
1168 # In order to allow searching map data enriched documents by map shape descriptions,
1169 # and to run rawquery searches for other docs by proximity based on their map data,
1170 # need to store the shape descriptions and Coordinate info for shapes into the text index.
1171 # We add the description for each shape in the mapoverlay into the text index as GPSMapOverlayLabel
1172 # And we add Coordinate (CD) and CoordShort (CS) info for each shape in the mapoverlay in the format (Lat, Lng) as ("37S339 175E342")
1173 # where the digits before the N/S/E/W direction represents the whole number. And the digits after the direction are the
1174 # decimal places which can range from 0 and 2-4 digits.
1175
1176 # However, we only want to process GPS.mapOverlay only during buildcol and only in the text indexing passes (e.g. sidx and didx for lucene)
1177 # and certainly never during the infodb pass of buildcol. The latter can end up duplicating Coordinate/CoordShort/GPSMapOverlayLabel for
1178 # when rebuilding with the online doc editor as that runs incremental-rebuild which then calls basebuilder::reconstruct_doc_objs_metadata()
1179 # on all docs NOT being incrementally rebuilt. That call would get meta from the infodb and use it to reconstruct the doc objects of docs
1180 # NOT being incrementally built. If the Coord and Label meta were written to the infodb, they would then be loaded back in when the collection
1181 # is incrementally rebuilt for those docs that don't need incremental processing. Then this function would once again add the same meta into
1182 # the infodb, thus duplicating what goes into the infodb. Hence, don't do all the following if doc::processor_mode eq "infodb".
1183
1184 # Note that for incremental rebuilding, the text pass can be called textreindex for instance (and infodb pass can be incinfodb).
1185 # So don't check for exact string match
1186
1187 if($doc::cmd_line_mode eq "buildcol" && $doc::processor_mode =~ m/text/) {# && $doc::processor_mode !~ m/infodb/) # if dummy pass important
1188
1189 ###print STDERR "GPS.mapOverlay before decoding, val = " . $value . "\n";
1190
1191 # TODO: html decode?
1192 $value =~ s@&#091;@[@g;
1193 $value =~ s@&#093;@]@g;
1194 $value =~ s@&quot;@"@g;
1195 ###print STDERR "GPS.mapOverlay after decoding, val = " . $value . "\n";
1196
1197 my $json_array = decode_json $value;
1198
1199 foreach my $shape (@$json_array) {
1200
1201 # Put each available shape description/label into this section's metadata with GPSMapOverlayLabel as metaname.
1202 # Just as for Coordinate meta, don't need to know which shape a label belongs too. This is just so each label
1203 # will be indexed, and therefore can be searched.
1204
1205 my $description = $shape->{"description"};
1206 if($description) {
1207 push (@{$section_ptr->{'metadata'}}, ["GPSMapOverlayLabel", $description]);
1208 ###print STDERR "@@@@############################################ Just added description meta: " . $description . "\n";
1209 }
1210
1211 my $type = $shape->{"type"};
1212 ###print STDERR "Shape type : " . $type . "\n";
1213
1214 if($type eq "circle") {
1215 ###print STDERR "Found a circle:\n" . &printShape($json, $shape);
1216
1217 # work out bounding box
1218 # want the inverse of this useful page:
1219 # https://stackoverflow.com/questions/639695/how-to-convert-latitude-or-longitude-to-meters
1220 # https://www.geodatasource.com/developers/javascript
1221
1222 # Dr Bainbridge wants us to use: https://gis.stackexchange.com/questions/5821/calculating-latitude-longitude-x-miles-from-point
1223 # Since for N,E,S,W the following works out the same as above link, we're using the rule of thumb in the following:
1224 # https://gis.stackexchange.com/questions/2951/algorithm-for-offsetting-a-latitude-longitude-by-some-amount-of-meters
1225 # which states
1226 # "If your displacements aren't too great (less than a few kilometers) and you're not right at the poles,
1227 # use the quick and dirty estimate that 111,111 meters (111.111 km) in the y direction is 1 degree
1228 # (of latitude) and 111,111 * cos(latitude) meters in the x direction is 1 degree (of longitude)."
1229 my $centre_lat = $shape->{"center"}->{"lat"};
1230 my $centre_lng = $shape->{"center"}->{"lng"};
1231 my $radius = $shape->{"radius"}; # in metres!
1232
1233 ###print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
1234
1235 my $lat_north = $centre_lat + ($radius/111111);
1236 my $lat_south = $centre_lat - ($radius/111111);
1237
1238 ###print STDERR "### lat_north: $lat_north\n";
1239 ###print STDERR "### lat_south: $lat_south\n";
1240
1241 # our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
1242 # all expect their angle to be in radians. So need to convert from degree to radians before we can take the cos of it.
1243 my $centre_lat_radians = $self->degreesToRadians($centre_lat);
1244 my $cos_in_radians = cos($centre_lat_radians);
1245 ###print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
1246 my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
1247 my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
1248 ###print STDERR "### lng_east $lng_east\n";
1249 ###print STDERR "### lng_west $lng_west\n";
1250
1251 my $cos_lat = cos($centre_lat);
1252 ###print STDERR "cos $centre_lat is $cos_lat\n";
1253
1254 $self->processCoordinate($section, $lat_north, $lng_east);
1255 $self->processCoordinate($section, $lat_south, $lng_east);
1256 $self->processCoordinate($section, $lat_south, $lng_west);
1257 $self->processCoordinate($section, $lat_north, $lng_west);
1258
1259 }
1260 elsif ($type eq "marker") {
1261 ###print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
1262 ###print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
1263 $self->processCoordinate($section, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});
1264 }
1265 elsif ($type eq "polyline" || $type eq "polygon") {
1266 my $path_array = $shape->{"path"};
1267 foreach my $position (@$path_array) {
1268 $self->processCoordinate($section, $position->{"lat"}, $position->{"lng"});
1269 }
1270 }
1271 elsif ($type eq "rectangle") {
1272
1273 my $bounds = $shape->{"bounds"};
1274 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"east"});
1275 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"east"});
1276 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"west"});
1277 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"west"});
1278 }
1279
1280 } # end for on each shape in GPS.mapOverlay
1281 } # end if(buildcol and text pass)
1282 } # end GPS.mapOverlay meta
1283
1284 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1285}
1286
1287# https://en.wikipedia.org/wiki/Radian
1288sub degreesToRadians
1289{
1290 my $self = shift (@_);
1291 my ($degrees) = @_;
1292
1293 return $degrees * pi /180; # returns radians
1294}
1295
1296sub radiansToDegrees
1297{
1298 my $self = shift (@_);
1299 my ($radians) = @_;
1300
1301 return $radians * 180 / pi; # returns degrees
1302}
1303
1304# Call as:
1305# my $json = JSON->new->allow_nonref;
1306# &printAllShapes($json, $json_array);
1307sub printAllShapes {
1308 my ($json, $json_array) = @_;
1309
1310
1311 #my $pretty_print_shape = $json->pretty->encode( $json_array->[0] );
1312 foreach my $shape (@$json_array) {
1313 my $pretty_print_shape = $json->pretty->encode( $shape );
1314 print STDERR "Shape: $pretty_print_shape\n";
1315 }
1316
1317}
1318
1319# For the (lat, lng) coordinate given,
1320# attaches Coordinate and multiple CoordShort (different precision level) metadata to the doc object
1321sub processCoordinate {
1322 my $self = shift (@_);
1323 my ($section, $latitude, $longitude) = @_;
1324
1325 my $section_ptr = $self->_lookup_section($section);
1326
1327 my $lat_direction = ($latitude =~ m/^-/) ? "S" : "N";
1328 my $lng_direction = ($longitude =~ m/^-/) ? "W" : "E";
1329
1330 # have to store (lat, lng) in pairs, when there are so many coords to store
1331 #push (@{$section_ptr->{'metadata'}}, ["Latitude", $latitude]);
1332 #push (@{$section_ptr->{'metadata'}}, ["Longitude", $longitude]);
1333
1334 push (@{$section_ptr->{'metadata'}}, ["Coordinate", "$latitude $longitude"]); # "$latitude$lat_direction $longitude$lng_direction"
1335
1336 my ($latBeforeDec, $latAfterDec);
1337 my ($lngBeforeDec, $lngAfterDec);
1338
1339 if($latitude !~ m/\./) {
1340 $latBeforeDec = $latitude;
1341 $latAfterDec = "";
1342 } else {
1343 ($latBeforeDec, $latAfterDec) = ($latitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1344 }
1345 if($longitude !~ m/\./) {
1346 $lngBeforeDec = $longitude;
1347 $lngAfterDec = "";
1348 } else {
1349 ($lngBeforeDec, $lngAfterDec) = ($longitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1350 }
1351
1352
1353 my $name = "CoordShort";
1354 push (@{$section_ptr->{'metadata'}}, [$name, "$latBeforeDec$lat_direction $lngBeforeDec$lng_direction"]);
1355
1356 for(my $i = 2; $i <= 4; $i++)
1357 {
1358 my $latDecPlaces = (length($latAfterDec) >= $i) ? substr($latAfterDec, 0, $i) : "";
1359 my $lngDecPlaces = (length($lngAfterDec) >= $i) ? substr($lngAfterDec, 0, $i) : "";
1360
1361 push (@{$section_ptr->{'metadata'}}, [$name,
1362 $latBeforeDec . $lat_direction. $latDecPlaces . " " . $lngBeforeDec . $lng_direction. $lngDecPlaces]);
1363
1364 }
1365
1366 #Only add the metadata if it has not already been added
1367 #my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no apparent side-effects.)
1368}
1369
1370
1371# methods for dealing with text
1372
1373# returns the text for a section
1374sub get_text {
1375 my $self = shift (@_);
1376 my ($section) = @_;
1377
1378 my $section_ptr = $self->_lookup_section($section);
1379 if (!defined $section_ptr) {
1380 print STDERR "doc::get_text couldn't find section " .
1381 "$section\n";
1382 return "";
1383 }
1384
1385 return $section_ptr->{'text'};
1386}
1387
1388# returns the (utf-8 encoded) length of the text for a section
1389sub get_text_length {
1390 my $self = shift (@_);
1391 my ($section) = @_;
1392
1393 my $section_ptr = $self->_lookup_section($section);
1394 if (!defined $section_ptr) {
1395 print STDERR "doc::get_text_length couldn't find section " .
1396 "$section\n";
1397 return 0;
1398 }
1399
1400 return length ($section_ptr->{'text'});
1401}
1402
1403# returns the total length for all the sections
1404sub get_total_text_length {
1405 my $self = shift (@_);
1406
1407 my $section = $self->get_top_section();
1408 my $length = 0;
1409 while (defined $section) {
1410 $length += $self->get_text_length($section);
1411 $section = $self->get_next_section($section);
1412 }
1413 return $length;
1414}
1415
1416sub delete_text {
1417 my $self = shift (@_);
1418 my ($section) = @_;
1419
1420 my $section_ptr = $self->_lookup_section($section);
1421 if (!defined $section_ptr) {
1422 print STDERR "doc::delete_text couldn't find section " .
1423 "$section\n";
1424 return;
1425 }
1426
1427 $section_ptr->{'text'} = "";
1428}
1429
1430# add_text assumes the text is in (extended) ascii form. For
1431# text which has been already converted to the UTF-8 format
1432# use add_utf8_text.
1433sub add_text {
1434 my $self = shift (@_);
1435 my ($section, $text) = @_;
1436
1437 # convert the text to UTF-8 encoded unicode characters
1438 # and add the text
1439 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1440}
1441
1442
1443# add_utf8_text assumes the text to be added has already
1444# been converted to the UTF-8 encoding. For ascii text use
1445# add_text
1446# Pass by value version (internally calls pass by ref version
1447# to avoid code duplication)
1448sub add_utf8_text {
1449 my $self = shift (@_);
1450 my ($section, $text) = @_;
1451
1452 $self->add_utf8_textref($section, \$text);
1453}
1454
1455# Pass by reference version, used by GreenstoneSQLPlugin for fulltext
1456sub add_utf8_textref {
1457 my $self = shift (@_);
1458 my ($section, $text_ref) = @_;
1459
1460 my $section_ptr = $self->_lookup_section($section);
1461 if (!defined $section_ptr) {
1462 print STDERR "doc::add_utf8_textref couldn't find section " .
1463 "$section\n";
1464 return;
1465 }
1466
1467 $section_ptr->{'text'} .= $$text_ref;
1468}
1469
1470# returns the Source meta, which is the utf8 filename generated.
1471# Added a separate method here for convenience
1472sub get_source {
1473 my $self = shift (@_);
1474 return $self->get_metadata_element ($self->get_top_section(), "Source");
1475}
1476
1477# returns the SourceFile meta, which is the url reference to the URL-encoded
1478# version of Source (the utf8 filename). Added a separate method here for convenience
1479sub get_sourcefile {
1480 my $self = shift (@_);
1481 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1482}
1483
1484# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1485# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1486sub get_assocfile_from_sourcefile {
1487 my $self = shift (@_);
1488
1489 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1490 my $top_section = $self->get_top_section();
1491 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1492
1493 # get the actual filename as it exists on the filesystem which this url refers to
1494 $source_file = &unicode::url_to_filename($source_file);
1495 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1496 return $assocfilename;
1497}
1498
1499# methods for dealing with associated files
1500
1501# a file is associated with a document, NOT a section.
1502# if section is defined it is noted in the data structure
1503# only so that files associated from a particular section
1504# may be removed later (using delete_section_assoc_files)
1505sub associate_file {
1506 my $self = shift (@_);
1507 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1508 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1509
1510 # remove all associated files with the same name
1511 $self->delete_assoc_file ($assoc_filename);
1512
1513 # Too harsh a requirement
1514 # Definitely get HTML docs, for example, with some missing
1515 # support files
1516# if (!&util::fd_exists($real_filename)) {
1517# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1518# exit -1;
1519# }
1520
1521# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1522# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1523## my $utf8_filename = Encode::encode("utf8",$filename);
1524
1525 push (@{$self->{'associated_files'}},
1526 [$real_filename, $assoc_filename, $mime_type, $section]);
1527}
1528
1529# returns a list of associated files in the form
1530# [[real_filename, assoc_filename, mimetype], ...]
1531sub get_assoc_files {
1532 my $self = shift (@_);
1533
1534 return $self->{'associated_files'};
1535}
1536
1537# the following two methods used to keep track of original associated files
1538# for incremental building. eg a txt file used by an item file does not end
1539# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1540# file for incremental build
1541sub associate_source_file {
1542 my $self = shift (@_);
1543 my ($full_filename) = @_;
1544
1545 push (@{$self->{'source_assoc_files'}}, $full_filename);
1546
1547}
1548
1549sub get_source_assoc_files {
1550 my $self = shift (@_);
1551
1552 return $self->{'source_assoc_files'};
1553
1554
1555}
1556sub metadata_file {
1557 my $self = shift (@_);
1558 my ($real_filename, $filename) = @_;
1559
1560 push (@{$self->{'metadata_files'}},
1561 [$real_filename, $filename]);
1562}
1563
1564# used for writing out the archiveinf-doc info database, to list all the metadata files
1565sub get_meta_files {
1566 my $self = shift (@_);
1567
1568 return $self->{'metadata_files'};
1569}
1570
1571sub delete_section_assoc_files {
1572 my $self = shift (@_);
1573 my ($section) = @_;
1574
1575 my $i=0;
1576 while ($i < scalar (@{$self->{'associated_files'}})) {
1577 if (defined $self->{'associated_files'}->[$i]->[3] &&
1578 $self->{'associated_files'}->[$i]->[3] eq $section) {
1579 splice (@{$self->{'associated_files'}}, $i, 1);
1580 } else {
1581 $i++;
1582 }
1583 }
1584}
1585
1586sub delete_assoc_file {
1587 my $self = shift (@_);
1588 my ($assoc_filename) = @_;
1589
1590 my $i=0;
1591 while ($i < scalar (@{$self->{'associated_files'}})) {
1592 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1593 splice (@{$self->{'associated_files'}}, $i, 1);
1594 } else {
1595 $i++;
1596 }
1597 }
1598}
1599
1600sub reset_nextsection_ptr {
1601 my $self = shift (@_);
1602 my ($section) = @_;
1603
1604 my $section_ptr = $self->_lookup_section($section);
1605 $section_ptr->{'next_subsection'} = 1;
1606}
1607
16081;
Note: See TracBrowser for help on using the repository browser.