source: main/trunk/greenstone2/perllib/doc.pm@ 33416

Last change on this file since 33416 was 33416, checked in by ak19, 5 years ago

DEC collections weren't getting built on 32 bit linux VM after trying to use Math::Trig's pi the way it was imported into doc.pm for GPSMapOverlay stuff. Now changing the import statement to check that generating DEC examples collection will work again.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 50.2 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use FileUtils;
40use ghtml;
41use File::stat;
42##use hashdoc;
43use docprint;
44use JSON;
45
46# We just need pi from the Trig lib
47# Import constants pi2, pip2, pip4 (2*pi, pi/2, pi/4).
48# But doing
49# use Math::Trig ':pi';
50# resulted in errors on 32 bit linux VM with perl v8 of "bareword pi not allowed while strict subs in use"
51# So, now doing just
52use Math::Trig;
53
54# the document type may be indexed_doc, nonindexed_doc, or
55# classification
56
57our $OIDcount = 0;
58
59
60# Declare the Package (level) Variable that will tell us whether we're in the import or buildcol phase.
61# This var will be set by import.pl, buildcol.pl and export.pl to the then appropriate value.
62# https://perlmaven.com/package-variables-and-lexical-variables-in-perl
63# But: https://stackoverflow.com/questions/7542120/perl-how-to-make-variables-from-requiring-script-available-in-required-script
64# https://stackoverflow.com/questions/33230015/how-do-i-use-a-variable-in-another-perl-script
65#$doc::cmd_line_mode = undef;
66# Decided to follow the usage of $OIDcount above to declare the necessary package-level variables 'globally' accessible
67# actually accessible only for those packages that import this one (with 'use doc')
68my $cmd_line_mode = undef;
69
70# processor_mode keeps track of which buildcol pass we're at: dummy, text (sidx/didx passes) or infodb
71my $processor_mode = undef;
72
73# rename_method can be 'url', 'none', 'base64'
74sub new {
75 my $class = shift (@_);
76 my ($source_filename, $doc_type, $rename_method) = @_;
77
78
79 my $self = bless {'associated_files'=>[],
80 'subsection_order'=>[],
81 'next_subsection'=>1,
82 'subsections'=>{},
83 'metadata'=>[],
84 'text'=>"",
85 'OIDtype'=>"hash"}, $class;
86
87 # used to set lastmodified here, but this can screw up the HASH ids, so
88 # the docsave processor now calls set_lastmodified
89
90 $self->set_source_path($source_filename);
91
92 if (defined $source_filename) {
93 $source_filename = &util::filename_within_collection($source_filename);
94 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
95 $self->set_source_filename ($source_filename, $rename_method);
96 }
97
98 $self->set_doc_type ($doc_type) if defined $doc_type;
99
100 return $self;
101}
102
103
104sub set_source_path
105{
106 my $self = shift @_;
107 my ($source_filename) = @_;
108
109 if (defined $source_filename) {
110 # On Windows the source_filename can be in terse DOS format
111 # e.g. test~1.txt
112
113 $self->{'terse_source_path'} = $source_filename;
114
115 # Use the FileUtil library methods as they are aware of more special
116 # cases such as HDFS [jmt12]
117 if (&FileUtils::fileExists($source_filename))
118 {
119 # See if we can do better for Windows with a filename
120 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
121 require Win32;
122 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
123 }
124 else {
125 # For Unix-based systems, there is no difference between the two
126 $self->{'source_path'} = $source_filename;
127 }
128 }
129 else {
130 print STDERR "Warning: In doc::set_source_path(), file\n";
131 print STDERR " $source_filename\n";
132 print STDERR " does not exist\n";
133
134 # (default) Set it to whatever we were given
135 $self->{'source_path'} = $source_filename;
136 }
137 }
138 else {
139 # Previous code for setting source_path allowed for
140 # it to be undefined, so continue this practice
141 $self->{'terse_source_path'} = undef;
142 $self->{'source_path'} = undef;
143 }
144}
145
146
147sub get_source_path
148{
149 my $self = shift @_;
150
151 return $self->{'terse_source_path'};
152}
153
154# set lastmodified for OAI purposes, added by GRB, moved by kjdon
155sub set_oailastmodified {
156 my $self = shift (@_);
157
158 my $source_path = $self->{'terse_source_path'};
159
160 if (defined $source_path && (-e $source_path)) {
161 my $current_time = time;
162
163 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
164 $wday, $yday, $isdst) = localtime($current_time);
165
166 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
167
168 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
169 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
170 }
171}
172
173# no longer used for OAI purposes, since lastmodified is not what we want as the
174# Datestamp of a document. This doc metadata may be useful for general purposes.
175sub set_lastmodified {
176 my $self = shift (@_);
177
178 my $source_path = $self->{'terse_source_path'};
179
180 if (defined $source_path && (-e $source_path)) {
181
182 my $file_stat = stat($source_path);
183 my $mtime = $file_stat->mtime;
184 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
185 $wday, $yday, $isdst) = localtime($mtime);
186
187 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
188
189 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
190 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
191 }
192}
193
194# clone the $self object
195sub duplicate {
196 my $self = shift (@_);
197
198 my $newobj = {};
199
200 foreach my $k (keys %$self) {
201 $newobj->{$k} = &clone ($self->{$k});
202 }
203
204 bless $newobj, ref($self);
205 return $newobj;
206}
207
208sub clone {
209 my ($from) = @_;
210 my $type = ref ($from);
211
212 if ($type eq "HASH") {
213 my $to = {};
214 foreach my $key (keys %$from) {
215 $to->{$key} = &clone ($from->{$key});
216 }
217 return $to;
218 } elsif ($type eq "ARRAY") {
219 my $to = [];
220 foreach my $v (@$from) {
221 push (@$to, &clone ($v));
222 }
223 return $to;
224 } else {
225 return $from;
226 }
227}
228
229sub set_OIDtype {
230 my $self = shift (@_);
231 my ($type, $metadata) = @_;
232
233 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
234 $self->{'OIDtype'} = $type;
235 } else {
236 $self->{'OIDtype'} = "hash";
237 }
238
239 if ($type =~ /^assigned$/) {
240 if (defined $metadata) {
241 $self->{'OIDmetadata'} = $metadata;
242 } else {
243 $self->{'OIDmetadata'} = "dc.Identifier";
244 }
245 }
246}
247
248# rename_method can be 'url', 'none', 'base64'
249sub set_source_filename {
250 my $self = shift (@_);
251 my ($source_filename, $rename_method) = @_;
252
253 # Since the gsdlsourcefilename element goes into the doc.xml it has
254 # to be utf8. However, it should also *represent* the source filename
255 # (in the import directory) which may not be utf8 at all.
256 # For instance, if this meta element (gsdlsourcefilename) will be used
257 # by other applications that parse doc.xml in order to locate
258 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
259 # encode the real filename as this is a binary-to-text encoding meaning
260 # that the resulting string is ASCII (utf8). Decoding will give the original.
261
262# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
263
264 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
265 $source_filename = $self->encode_filename($source_filename, $rename_method);
266# my ($srcfilename,$dirname,$suffix)
267# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
268# print STDERR "-> $srcfilename -> ";
269# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
270# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
271# print STDERR "$source_filename\n";
272
273 $self->set_utf8_metadata_element ($self->get_top_section(),
274 "gsdlsourcefilename",
275 $source_filename);
276}
277
278sub encode_filename {
279 my $self = shift (@_);
280 my ($source_filename, $rename_method) = @_;
281
282 my ($srcfilename,$dirname,$suffix)
283 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
284# print STDERR "-> $srcfilename -> ";
285 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
286 $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
287
288 return $source_filename;
289}
290
291sub set_converted_filename {
292 my $self = shift (@_);
293 my ($converted_filename) = @_;
294
295 # we know the converted filename is utf8
296 $self->set_utf8_metadata_element ($self->get_top_section(),
297 "gsdlconvertedfilename",
298 $converted_filename);
299}
300
301# returns the source_filename as it was provided
302sub get_unmodified_source_filename {
303 my $self = shift (@_);
304
305 return $self->{'terse_source_path'};
306}
307
308# returns the source_filename with whatever rename_method was given
309sub get_source_filename {
310 my $self = shift (@_);
311
312 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
313}
314
315
316
317# returns converted filename if available else returns source filename
318sub get_filename_for_hashing {
319 my $self = shift (@_);
320
321 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
322
323 if (!defined $filename) {
324 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
325 # if NULPlug processed file, then don't give a filename
326 if (defined $plugin_name && $plugin_name eq "NULPlug") {
327 $filename = undef;
328 } else { # returns the URL encoded source filename!
329 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
330 }
331 }
332
333 if (!&FileUtils::isFilenameAbsolute($filename)) {
334 $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
335 }
336
337 return $filename;
338}
339
340sub set_doc_type {
341 my $self = shift (@_);
342 my ($doc_type) = @_;
343
344 $self->set_metadata_element ($self->get_top_section(),
345 "gsdldoctype",
346 $doc_type);
347}
348
349# returns the gsdldoctype as it was provided
350# the default of "indexed_doc" is used if no document
351# type was provided
352sub get_doc_type {
353 my $self = shift (@_);
354
355 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
356 return $doc_type if (defined $doc_type);
357 return "indexed_doc";
358}
359
360
361# look up the reference to the a particular section
362sub _lookup_section {
363 my $self = shift (@_);
364 my ($section) = @_;
365
366 my ($num);
367 my $sectionref = $self;
368
369 while (defined $section && $section ne "") {
370
371 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
372
373 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
374
375 $section = "" unless defined $section;
376
377
378 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
379 $sectionref = $sectionref->{'subsections'}->{$num};
380 } else {
381 return undef;
382 }
383 }
384
385 return $sectionref;
386}
387
388# calculate OID by hashing the contents of the document
389sub _calc_OID {
390 my $self = shift (@_);
391 my ($filename) = @_;
392
393
394 my $osexe = &util::get_os_exe();
395
396 my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
397 $ENV{'GSDLOS'},"hashfile$osexe");
398
399 &util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
400 # The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
401
402 # A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
403 # (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
404 # E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
405
406 my $result = "NULL";
407
408
409 if (-e "$hashfile_exe") {
410# $result = `\"$hashfile_exe\" \"$filename\"`;
411# $result = `hashfile$osexe \"$filename\" 2>&1`;
412 $result = `hashfile$osexe \"$filename\"`;
413
414 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
415 } else {
416 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
417 }
418 return "HASH$result";
419}
420
421# methods dealing with OID, not groups of them.
422
423# if $OID is not provided one is calculated
424sub set_OID {
425 my $self = shift (@_);
426 my ($OID) = @_;
427
428 my $use_hash_oid = 0;
429 # if an OID wasn't provided calculate one
430 if (!defined $OID) {
431 $OID = "NULL";
432 if ($self->{'OIDtype'} =~ /^hash/) {
433 $use_hash_oid = 1;
434 } elsif ($self->{'OIDtype'} eq "incremental") {
435 $OID = "D" . $OIDcount;
436 $OIDcount ++;
437 } elsif ($self->{'OIDtype'} eq "filename") {
438 my $filename = $self->get_source_filename();
439 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
440 $OID = &util::tidy_up_oid($OID);
441 } elsif ($self->{'OIDtype'} eq "full_filename") {
442 my $source_filename = $self->get_source_filename();
443 my $dirsep = &util::get_os_dirsep();
444
445 $source_filename =~ s/^import$dirsep//;
446 $source_filename =~ s/$dirsep/-/g;
447 $source_filename =~ s/\./_/g;
448
449 $OID = $source_filename;
450 $OID = &util::tidy_up_oid($OID);
451 } elsif ($self->{'OIDtype'} eq "dirname") {
452 my $filename = $self->get_source_filename();
453 if (defined($filename)) { # && -e $filename) {
454 # get the immediate parent directory
455 $OID = &File::Basename::dirname($filename);
456 if (defined $OID) {
457 $OID = &File::Basename::basename($OID);
458 $OID = &util::tidy_up_oid($OID);
459 } else {
460 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
461 $use_hash_oid = 1;
462 }
463 } else {
464 print STDERR "Failed to find a filename, generating hash id\n";
465 $use_hash_oid = 1;
466 }
467
468 } elsif ($self->{'OIDtype'} eq "assigned") {
469 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
470 if (defined $identifier && $identifier ne "") {
471 $OID = $identifier;
472 $OID = &util::tidy_up_oid($OID);
473 } else {
474 # need a hash id
475 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
476 $use_hash_oid = 1;
477 }
478
479 } else {
480 $use_hash_oid = 1;
481 }
482
483 if ($use_hash_oid) {
484 my $hash_on_file = 1;
485 my $hash_on_ga_xml = 0;
486
487 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
488 $hash_on_file = 0;
489 $hash_on_ga_xml = 1;
490 }
491
492 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
493 $hash_on_file = 0;
494 $hash_on_ga_xml = 0;
495
496 my $source_filename = $self->get_source_filename();
497 my $dirsep = &util::get_os_dirsep();
498
499 $source_filename =~ s/^import$dirsep//;
500 $source_filename =~ s/$dirsep/-/g;
501 $source_filename =~ s/\./_/g;
502
503 # If the filename is very short then (handled naively)
504 # this can cause conjestion in the hash-values
505 # computed, leading documents sharing the same leading
506 # Hex values in the computed has.
507 #
508 # The solution taken here is to replace the name of
509 # the file name a sufficient number of times (up to
510 # the character limit defined in 'rep_limit' and
511 # make that the content that is hashed on
512
513 # *** Think twice before changing the following value
514 # as it will break backward compatability of computed
515 # document HASH values
516
517 my $rep_limit = 256;
518 my $hash_content = undef;
519
520 if (length($source_filename)<$rep_limit) {
521 my $rep_string = "$source_filename|";
522 my $rs_len = length($rep_string);
523
524 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
525
526 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
527 }
528 else {
529 $hash_content = $source_filename;
530 }
531
532 my $filename = &util::get_tmp_filename();
533 if (!open (OUTFILE, ">:utf8", $filename)) {
534 print STDERR "doc::set_OID could not write to $filename\n";
535 } else {
536 print OUTFILE $hash_content;
537 close (OUTFILE);
538 }
539 $OID = $self->_calc_OID ($filename);
540
541 &FileUtils::removeFiles ($filename);
542 }
543
544 if ($hash_on_file) {
545 # "hash" OID - feed file to hashfile.exe
546 my $filename = $self->get_filename_for_hashing();
547
548 # -z: don't want to hash on the file if it is zero size
549 if (defined($filename) && -e $filename && !-z $filename) {
550 $OID = $self->_calc_OID ($filename);
551 } else {
552 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
553 }
554 }
555
556 if ($hash_on_ga_xml) {
557 # In addition being asked to explicity calculate the has based on the GA file,
558 # can also end up coming into this block is doing 'hash_on_file' but the file
559 # itself is of zero bytes (as could be the case with 'doc.nul' file
560
561 my $filename = &util::get_tmp_filename();
562 if (!open (OUTFILE, ">:utf8", $filename)) {
563 print STDERR "doc::set_OID could not write to $filename\n";
564 } else {
565 my $doc_text = &docprint::get_section_xml($self);
566 print OUTFILE $doc_text;
567 close (OUTFILE);
568 }
569 $OID = $self->_calc_OID ($filename);
570 &FileUtils::removeFiles($filename);
571 }
572 }
573 }
574 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
575}
576
577# this uses hashdoc (embedded c thingy) which is faster but still
578# needs a little work to be sufficiently stable
579sub ___set_OID {
580 my $self = shift (@_);
581 my ($OID) = @_;
582
583 # if an OID wasn't provided then calculate hash value based on document
584 if (!defined $OID)
585 {
586 my $hash_text = &docprint::get_section_xml($self);
587 my $hash_len = length($hash_text);
588
589 $OID = &hashdoc::buffer($hash_text,$hash_len);
590 }
591
592 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
593}
594
595# returns the OID for this document
596sub get_OID {
597 my $self = shift (@_);
598 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
599 return $OID if (defined $OID);
600 return "NULL";
601}
602
603sub delete_OID {
604 my $self = shift (@_);
605
606 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
607}
608
609
610# methods for manipulating section names
611
612# returns the name of the top-most section (the top
613# level of the document
614sub get_top_section {
615 my $self = shift (@_);
616
617 return "";
618}
619
620# returns a section
621sub get_parent_section {
622 my $self = shift (@_);
623 my ($section) = @_;
624
625 $section =~ s/(^|\.)\d+$//;
626
627 return $section;
628}
629
630# returns the first child section (or the end child
631# if there isn't any)
632sub get_begin_child {
633 my $self = shift (@_);
634 my ($section) = @_;
635
636 my $section_ptr = $self->_lookup_section($section);
637 return "" unless defined $section_ptr;
638
639 if (defined $section_ptr->{'subsection_order'}->[0]) {
640 return "$section.$section_ptr->{'subsection_order'}->[0]";
641 }
642
643 return $self->get_end_child ($section);
644}
645
646# returns the next child of a parent section
647sub get_next_child {
648 my $self = shift (@_);
649 my ($section) = @_;
650
651 my $parent_section = $self->get_parent_section($section);
652 my $parent_section_ptr = $self->_lookup_section($parent_section);
653 return undef unless defined $parent_section_ptr;
654
655 my ($section_num) = $section =~ /(\d+)$/;
656 return undef unless defined $section_num;
657
658 my $i = 0;
659 my $section_order = $parent_section_ptr->{'subsection_order'};
660 while ($i < scalar(@$section_order)) {
661 last if $section_order->[$i] eq $section_num;
662 $i++;
663 }
664
665 $i++; # the next child
666 if ($i < scalar(@$section_order)) {
667 return $section_order->[$i] if $parent_section eq "";
668 return "$parent_section.$section_order->[$i]";
669 }
670
671 # no more sections in this level
672 return undef;
673}
674
675# returns a reference to a list of children
676sub get_children {
677 my $self = shift (@_);
678 my ($section) = @_;
679
680 my $section_ptr = $self->_lookup_section($section);
681 return [] unless defined $section_ptr;
682
683 my @children = @{$section_ptr->{'subsection_order'}};
684
685 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
686 return \@children;
687}
688
689# returns the child section one past the last one (which
690# is coded as "0")
691sub get_end_child {
692 my $self = shift (@_);
693 my ($section) = @_;
694
695 return $section . ".0" unless $section eq "";
696 return "0";
697}
698
699# returns the next section in book order
700sub get_next_section {
701 my $self = shift (@_);
702 my ($section) = @_;
703
704 return undef unless defined $section;
705
706 my $section_ptr = $self->_lookup_section($section);
707 return undef unless defined $section_ptr;
708
709 # first try to find first child
710 if (defined $section_ptr->{'subsection_order'}->[0]) {
711 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
712 return "$section.$section_ptr->{'subsection_order'}->[0]";
713 }
714
715 do {
716 # try to find sibling
717 my $next_child = $self->get_next_child ($section);
718 return $next_child if (defined $next_child);
719
720 # move up one level
721 $section = $self->get_parent_section ($section);
722 } while $section =~ /\d/;
723
724 return undef;
725}
726
727sub is_leaf_section {
728 my $self = shift (@_);
729 my ($section) = @_;
730
731 my $section_ptr = $self->_lookup_section($section);
732 return 1 unless defined $section_ptr;
733
734 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
735}
736
737# methods for dealing with sections
738
739# returns the name of the inserted section
740sub insert_section {
741 my $self = shift (@_);
742 my ($before_section) = @_;
743
744 # get the child to insert before and its parent section
745 my $parent_section = "";
746 my $before_child = "0";
747 my @before_section = split (/\./, $before_section);
748 if (scalar(@before_section) > 0) {
749 $before_child = pop (@before_section);
750 $parent_section = join (".", @before_section);
751 }
752
753 my $parent_section_ptr = $self->_lookup_section($parent_section);
754 if (!defined $parent_section_ptr) {
755 print STDERR "doc::insert_section couldn't find parent section " .
756 "$parent_section\n";
757 return;
758 }
759
760 # get the next section number
761 my $section_num = $parent_section_ptr->{'next_subsection'}++;
762
763 my $i = 0;
764 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
765 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
766 $i++;
767 }
768
769 # insert the section number into the order list
770 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
771
772 # add this section to the parent section
773 my $section_ptr = {'subsection_order'=>[],
774 'next_subsection'=>1,
775 'subsections'=>{},
776 'metadata'=>[],
777 'text'=>""};
778 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
779
780 # work out the full section number
781 my $section = $parent_section;
782 $section .= "." unless $section eq "";
783 $section .= $section_num;
784
785 return $section;
786}
787
788# creates a pre-named section
789sub create_named_section {
790 my $self = shift (@_);
791 my ($mastersection) = @_;
792
793 my ($num);
794 my $section = $mastersection;
795 my $sectionref = $self;
796
797 while ($section ne "") {
798 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
799 $num =~ s/^0+(\d)/$1/; # remove leading 0s
800 $section = "" unless defined $section;
801
802 if (defined $num) {
803 if (!defined $sectionref->{'subsections'}->{$num}) {
804 push (@{$sectionref->{'subsection_order'}}, $num);
805 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
806 'next_subsection'=>1,
807 'subsections'=>{},
808 'metadata'=>[],
809 'text'=>""};
810 if ($num >= $sectionref->{'next_subsection'}) {
811 $sectionref->{'next_subsection'} = $num + 1;
812 }
813 }
814 $sectionref = $sectionref->{'subsections'}->{$num};
815
816 } else {
817 print STDERR "doc::create_named_section couldn't create section ";
818 print STDERR "$mastersection\n";
819 last;
820 }
821 }
822}
823
824# returns a reference to a list of subsections
825sub list_subsections {
826 my $self = shift (@_);
827 my ($section) = @_;
828
829 my $section_ptr = $self->_lookup_section ($section);
830 if (!defined $section_ptr) {
831 print STDERR "doc::list_subsections couldn't find section $section\n";
832 return [];
833 }
834
835 return [@{$section_ptr->{'subsection_order'}}];
836}
837
838sub delete_section {
839 my $self = shift (@_);
840 my ($section) = @_;
841
842# my $section_ptr = {'subsection_order'=>[],
843# 'next_subsection'=>1,
844# 'subsections'=>{},
845# 'metadata'=>[],
846# 'text'=>""};
847
848 # if this is the top section reset everything
849 if ($section eq "") {
850 $self->{'subsection_order'} = [];
851 $self->{'subsections'} = {};
852 $self->{'metadata'} = [];
853 $self->{'text'} = "";
854 return;
855 }
856
857 # find the parent of the section to delete
858 my $parent_section = "";
859 my $child = "0";
860 my @section = split (/\./, $section);
861 if (scalar(@section) > 0) {
862 $child = pop (@section);
863 $parent_section = join (".", @section);
864 }
865
866 my $parent_section_ptr = $self->_lookup_section($parent_section);
867 if (!defined $parent_section_ptr) {
868 print STDERR "doc::delete_section couldn't find parent section " .
869 "$parent_section\n";
870 return;
871 }
872
873 # remove this section from the subsection_order list
874 my $i = 0;
875 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
876 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
877 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
878 last;
879 }
880 $i++;
881 }
882
883 # remove this section from the subsection hash
884 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
885 undef $parent_section_ptr->{'subsections'}->{$child};
886 }
887}
888
889#--
890# methods for dealing with metadata
891
892# set_metadata_element and get_metadata_element are for metadata
893# which should only have one value. add_meta_data and get_metadata
894# are for metadata which can have more than one value.
895
896# returns the first metadata value which matches field
897
898# This version of get metadata element works much like the one above,
899# except it allows for the namespace portion of a metadata element to
900# be ignored, thus if you are searching for dc.Title, the first piece
901# of matching metadata ending with the name Title (once any namespace
902# is removed) would be returned.
903# 28-11-2003 John Thompson
904sub get_metadata_element {
905 my $self = shift (@_);
906 my ($section, $field, $ignore_namespace) = @_;
907 my ($data);
908
909 $ignore_namespace = 0 unless defined $ignore_namespace;
910
911 my $section_ptr = $self->_lookup_section($section);
912 if (!defined $section_ptr) {
913 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
914 return;
915 }
916
917 # Remove any namespace if we are being told to ignore them
918 if($ignore_namespace) {
919 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
920 }
921
922 foreach $data (@{$section_ptr->{'metadata'}}) {
923
924 my $data_name = $data->[0];
925
926 # Remove any namespace if we are being told to ignore them
927 if($ignore_namespace) {
928 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
929 }
930 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
931 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
932
933 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
934 }
935
936 return undef; # was not found
937}
938
939# returns a list of the form [value1, value2, ...]
940sub get_metadata {
941 my $self = shift (@_);
942 my ($section, $field, $ignore_namespace) = @_;
943 my ($data);
944
945 $ignore_namespace = 0 unless defined $ignore_namespace;
946
947 my $section_ptr = $self->_lookup_section($section);
948 if (!defined $section_ptr) {
949 print STDERR "doc::get_metadata couldn't find section ",
950 $section, "\n";
951 return;
952 }
953
954 # Remove any namespace if we are being told to ignore them
955 if($ignore_namespace) {
956 $field =~ s/^.*\.//;
957 }
958
959 my @metadata = ();
960 foreach $data (@{$section_ptr->{'metadata'}}) {
961
962 my $data_name = $data->[0];
963
964 # Remove any namespace if we are being told to ignore them
965 if($ignore_namespace) {
966 $data_name =~ s/^.*\.//;
967 }
968 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
969 $data_name =~ s/^ex\.([^.]+)$/$1/;
970
971 push (@metadata, $data->[1]) if ($data_name eq $field);
972 }
973
974 return \@metadata;
975}
976
977sub get_metadata_hashmap {
978 my $self = shift (@_);
979 my ($section, $opt_namespace) = @_;
980
981 my $section_ptr = $self->_lookup_section($section);
982 if (!defined $section_ptr) {
983 print STDERR "doc::get_metadata couldn't find section ",
984 $section, "\n";
985 return;
986 }
987
988 my $metadata_hashmap = {};
989 foreach my $data (@{$section_ptr->{'metadata'}}) {
990 my $metaname = $data->[0];
991
992 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
993 if (!defined $metadata_hashmap->{$metaname}) {
994 $metadata_hashmap->{$metaname} = [];
995 }
996 my $metaval_list = $metadata_hashmap->{$metaname};
997 push(@$metaval_list, $data->[1]);
998 }
999 }
1000
1001 return $metadata_hashmap;
1002}
1003
1004# returns a list of the form [[field,value],[field,value],...]
1005sub get_all_metadata {
1006 my $self = shift (@_);
1007 my ($section) = @_;
1008
1009 my $section_ptr = $self->_lookup_section($section);
1010 if (!defined $section_ptr) {
1011 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
1012 return;
1013 }
1014
1015 return $section_ptr->{'metadata'};
1016}
1017
1018# $value is optional
1019sub delete_metadata {
1020 my $self = shift (@_);
1021 my ($section, $field, $value) = @_;
1022
1023 my $section_ptr = $self->_lookup_section($section);
1024 if (!defined $section_ptr) {
1025 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
1026 return;
1027 }
1028
1029 my $i = 0;
1030 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1031 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1032 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1033 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1034 } else {
1035 $i++;
1036 }
1037 }
1038}
1039
1040sub delete_all_metadata {
1041 my $self = shift (@_);
1042 my ($section) = @_;
1043
1044 my $section_ptr = $self->_lookup_section($section);
1045 if (!defined $section_ptr) {
1046 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1047 return;
1048 }
1049
1050 $section_ptr->{'metadata'} = [];
1051}
1052
1053sub set_metadata_element {
1054 my $self = shift (@_);
1055 my ($section, $field, $value) = @_;
1056
1057 $self->set_utf8_metadata_element ($section, $field,
1058 &unicode::ascii2utf8(\$value));
1059}
1060
1061# set_utf8_metadata_element assumes the text has already been
1062# converted to the UTF-8 encoding.
1063sub set_utf8_metadata_element {
1064 my $self = shift (@_);
1065 my ($section, $field, $value) = @_;
1066
1067 $self->delete_metadata ($section, $field);
1068 $self->add_utf8_metadata ($section, $field, $value);
1069}
1070
1071
1072# add_metadata assumes the text is in (extended) ascii form. For
1073# text which has already been converted to the UTF-8 format use
1074# add_utf8_metadata.
1075sub add_metadata {
1076 my $self = shift (@_);
1077 my ($section, $field, $value) = @_;
1078
1079 $self->add_utf8_metadata ($section, $field,
1080 &unicode::ascii2utf8(\$value));
1081}
1082
1083sub add_utf8_metadata {
1084 my $self = shift (@_);
1085 my ($section, $field, $value) = @_;
1086
1087 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1088 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1089 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1090
1091 my $section_ptr = $self->_lookup_section($section);
1092 if (!defined $section_ptr) {
1093 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1094 return;
1095 }
1096 if (!defined $value) {
1097 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1098 return;
1099 }
1100 if (!defined $field) {
1101 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1102 return;
1103 }
1104
1105 #print STDERR "###$field=$value\n";
1106
1107 # For now, supress this check. Given that text data read in is now
1108 # Unicode aware, then the following block of code can (ironically enough)
1109 # cause our unicode compliant string to be re-encoded (leading to
1110 # a double-encoded UTF-8 string, which we definitely don't want!).
1111
1112
1113 # double check that the value is utf-8
1114 # if (!&unicode::check_is_utf8($value)) {
1115 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1116 # &unicode::ensure_utf8(\$value);
1117 # print STDERR " Tried converting to utf8: $value\n";
1118 # }
1119
1120 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1121 if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1122 {
1123 my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1124 if (defined $mdprefix) {
1125 # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1126 push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1127 }
1128
1129 my $direction;
1130 if($value =~ m/^-/)
1131 {
1132 $direction = ($field eq "Latitude") ? "S" : "W";
1133 }
1134 else
1135 {
1136 $direction = ($field eq "Latitude") ? "N" : "E";
1137 }
1138
1139 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1140 if(defined $beforeDec && defined $afterDec)
1141 {
1142 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1143 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1144
1145 for(my $i = 2; $i <= 4; $i++)
1146 {
1147 if(length($afterDec) >= $i)
1148 {
1149 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1150 }
1151 }
1152
1153 #Only add the metadata if it has not already been added
1154 my $metaMap = $self->get_metadata_hashmap($section); # metaMap not used and called function has no apparent side-effects,
1155 # but this line appears important for ensuring uniqueness of (Latitude, value) meta
1156 # in the section. Also for LatShort, Longitude, LngShort.
1157 }
1158
1159 if($field =~ m/^(.+\.)?Longitude$/) {
1160 # if we are dealing with Longitude meta, we should 1. have Latitude meta too; 2. already have processed Latitude meta
1161 # in that case, add both Lat and Lng of this section as a Coordinate meta
1162 my $latitude = $self->get_metadata_element ($section, "Latitude");
1163 my $longitude = $value;
1164 # TODO: would like all Longitude meta placed together, followed by all Coordinate meta,
1165 # but the following will add all such coord meta to the doc object and the end of this function will add Longitude meta
1166 $self->processCoordinate($section, $latitude, $longitude);
1167 }
1168 }
1169
1170 elsif($field eq "GPS.mapOverlay") { # then the $value is a JSON string
1171
1172 # In order to allow searching map-data-enriched documents by map shape descriptions,
1173 # and to run rawquery searches for other docs by proximity based on their map data,
1174 # need to store the shape descriptions and Coordinate info for shapes into the text index.
1175 # We add the description for each shape in the mapoverlay into the text index as GPSMapOverlayLabel
1176 # And we add Coordinate (CD) and CoordShort (CS) info for each shape in the mapoverlay in the format (Lat, Lng) as ("37S339 175E342")
1177 # where the digits before the N/S/E/W direction represents the whole number. And the digits after the direction are the
1178 # decimal places which can vary between 0 and 2 to 4 digits.
1179
1180 # However, we only want to process GPS.mapOverlay during buildcol and only in the text indexing passes (e.g. sidx and didx for lucene)
1181 # and certainly never during the infodb pass of buildcol. The latter can end up duplicating Coordinate/CoordShort/GPSMapOverlayLabel
1182 # when rebuilding with the online doc editor, as that runs incremental-rebuild which then calls basebuilder::reconstruct_doc_objs_metadata()
1183 # on all docs NOT being incrementally rebuilt. That call would get meta from the infodb and use it to reconstruct the doc objects of docs
1184 # NOT being incrementally built. If the Coord and Label meta were written to the infodb, they would then be loaded back in memory when the
1185 # collection is incrementally rebuilt for those docs that don't need incremental processing. Then this function would once again add the
1186 # same meta into the infodb, thus duplicating what goes into the infodb. Hence, don't do all the following if doc::processor_mode =~ "infodb",
1187 # or anything other than a text mode.
1188
1189 # Note that for incremental rebuilding, the text pass can be called textreindex for instance (and infodb pass can be incinfodb).
1190 # So don't check for exact string match
1191
1192 if($doc::cmd_line_mode eq "buildcol" && $doc::processor_mode =~ m/^text/) { # currently known text processor_modes:
1193 # text, textreindex, possibly textdelete (see ArchivesInfPlugin.pm::read() for last 2).
1194 # OR: ..."buildcol" && $doc::processor_mode !~ m/infodb$/) # if dummy pass important
1195
1196 ###print STDERR "GPS.mapOverlay before decoding, val = " . $value . "\n";
1197
1198 # TODO: html decode?
1199 $value =~ s@&#091;@[@g;
1200 $value =~ s@&#093;@]@g;
1201 $value =~ s@&quot;@"@g;
1202 ###print STDERR "GPS.mapOverlay after decoding, val = " . $value . "\n";
1203
1204 my $json_array = decode_json $value;
1205
1206 foreach my $shape (@$json_array) {
1207
1208 # Put each available shape description/label into this section's metadata with GPSMapOverlayLabel as metaname.
1209 # Just as for Coordinate meta, don't need to know which shape a label belongs too. This is just so each label
1210 # will be indexed, and therefore can be searched.
1211
1212 my $description = $shape->{"description"};
1213 if($description) {
1214 push (@{$section_ptr->{'metadata'}}, ["GPSMapOverlayLabel", $description]);
1215 ###print STDERR "@@@@############################################ Just added description meta: " . $description . "\n";
1216 }
1217
1218 my $type = $shape->{"type"};
1219 ###print STDERR "Shape type : " . $type . "\n";
1220
1221 if($type eq "circle") {
1222 ###print STDERR "Found a circle:\n" . &printShape($json, $shape);
1223
1224 # work out bounding box
1225 # want the inverse of this useful page:
1226 # https://stackoverflow.com/questions/639695/how-to-convert-latitude-or-longitude-to-meters
1227 # https://www.geodatasource.com/developers/javascript
1228
1229 # Dr Bainbridge wants us to use: https://gis.stackexchange.com/questions/5821/calculating-latitude-longitude-x-miles-from-point
1230 # Since for N,E,S,W the following works out the same as above link, we're using the rule of thumb in the following:
1231 # https://gis.stackexchange.com/questions/2951/algorithm-for-offsetting-a-latitude-longitude-by-some-amount-of-meters
1232 # which states
1233 # "If your displacements aren't too great (less than a few kilometers) and you're not right at the poles,
1234 # use the quick and dirty estimate that 111,111 meters (111.111 km) in the y direction is 1 degree
1235 # (of latitude) and 111,111 * cos(latitude) meters in the x direction is 1 degree (of longitude)."
1236 my $centre_lat = $shape->{"center"}->{"lat"};
1237 my $centre_lng = $shape->{"center"}->{"lng"};
1238 my $radius = $shape->{"radius"}; # in metres!
1239
1240 ###print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
1241
1242 my $lat_north = $centre_lat + ($radius/111111);
1243 my $lat_south = $centre_lat - ($radius/111111);
1244
1245 ###print STDERR "### lat_north: $lat_north\n";
1246 ###print STDERR "### lat_south: $lat_south\n";
1247
1248 # our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
1249 # all expect their angle to be in radians. So need to convert from degree to radians before we can take the cos of it.
1250 my $centre_lat_radians = $self->degreesToRadians($centre_lat);
1251 my $cos_in_radians = cos($centre_lat_radians);
1252 ###print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
1253 my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
1254 my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
1255 ###print STDERR "### lng_east $lng_east\n";
1256 ###print STDERR "### lng_west $lng_west\n";
1257
1258 my $cos_lat = cos($centre_lat);
1259 ###print STDERR "cos $centre_lat is $cos_lat\n";
1260
1261 $self->processCoordinate($section, $lat_north, $lng_east);
1262 $self->processCoordinate($section, $lat_south, $lng_east);
1263 $self->processCoordinate($section, $lat_south, $lng_west);
1264 $self->processCoordinate($section, $lat_north, $lng_west);
1265
1266 }
1267 elsif ($type eq "marker") {
1268 ###print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
1269 ###print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
1270 $self->processCoordinate($section, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});
1271 }
1272 elsif ($type eq "polyline" || $type eq "polygon") {
1273 my $path_array = $shape->{"path"};
1274 foreach my $position (@$path_array) {
1275 $self->processCoordinate($section, $position->{"lat"}, $position->{"lng"});
1276 }
1277 }
1278 elsif ($type eq "rectangle") {
1279
1280 my $bounds = $shape->{"bounds"};
1281 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"east"});
1282 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"east"});
1283 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"west"});
1284 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"west"});
1285 }
1286
1287 } # end for on each shape in GPS.mapOverlay
1288 } # end if(buildcol and text pass)
1289 } # end GPS.mapOverlay meta
1290
1291 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1292}
1293
1294# https://en.wikipedia.org/wiki/Radian
1295sub degreesToRadians
1296{
1297 my $self = shift (@_);
1298 my ($degrees) = @_;
1299
1300 return $degrees * pi /180; # returns radians
1301}
1302
1303sub radiansToDegrees
1304{
1305 my $self = shift (@_);
1306 my ($radians) = @_;
1307
1308 return $radians * 180 / pi; # returns degrees
1309}
1310
1311# Call as:
1312# my $json = JSON->new->allow_nonref;
1313# &printAllShapes($json, $json_array);
1314sub printAllShapes {
1315 my ($json, $json_array) = @_;
1316
1317
1318 #my $pretty_print_shape = $json->pretty->encode( $json_array->[0] );
1319 foreach my $shape (@$json_array) {
1320 my $pretty_print_shape = $json->pretty->encode( $shape );
1321 print STDERR "Shape: $pretty_print_shape\n";
1322 }
1323
1324}
1325
1326# For the (lat, lng) coordinate given,
1327# attaches Coordinate and multiple CoordShort (different precision level) metadata to the doc object
1328sub processCoordinate {
1329 my $self = shift (@_);
1330 my ($section, $latitude, $longitude) = @_;
1331
1332 my $section_ptr = $self->_lookup_section($section);
1333
1334 my $lat_direction = ($latitude =~ m/^-/) ? "S" : "N";
1335 my $lng_direction = ($longitude =~ m/^-/) ? "W" : "E";
1336
1337 # have to store (lat, lng) in pairs, when there are so many coords to store
1338 #push (@{$section_ptr->{'metadata'}}, ["Latitude", $latitude]);
1339 #push (@{$section_ptr->{'metadata'}}, ["Longitude", $longitude]);
1340
1341 push (@{$section_ptr->{'metadata'}}, ["Coordinate", "$latitude $longitude"]); # "$latitude$lat_direction $longitude$lng_direction"
1342
1343 my ($latBeforeDec, $latAfterDec);
1344 my ($lngBeforeDec, $lngAfterDec);
1345
1346 if($latitude !~ m/\./) {
1347 $latBeforeDec = $latitude;
1348 $latAfterDec = "";
1349 } else {
1350 ($latBeforeDec, $latAfterDec) = ($latitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1351 }
1352 if($longitude !~ m/\./) {
1353 $lngBeforeDec = $longitude;
1354 $lngAfterDec = "";
1355 } else {
1356 ($lngBeforeDec, $lngAfterDec) = ($longitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1357 }
1358
1359
1360 my $name = "CoordShort";
1361 push (@{$section_ptr->{'metadata'}}, [$name, "$latBeforeDec$lat_direction $lngBeforeDec$lng_direction"]);
1362
1363 for(my $i = 2; $i <= 4; $i++)
1364 {
1365 my $latDecPlaces = (length($latAfterDec) >= $i) ? substr($latAfterDec, 0, $i) : "";
1366 my $lngDecPlaces = (length($lngAfterDec) >= $i) ? substr($lngAfterDec, 0, $i) : "";
1367
1368 push (@{$section_ptr->{'metadata'}}, [$name,
1369 $latBeforeDec . $lat_direction. $latDecPlaces . " " . $lngBeforeDec . $lng_direction. $lngDecPlaces]);
1370
1371 }
1372
1373 #Only add the metadata if it has not already been added
1374 #my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no apparent side-effects.)
1375}
1376
1377
1378# methods for dealing with text
1379
1380# returns the text for a section
1381sub get_text {
1382 my $self = shift (@_);
1383 my ($section) = @_;
1384
1385 my $section_ptr = $self->_lookup_section($section);
1386 if (!defined $section_ptr) {
1387 print STDERR "doc::get_text couldn't find section " .
1388 "$section\n";
1389 return "";
1390 }
1391
1392 return $section_ptr->{'text'};
1393}
1394
1395# returns the (utf-8 encoded) length of the text for a section
1396sub get_text_length {
1397 my $self = shift (@_);
1398 my ($section) = @_;
1399
1400 my $section_ptr = $self->_lookup_section($section);
1401 if (!defined $section_ptr) {
1402 print STDERR "doc::get_text_length couldn't find section " .
1403 "$section\n";
1404 return 0;
1405 }
1406
1407 return length ($section_ptr->{'text'});
1408}
1409
1410# returns the total length for all the sections
1411sub get_total_text_length {
1412 my $self = shift (@_);
1413
1414 my $section = $self->get_top_section();
1415 my $length = 0;
1416 while (defined $section) {
1417 $length += $self->get_text_length($section);
1418 $section = $self->get_next_section($section);
1419 }
1420 return $length;
1421}
1422
1423sub delete_text {
1424 my $self = shift (@_);
1425 my ($section) = @_;
1426
1427 my $section_ptr = $self->_lookup_section($section);
1428 if (!defined $section_ptr) {
1429 print STDERR "doc::delete_text couldn't find section " .
1430 "$section\n";
1431 return;
1432 }
1433
1434 $section_ptr->{'text'} = "";
1435}
1436
1437# add_text assumes the text is in (extended) ascii form. For
1438# text which has been already converted to the UTF-8 format
1439# use add_utf8_text.
1440sub add_text {
1441 my $self = shift (@_);
1442 my ($section, $text) = @_;
1443
1444 # convert the text to UTF-8 encoded unicode characters
1445 # and add the text
1446 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1447}
1448
1449
1450# add_utf8_text assumes the text to be added has already
1451# been converted to the UTF-8 encoding. For ascii text use
1452# add_text
1453# Pass by value version (internally calls pass by ref version
1454# to avoid code duplication)
1455sub add_utf8_text {
1456 my $self = shift (@_);
1457 my ($section, $text) = @_;
1458
1459 $self->add_utf8_textref($section, \$text);
1460}
1461
1462# Pass by reference version, used by GreenstoneSQLPlugin for fulltext
1463sub add_utf8_textref {
1464 my $self = shift (@_);
1465 my ($section, $text_ref) = @_;
1466
1467 my $section_ptr = $self->_lookup_section($section);
1468 if (!defined $section_ptr) {
1469 print STDERR "doc::add_utf8_textref couldn't find section " .
1470 "$section\n";
1471 return;
1472 }
1473
1474 $section_ptr->{'text'} .= $$text_ref;
1475}
1476
1477# returns the Source meta, which is the utf8 filename generated.
1478# Added a separate method here for convenience
1479sub get_source {
1480 my $self = shift (@_);
1481 return $self->get_metadata_element ($self->get_top_section(), "Source");
1482}
1483
1484# returns the SourceFile meta, which is the url reference to the URL-encoded
1485# version of Source (the utf8 filename). Added a separate method here for convenience
1486sub get_sourcefile {
1487 my $self = shift (@_);
1488 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1489}
1490
1491# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1492# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1493sub get_assocfile_from_sourcefile {
1494 my $self = shift (@_);
1495
1496 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1497 my $top_section = $self->get_top_section();
1498 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1499
1500 # get the actual filename as it exists on the filesystem which this url refers to
1501 $source_file = &unicode::url_to_filename($source_file);
1502 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1503 return $assocfilename;
1504}
1505
1506# methods for dealing with associated files
1507
1508# a file is associated with a document, NOT a section.
1509# if section is defined it is noted in the data structure
1510# only so that files associated from a particular section
1511# may be removed later (using delete_section_assoc_files)
1512sub associate_file {
1513 my $self = shift (@_);
1514 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1515 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1516
1517 # remove all associated files with the same name
1518 $self->delete_assoc_file ($assoc_filename);
1519
1520 # Too harsh a requirement
1521 # Definitely get HTML docs, for example, with some missing
1522 # support files
1523# if (!&util::fd_exists($real_filename)) {
1524# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1525# exit -1;
1526# }
1527
1528# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1529# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1530## my $utf8_filename = Encode::encode("utf8",$filename);
1531
1532 push (@{$self->{'associated_files'}},
1533 [$real_filename, $assoc_filename, $mime_type, $section]);
1534}
1535
1536# returns a list of associated files in the form
1537# [[real_filename, assoc_filename, mimetype], ...]
1538sub get_assoc_files {
1539 my $self = shift (@_);
1540
1541 return $self->{'associated_files'};
1542}
1543
1544# the following two methods used to keep track of original associated files
1545# for incremental building. eg a txt file used by an item file does not end
1546# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1547# file for incremental build
1548sub associate_source_file {
1549 my $self = shift (@_);
1550 my ($full_filename) = @_;
1551
1552 push (@{$self->{'source_assoc_files'}}, $full_filename);
1553
1554}
1555
1556sub get_source_assoc_files {
1557 my $self = shift (@_);
1558
1559 return $self->{'source_assoc_files'};
1560
1561
1562}
1563sub metadata_file {
1564 my $self = shift (@_);
1565 my ($real_filename, $filename) = @_;
1566
1567 push (@{$self->{'metadata_files'}},
1568 [$real_filename, $filename]);
1569}
1570
1571# used for writing out the archiveinf-doc info database, to list all the metadata files
1572sub get_meta_files {
1573 my $self = shift (@_);
1574
1575 return $self->{'metadata_files'};
1576}
1577
1578sub delete_section_assoc_files {
1579 my $self = shift (@_);
1580 my ($section) = @_;
1581
1582 my $i=0;
1583 while ($i < scalar (@{$self->{'associated_files'}})) {
1584 if (defined $self->{'associated_files'}->[$i]->[3] &&
1585 $self->{'associated_files'}->[$i]->[3] eq $section) {
1586 splice (@{$self->{'associated_files'}}, $i, 1);
1587 } else {
1588 $i++;
1589 }
1590 }
1591}
1592
1593sub delete_assoc_file {
1594 my $self = shift (@_);
1595 my ($assoc_filename) = @_;
1596
1597 my $i=0;
1598 while ($i < scalar (@{$self->{'associated_files'}})) {
1599 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1600 splice (@{$self->{'associated_files'}}, $i, 1);
1601 } else {
1602 $i++;
1603 }
1604 }
1605}
1606
1607sub reset_nextsection_ptr {
1608 my $self = shift (@_);
1609 my ($section) = @_;
1610
1611 my $section_ptr = $self->_lookup_section($section);
1612 $section_ptr->{'next_subsection'} = 1;
1613}
1614
16151;
Note: See TracBrowser for help on using the repository browser.