source: main/trunk/greenstone2/perllib/doc.pm@ 33312

Last change on this file since 33312 was 33312, checked in by ak19, 5 years ago

Better regex for determining if processor_mode is a legitimate textmode. And added the clarifying comment suggested by Dr Bainbridge

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 50.0 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use FileUtils;
40use ghtml;
41use File::stat;
42##use hashdoc;
43use docprint;
44use JSON;
45
46# We just need pi from the Trig lib
47# Import constants pi2, pip2, pip4 (2*pi, pi/2, pi/4).
48use Math::Trig ':pi';
49
50# the document type may be indexed_doc, nonindexed_doc, or
51# classification
52
53our $OIDcount = 0;
54
55
56# Declare the Package (level) Variable that will tell us whether we're in the import or buildcol phase.
57# This var will be set by import.pl, buildcol.pl and export.pl to the then appropriate value.
58# https://perlmaven.com/package-variables-and-lexical-variables-in-perl
59# But: https://stackoverflow.com/questions/7542120/perl-how-to-make-variables-from-requiring-script-available-in-required-script
60# https://stackoverflow.com/questions/33230015/how-do-i-use-a-variable-in-another-perl-script
61#$doc::cmd_line_mode = undef;
62# Decided to follow the usage of $OIDcount above to declare the necessary package-level variables 'globally' accessible
63# actually accessible only for those packages that import this one (with 'use doc')
64my $cmd_line_mode = undef;
65
66# processor_mode keeps track of which buildcol pass we're at: dummy, text (sidx/didx passes) or infodb
67my $processor_mode = undef;
68
69# rename_method can be 'url', 'none', 'base64'
70sub new {
71 my $class = shift (@_);
72 my ($source_filename, $doc_type, $rename_method) = @_;
73
74
75 my $self = bless {'associated_files'=>[],
76 'subsection_order'=>[],
77 'next_subsection'=>1,
78 'subsections'=>{},
79 'metadata'=>[],
80 'text'=>"",
81 'OIDtype'=>"hash"}, $class;
82
83 # used to set lastmodified here, but this can screw up the HASH ids, so
84 # the docsave processor now calls set_lastmodified
85
86 $self->set_source_path($source_filename);
87
88 if (defined $source_filename) {
89 $source_filename = &util::filename_within_collection($source_filename);
90 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
91 $self->set_source_filename ($source_filename, $rename_method);
92 }
93
94 $self->set_doc_type ($doc_type) if defined $doc_type;
95
96 return $self;
97}
98
99
100sub set_source_path
101{
102 my $self = shift @_;
103 my ($source_filename) = @_;
104
105 if (defined $source_filename) {
106 # On Windows the source_filename can be in terse DOS format
107 # e.g. test~1.txt
108
109 $self->{'terse_source_path'} = $source_filename;
110
111 # Use the FileUtil library methods as they are aware of more special
112 # cases such as HDFS [jmt12]
113 if (&FileUtils::fileExists($source_filename))
114 {
115 # See if we can do better for Windows with a filename
116 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
117 require Win32;
118 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
119 }
120 else {
121 # For Unix-based systems, there is no difference between the two
122 $self->{'source_path'} = $source_filename;
123 }
124 }
125 else {
126 print STDERR "Warning: In doc::set_source_path(), file\n";
127 print STDERR " $source_filename\n";
128 print STDERR " does not exist\n";
129
130 # (default) Set it to whatever we were given
131 $self->{'source_path'} = $source_filename;
132 }
133 }
134 else {
135 # Previous code for setting source_path allowed for
136 # it to be undefined, so continue this practice
137 $self->{'terse_source_path'} = undef;
138 $self->{'source_path'} = undef;
139 }
140}
141
142
143sub get_source_path
144{
145 my $self = shift @_;
146
147 return $self->{'terse_source_path'};
148}
149
150# set lastmodified for OAI purposes, added by GRB, moved by kjdon
151sub set_oailastmodified {
152 my $self = shift (@_);
153
154 my $source_path = $self->{'terse_source_path'};
155
156 if (defined $source_path && (-e $source_path)) {
157 my $current_time = time;
158
159 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
160 $wday, $yday, $isdst) = localtime($current_time);
161
162 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
163
164 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
165 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
166 }
167}
168
169# no longer used for OAI purposes, since lastmodified is not what we want as the
170# Datestamp of a document. This doc metadata may be useful for general purposes.
171sub set_lastmodified {
172 my $self = shift (@_);
173
174 my $source_path = $self->{'terse_source_path'};
175
176 if (defined $source_path && (-e $source_path)) {
177
178 my $file_stat = stat($source_path);
179 my $mtime = $file_stat->mtime;
180 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
181 $wday, $yday, $isdst) = localtime($mtime);
182
183 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
184
185 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
186 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
187 }
188}
189
190# clone the $self object
191sub duplicate {
192 my $self = shift (@_);
193
194 my $newobj = {};
195
196 foreach my $k (keys %$self) {
197 $newobj->{$k} = &clone ($self->{$k});
198 }
199
200 bless $newobj, ref($self);
201 return $newobj;
202}
203
204sub clone {
205 my ($from) = @_;
206 my $type = ref ($from);
207
208 if ($type eq "HASH") {
209 my $to = {};
210 foreach my $key (keys %$from) {
211 $to->{$key} = &clone ($from->{$key});
212 }
213 return $to;
214 } elsif ($type eq "ARRAY") {
215 my $to = [];
216 foreach my $v (@$from) {
217 push (@$to, &clone ($v));
218 }
219 return $to;
220 } else {
221 return $from;
222 }
223}
224
225sub set_OIDtype {
226 my $self = shift (@_);
227 my ($type, $metadata) = @_;
228
229 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
230 $self->{'OIDtype'} = $type;
231 } else {
232 $self->{'OIDtype'} = "hash";
233 }
234
235 if ($type =~ /^assigned$/) {
236 if (defined $metadata) {
237 $self->{'OIDmetadata'} = $metadata;
238 } else {
239 $self->{'OIDmetadata'} = "dc.Identifier";
240 }
241 }
242}
243
244# rename_method can be 'url', 'none', 'base64'
245sub set_source_filename {
246 my $self = shift (@_);
247 my ($source_filename, $rename_method) = @_;
248
249 # Since the gsdlsourcefilename element goes into the doc.xml it has
250 # to be utf8. However, it should also *represent* the source filename
251 # (in the import directory) which may not be utf8 at all.
252 # For instance, if this meta element (gsdlsourcefilename) will be used
253 # by other applications that parse doc.xml in order to locate
254 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
255 # encode the real filename as this is a binary-to-text encoding meaning
256 # that the resulting string is ASCII (utf8). Decoding will give the original.
257
258# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
259
260 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
261 $source_filename = $self->encode_filename($source_filename, $rename_method);
262# my ($srcfilename,$dirname,$suffix)
263# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
264# print STDERR "-> $srcfilename -> ";
265# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
266# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
267# print STDERR "$source_filename\n";
268
269 $self->set_utf8_metadata_element ($self->get_top_section(),
270 "gsdlsourcefilename",
271 $source_filename);
272}
273
274sub encode_filename {
275 my $self = shift (@_);
276 my ($source_filename, $rename_method) = @_;
277
278 my ($srcfilename,$dirname,$suffix)
279 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
280# print STDERR "-> $srcfilename -> ";
281 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
282 $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
283
284 return $source_filename;
285}
286
287sub set_converted_filename {
288 my $self = shift (@_);
289 my ($converted_filename) = @_;
290
291 # we know the converted filename is utf8
292 $self->set_utf8_metadata_element ($self->get_top_section(),
293 "gsdlconvertedfilename",
294 $converted_filename);
295}
296
297# returns the source_filename as it was provided
298sub get_unmodified_source_filename {
299 my $self = shift (@_);
300
301 return $self->{'terse_source_path'};
302}
303
304# returns the source_filename with whatever rename_method was given
305sub get_source_filename {
306 my $self = shift (@_);
307
308 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
309}
310
311
312
313# returns converted filename if available else returns source filename
314sub get_filename_for_hashing {
315 my $self = shift (@_);
316
317 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
318
319 if (!defined $filename) {
320 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
321 # if NULPlug processed file, then don't give a filename
322 if (defined $plugin_name && $plugin_name eq "NULPlug") {
323 $filename = undef;
324 } else { # returns the URL encoded source filename!
325 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
326 }
327 }
328
329 if (!&FileUtils::isFilenameAbsolute($filename)) {
330 $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
331 }
332
333 return $filename;
334}
335
336sub set_doc_type {
337 my $self = shift (@_);
338 my ($doc_type) = @_;
339
340 $self->set_metadata_element ($self->get_top_section(),
341 "gsdldoctype",
342 $doc_type);
343}
344
345# returns the gsdldoctype as it was provided
346# the default of "indexed_doc" is used if no document
347# type was provided
348sub get_doc_type {
349 my $self = shift (@_);
350
351 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
352 return $doc_type if (defined $doc_type);
353 return "indexed_doc";
354}
355
356
357# look up the reference to the a particular section
358sub _lookup_section {
359 my $self = shift (@_);
360 my ($section) = @_;
361
362 my ($num);
363 my $sectionref = $self;
364
365 while (defined $section && $section ne "") {
366
367 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
368
369 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
370
371 $section = "" unless defined $section;
372
373
374 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
375 $sectionref = $sectionref->{'subsections'}->{$num};
376 } else {
377 return undef;
378 }
379 }
380
381 return $sectionref;
382}
383
384# calculate OID by hashing the contents of the document
385sub _calc_OID {
386 my $self = shift (@_);
387 my ($filename) = @_;
388
389
390 my $osexe = &util::get_os_exe();
391
392 my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
393 $ENV{'GSDLOS'},"hashfile$osexe");
394
395 &util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
396 # The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
397
398 # A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
399 # (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
400 # E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
401
402 my $result = "NULL";
403
404
405 if (-e "$hashfile_exe") {
406# $result = `\"$hashfile_exe\" \"$filename\"`;
407# $result = `hashfile$osexe \"$filename\" 2>&1`;
408 $result = `hashfile$osexe \"$filename\"`;
409
410 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
411 } else {
412 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
413 }
414 return "HASH$result";
415}
416
417# methods dealing with OID, not groups of them.
418
419# if $OID is not provided one is calculated
420sub set_OID {
421 my $self = shift (@_);
422 my ($OID) = @_;
423
424 my $use_hash_oid = 0;
425 # if an OID wasn't provided calculate one
426 if (!defined $OID) {
427 $OID = "NULL";
428 if ($self->{'OIDtype'} =~ /^hash/) {
429 $use_hash_oid = 1;
430 } elsif ($self->{'OIDtype'} eq "incremental") {
431 $OID = "D" . $OIDcount;
432 $OIDcount ++;
433 } elsif ($self->{'OIDtype'} eq "filename") {
434 my $filename = $self->get_source_filename();
435 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
436 $OID = &util::tidy_up_oid($OID);
437 } elsif ($self->{'OIDtype'} eq "full_filename") {
438 my $source_filename = $self->get_source_filename();
439 my $dirsep = &util::get_os_dirsep();
440
441 $source_filename =~ s/^import$dirsep//;
442 $source_filename =~ s/$dirsep/-/g;
443 $source_filename =~ s/\./_/g;
444
445 $OID = $source_filename;
446 $OID = &util::tidy_up_oid($OID);
447 } elsif ($self->{'OIDtype'} eq "dirname") {
448 my $filename = $self->get_source_filename();
449 if (defined($filename)) { # && -e $filename) {
450 # get the immediate parent directory
451 $OID = &File::Basename::dirname($filename);
452 if (defined $OID) {
453 $OID = &File::Basename::basename($OID);
454 $OID = &util::tidy_up_oid($OID);
455 } else {
456 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
457 $use_hash_oid = 1;
458 }
459 } else {
460 print STDERR "Failed to find a filename, generating hash id\n";
461 $use_hash_oid = 1;
462 }
463
464 } elsif ($self->{'OIDtype'} eq "assigned") {
465 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
466 if (defined $identifier && $identifier ne "") {
467 $OID = $identifier;
468 $OID = &util::tidy_up_oid($OID);
469 } else {
470 # need a hash id
471 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
472 $use_hash_oid = 1;
473 }
474
475 } else {
476 $use_hash_oid = 1;
477 }
478
479 if ($use_hash_oid) {
480 my $hash_on_file = 1;
481 my $hash_on_ga_xml = 0;
482
483 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
484 $hash_on_file = 0;
485 $hash_on_ga_xml = 1;
486 }
487
488 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
489 $hash_on_file = 0;
490 $hash_on_ga_xml = 0;
491
492 my $source_filename = $self->get_source_filename();
493 my $dirsep = &util::get_os_dirsep();
494
495 $source_filename =~ s/^import$dirsep//;
496 $source_filename =~ s/$dirsep/-/g;
497 $source_filename =~ s/\./_/g;
498
499 # If the filename is very short then (handled naively)
500 # this can cause conjestion in the hash-values
501 # computed, leading documents sharing the same leading
502 # Hex values in the computed has.
503 #
504 # The solution taken here is to replace the name of
505 # the file name a sufficient number of times (up to
506 # the character limit defined in 'rep_limit' and
507 # make that the content that is hashed on
508
509 # *** Think twice before changing the following value
510 # as it will break backward compatability of computed
511 # document HASH values
512
513 my $rep_limit = 256;
514 my $hash_content = undef;
515
516 if (length($source_filename)<$rep_limit) {
517 my $rep_string = "$source_filename|";
518 my $rs_len = length($rep_string);
519
520 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
521
522 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
523 }
524 else {
525 $hash_content = $source_filename;
526 }
527
528 my $filename = &util::get_tmp_filename();
529 if (!open (OUTFILE, ">:utf8", $filename)) {
530 print STDERR "doc::set_OID could not write to $filename\n";
531 } else {
532 print OUTFILE $hash_content;
533 close (OUTFILE);
534 }
535 $OID = $self->_calc_OID ($filename);
536
537 &FileUtils::removeFiles ($filename);
538 }
539
540 if ($hash_on_file) {
541 # "hash" OID - feed file to hashfile.exe
542 my $filename = $self->get_filename_for_hashing();
543
544 # -z: don't want to hash on the file if it is zero size
545 if (defined($filename) && -e $filename && !-z $filename) {
546 $OID = $self->_calc_OID ($filename);
547 } else {
548 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
549 }
550 }
551
552 if ($hash_on_ga_xml) {
553 # In addition being asked to explicity calculate the has based on the GA file,
554 # can also end up coming into this block is doing 'hash_on_file' but the file
555 # itself is of zero bytes (as could be the case with 'doc.nul' file
556
557 my $filename = &util::get_tmp_filename();
558 if (!open (OUTFILE, ">:utf8", $filename)) {
559 print STDERR "doc::set_OID could not write to $filename\n";
560 } else {
561 my $doc_text = &docprint::get_section_xml($self);
562 print OUTFILE $doc_text;
563 close (OUTFILE);
564 }
565 $OID = $self->_calc_OID ($filename);
566 &FileUtils::removeFiles($filename);
567 }
568 }
569 }
570 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
571}
572
573# this uses hashdoc (embedded c thingy) which is faster but still
574# needs a little work to be sufficiently stable
575sub ___set_OID {
576 my $self = shift (@_);
577 my ($OID) = @_;
578
579 # if an OID wasn't provided then calculate hash value based on document
580 if (!defined $OID)
581 {
582 my $hash_text = &docprint::get_section_xml($self);
583 my $hash_len = length($hash_text);
584
585 $OID = &hashdoc::buffer($hash_text,$hash_len);
586 }
587
588 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
589}
590
591# returns the OID for this document
592sub get_OID {
593 my $self = shift (@_);
594 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
595 return $OID if (defined $OID);
596 return "NULL";
597}
598
599sub delete_OID {
600 my $self = shift (@_);
601
602 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
603}
604
605
606# methods for manipulating section names
607
608# returns the name of the top-most section (the top
609# level of the document
610sub get_top_section {
611 my $self = shift (@_);
612
613 return "";
614}
615
616# returns a section
617sub get_parent_section {
618 my $self = shift (@_);
619 my ($section) = @_;
620
621 $section =~ s/(^|\.)\d+$//;
622
623 return $section;
624}
625
626# returns the first child section (or the end child
627# if there isn't any)
628sub get_begin_child {
629 my $self = shift (@_);
630 my ($section) = @_;
631
632 my $section_ptr = $self->_lookup_section($section);
633 return "" unless defined $section_ptr;
634
635 if (defined $section_ptr->{'subsection_order'}->[0]) {
636 return "$section.$section_ptr->{'subsection_order'}->[0]";
637 }
638
639 return $self->get_end_child ($section);
640}
641
642# returns the next child of a parent section
643sub get_next_child {
644 my $self = shift (@_);
645 my ($section) = @_;
646
647 my $parent_section = $self->get_parent_section($section);
648 my $parent_section_ptr = $self->_lookup_section($parent_section);
649 return undef unless defined $parent_section_ptr;
650
651 my ($section_num) = $section =~ /(\d+)$/;
652 return undef unless defined $section_num;
653
654 my $i = 0;
655 my $section_order = $parent_section_ptr->{'subsection_order'};
656 while ($i < scalar(@$section_order)) {
657 last if $section_order->[$i] eq $section_num;
658 $i++;
659 }
660
661 $i++; # the next child
662 if ($i < scalar(@$section_order)) {
663 return $section_order->[$i] if $parent_section eq "";
664 return "$parent_section.$section_order->[$i]";
665 }
666
667 # no more sections in this level
668 return undef;
669}
670
671# returns a reference to a list of children
672sub get_children {
673 my $self = shift (@_);
674 my ($section) = @_;
675
676 my $section_ptr = $self->_lookup_section($section);
677 return [] unless defined $section_ptr;
678
679 my @children = @{$section_ptr->{'subsection_order'}};
680
681 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
682 return \@children;
683}
684
685# returns the child section one past the last one (which
686# is coded as "0")
687sub get_end_child {
688 my $self = shift (@_);
689 my ($section) = @_;
690
691 return $section . ".0" unless $section eq "";
692 return "0";
693}
694
695# returns the next section in book order
696sub get_next_section {
697 my $self = shift (@_);
698 my ($section) = @_;
699
700 return undef unless defined $section;
701
702 my $section_ptr = $self->_lookup_section($section);
703 return undef unless defined $section_ptr;
704
705 # first try to find first child
706 if (defined $section_ptr->{'subsection_order'}->[0]) {
707 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
708 return "$section.$section_ptr->{'subsection_order'}->[0]";
709 }
710
711 do {
712 # try to find sibling
713 my $next_child = $self->get_next_child ($section);
714 return $next_child if (defined $next_child);
715
716 # move up one level
717 $section = $self->get_parent_section ($section);
718 } while $section =~ /\d/;
719
720 return undef;
721}
722
723sub is_leaf_section {
724 my $self = shift (@_);
725 my ($section) = @_;
726
727 my $section_ptr = $self->_lookup_section($section);
728 return 1 unless defined $section_ptr;
729
730 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
731}
732
733# methods for dealing with sections
734
735# returns the name of the inserted section
736sub insert_section {
737 my $self = shift (@_);
738 my ($before_section) = @_;
739
740 # get the child to insert before and its parent section
741 my $parent_section = "";
742 my $before_child = "0";
743 my @before_section = split (/\./, $before_section);
744 if (scalar(@before_section) > 0) {
745 $before_child = pop (@before_section);
746 $parent_section = join (".", @before_section);
747 }
748
749 my $parent_section_ptr = $self->_lookup_section($parent_section);
750 if (!defined $parent_section_ptr) {
751 print STDERR "doc::insert_section couldn't find parent section " .
752 "$parent_section\n";
753 return;
754 }
755
756 # get the next section number
757 my $section_num = $parent_section_ptr->{'next_subsection'}++;
758
759 my $i = 0;
760 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
761 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
762 $i++;
763 }
764
765 # insert the section number into the order list
766 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
767
768 # add this section to the parent section
769 my $section_ptr = {'subsection_order'=>[],
770 'next_subsection'=>1,
771 'subsections'=>{},
772 'metadata'=>[],
773 'text'=>""};
774 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
775
776 # work out the full section number
777 my $section = $parent_section;
778 $section .= "." unless $section eq "";
779 $section .= $section_num;
780
781 return $section;
782}
783
784# creates a pre-named section
785sub create_named_section {
786 my $self = shift (@_);
787 my ($mastersection) = @_;
788
789 my ($num);
790 my $section = $mastersection;
791 my $sectionref = $self;
792
793 while ($section ne "") {
794 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
795 $num =~ s/^0+(\d)/$1/; # remove leading 0s
796 $section = "" unless defined $section;
797
798 if (defined $num) {
799 if (!defined $sectionref->{'subsections'}->{$num}) {
800 push (@{$sectionref->{'subsection_order'}}, $num);
801 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
802 'next_subsection'=>1,
803 'subsections'=>{},
804 'metadata'=>[],
805 'text'=>""};
806 if ($num >= $sectionref->{'next_subsection'}) {
807 $sectionref->{'next_subsection'} = $num + 1;
808 }
809 }
810 $sectionref = $sectionref->{'subsections'}->{$num};
811
812 } else {
813 print STDERR "doc::create_named_section couldn't create section ";
814 print STDERR "$mastersection\n";
815 last;
816 }
817 }
818}
819
820# returns a reference to a list of subsections
821sub list_subsections {
822 my $self = shift (@_);
823 my ($section) = @_;
824
825 my $section_ptr = $self->_lookup_section ($section);
826 if (!defined $section_ptr) {
827 print STDERR "doc::list_subsections couldn't find section $section\n";
828 return [];
829 }
830
831 return [@{$section_ptr->{'subsection_order'}}];
832}
833
834sub delete_section {
835 my $self = shift (@_);
836 my ($section) = @_;
837
838# my $section_ptr = {'subsection_order'=>[],
839# 'next_subsection'=>1,
840# 'subsections'=>{},
841# 'metadata'=>[],
842# 'text'=>""};
843
844 # if this is the top section reset everything
845 if ($section eq "") {
846 $self->{'subsection_order'} = [];
847 $self->{'subsections'} = {};
848 $self->{'metadata'} = [];
849 $self->{'text'} = "";
850 return;
851 }
852
853 # find the parent of the section to delete
854 my $parent_section = "";
855 my $child = "0";
856 my @section = split (/\./, $section);
857 if (scalar(@section) > 0) {
858 $child = pop (@section);
859 $parent_section = join (".", @section);
860 }
861
862 my $parent_section_ptr = $self->_lookup_section($parent_section);
863 if (!defined $parent_section_ptr) {
864 print STDERR "doc::delete_section couldn't find parent section " .
865 "$parent_section\n";
866 return;
867 }
868
869 # remove this section from the subsection_order list
870 my $i = 0;
871 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
872 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
873 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
874 last;
875 }
876 $i++;
877 }
878
879 # remove this section from the subsection hash
880 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
881 undef $parent_section_ptr->{'subsections'}->{$child};
882 }
883}
884
885#--
886# methods for dealing with metadata
887
888# set_metadata_element and get_metadata_element are for metadata
889# which should only have one value. add_meta_data and get_metadata
890# are for metadata which can have more than one value.
891
892# returns the first metadata value which matches field
893
894# This version of get metadata element works much like the one above,
895# except it allows for the namespace portion of a metadata element to
896# be ignored, thus if you are searching for dc.Title, the first piece
897# of matching metadata ending with the name Title (once any namespace
898# is removed) would be returned.
899# 28-11-2003 John Thompson
900sub get_metadata_element {
901 my $self = shift (@_);
902 my ($section, $field, $ignore_namespace) = @_;
903 my ($data);
904
905 $ignore_namespace = 0 unless defined $ignore_namespace;
906
907 my $section_ptr = $self->_lookup_section($section);
908 if (!defined $section_ptr) {
909 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
910 return;
911 }
912
913 # Remove any namespace if we are being told to ignore them
914 if($ignore_namespace) {
915 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
916 }
917
918 foreach $data (@{$section_ptr->{'metadata'}}) {
919
920 my $data_name = $data->[0];
921
922 # Remove any namespace if we are being told to ignore them
923 if($ignore_namespace) {
924 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
925 }
926 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
927 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
928
929 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
930 }
931
932 return undef; # was not found
933}
934
935# returns a list of the form [value1, value2, ...]
936sub get_metadata {
937 my $self = shift (@_);
938 my ($section, $field, $ignore_namespace) = @_;
939 my ($data);
940
941 $ignore_namespace = 0 unless defined $ignore_namespace;
942
943 my $section_ptr = $self->_lookup_section($section);
944 if (!defined $section_ptr) {
945 print STDERR "doc::get_metadata couldn't find section ",
946 $section, "\n";
947 return;
948 }
949
950 # Remove any namespace if we are being told to ignore them
951 if($ignore_namespace) {
952 $field =~ s/^.*\.//;
953 }
954
955 my @metadata = ();
956 foreach $data (@{$section_ptr->{'metadata'}}) {
957
958 my $data_name = $data->[0];
959
960 # Remove any namespace if we are being told to ignore them
961 if($ignore_namespace) {
962 $data_name =~ s/^.*\.//;
963 }
964 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
965 $data_name =~ s/^ex\.([^.]+)$/$1/;
966
967 push (@metadata, $data->[1]) if ($data_name eq $field);
968 }
969
970 return \@metadata;
971}
972
973sub get_metadata_hashmap {
974 my $self = shift (@_);
975 my ($section, $opt_namespace) = @_;
976
977 my $section_ptr = $self->_lookup_section($section);
978 if (!defined $section_ptr) {
979 print STDERR "doc::get_metadata couldn't find section ",
980 $section, "\n";
981 return;
982 }
983
984 my $metadata_hashmap = {};
985 foreach my $data (@{$section_ptr->{'metadata'}}) {
986 my $metaname = $data->[0];
987
988 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
989 if (!defined $metadata_hashmap->{$metaname}) {
990 $metadata_hashmap->{$metaname} = [];
991 }
992 my $metaval_list = $metadata_hashmap->{$metaname};
993 push(@$metaval_list, $data->[1]);
994 }
995 }
996
997 return $metadata_hashmap;
998}
999
1000# returns a list of the form [[field,value],[field,value],...]
1001sub get_all_metadata {
1002 my $self = shift (@_);
1003 my ($section) = @_;
1004
1005 my $section_ptr = $self->_lookup_section($section);
1006 if (!defined $section_ptr) {
1007 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
1008 return;
1009 }
1010
1011 return $section_ptr->{'metadata'};
1012}
1013
1014# $value is optional
1015sub delete_metadata {
1016 my $self = shift (@_);
1017 my ($section, $field, $value) = @_;
1018
1019 my $section_ptr = $self->_lookup_section($section);
1020 if (!defined $section_ptr) {
1021 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
1022 return;
1023 }
1024
1025 my $i = 0;
1026 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1027 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1028 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1029 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1030 } else {
1031 $i++;
1032 }
1033 }
1034}
1035
1036sub delete_all_metadata {
1037 my $self = shift (@_);
1038 my ($section) = @_;
1039
1040 my $section_ptr = $self->_lookup_section($section);
1041 if (!defined $section_ptr) {
1042 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1043 return;
1044 }
1045
1046 $section_ptr->{'metadata'} = [];
1047}
1048
1049sub set_metadata_element {
1050 my $self = shift (@_);
1051 my ($section, $field, $value) = @_;
1052
1053 $self->set_utf8_metadata_element ($section, $field,
1054 &unicode::ascii2utf8(\$value));
1055}
1056
1057# set_utf8_metadata_element assumes the text has already been
1058# converted to the UTF-8 encoding.
1059sub set_utf8_metadata_element {
1060 my $self = shift (@_);
1061 my ($section, $field, $value) = @_;
1062
1063 $self->delete_metadata ($section, $field);
1064 $self->add_utf8_metadata ($section, $field, $value);
1065}
1066
1067
1068# add_metadata assumes the text is in (extended) ascii form. For
1069# text which has already been converted to the UTF-8 format use
1070# add_utf8_metadata.
1071sub add_metadata {
1072 my $self = shift (@_);
1073 my ($section, $field, $value) = @_;
1074
1075 $self->add_utf8_metadata ($section, $field,
1076 &unicode::ascii2utf8(\$value));
1077}
1078
1079sub add_utf8_metadata {
1080 my $self = shift (@_);
1081 my ($section, $field, $value) = @_;
1082
1083 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1084 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1085 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1086
1087 my $section_ptr = $self->_lookup_section($section);
1088 if (!defined $section_ptr) {
1089 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1090 return;
1091 }
1092 if (!defined $value) {
1093 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1094 return;
1095 }
1096 if (!defined $field) {
1097 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1098 return;
1099 }
1100
1101 #print STDERR "###$field=$value\n";
1102
1103 # For now, supress this check. Given that text data read in is now
1104 # Unicode aware, then the following block of code can (ironically enough)
1105 # cause our unicode compliant string to be re-encoded (leading to
1106 # a double-encoded UTF-8 string, which we definitely don't want!).
1107
1108
1109 # double check that the value is utf-8
1110 # if (!&unicode::check_is_utf8($value)) {
1111 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1112 # &unicode::ensure_utf8(\$value);
1113 # print STDERR " Tried converting to utf8: $value\n";
1114 # }
1115
1116 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1117 if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1118 {
1119 my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1120 if (defined $mdprefix) {
1121 # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1122 push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1123 }
1124
1125 my $direction;
1126 if($value =~ m/^-/)
1127 {
1128 $direction = ($field eq "Latitude") ? "S" : "W";
1129 }
1130 else
1131 {
1132 $direction = ($field eq "Latitude") ? "N" : "E";
1133 }
1134
1135 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1136 if(defined $beforeDec && defined $afterDec)
1137 {
1138 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1139 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1140
1141 for(my $i = 2; $i <= 4; $i++)
1142 {
1143 if(length($afterDec) >= $i)
1144 {
1145 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1146 }
1147 }
1148
1149 #Only add the metadata if it has not already been added
1150 my $metaMap = $self->get_metadata_hashmap($section); # metaMap not used and called function has no apparent side-effects,
1151 # but this line appears important for ensuring uniqueness of (Latitude, value) meta
1152 # in the section. Also for LatShort, Longitude, LngShort.
1153 }
1154
1155 if($field =~ m/^(.+\.)?Longitude$/) {
1156 # if we are dealing with Longitude meta, we should 1. have Latitude meta too; 2. already have processed Latitude meta
1157 # in that case, add both Lat and Lng of this section as a Coordinate meta
1158 my $latitude = $self->get_metadata_element ($section, "Latitude");
1159 my $longitude = $value;
1160 # TODO: would like all Longitude meta placed together, followed by all Coordinate meta,
1161 # but the following will add all such coord meta to the doc object and the end of this function will add Longitude meta
1162 $self->processCoordinate($section, $latitude, $longitude);
1163 }
1164 }
1165
1166 elsif($field eq "GPS.mapOverlay") { # then the $value is a JSON string
1167
1168 # In order to allow searching map data enriched documents by map shape descriptions,
1169 # and to run rawquery searches for other docs by proximity based on their map data,
1170 # need to store the shape descriptions and Coordinate info for shapes into the text index.
1171 # We add the description for each shape in the mapoverlay into the text index as GPSMapOverlayLabel
1172 # And we add Coordinate (CD) and CoordShort (CS) info for each shape in the mapoverlay in the format (Lat, Lng) as ("37S339 175E342")
1173 # where the digits before the N/S/E/W direction represents the whole number. And the digits after the direction are the
1174 # decimal places which can range from 0 and 2-4 digits.
1175
1176 # However, we only want to process GPS.mapOverlay only during buildcol and only in the text indexing passes (e.g. sidx and didx for lucene)
1177 # and certainly never during the infodb pass of buildcol. The latter can end up duplicating Coordinate/CoordShort/GPSMapOverlayLabel for
1178 # when rebuilding with the online doc editor as that runs incremental-rebuild which then calls basebuilder::reconstruct_doc_objs_metadata()
1179 # on all docs NOT being incrementally rebuilt. That call would get meta from the infodb and use it to reconstruct the doc objects of docs
1180 # NOT being incrementally built. If the Coord and Label meta were written to the infodb, they would then be loaded back in when the collection
1181 # is incrementally rebuilt for those docs that don't need incremental processing. Then this function would once again add the same meta into
1182 # the infodb, thus duplicating what goes into the infodb. Hence, don't do all the following if doc::processor_mode eq "infodb".
1183
1184 # Note that for incremental rebuilding, the text pass can be called textreindex for instance (and infodb pass can be incinfodb).
1185 # So don't check for exact string match
1186
1187 if($doc::cmd_line_mode eq "buildcol" && $doc::processor_mode =~ m/^text/) { # currently known text processor_modes:
1188 # text, textreindex, possibly textdelete (see ArchivesInfPlugin.pm::read() for last 2).
1189 # OR: ..."buildcol" && $doc::processor_mode !~ m/infodb$/) # if dummy pass important
1190
1191 ###print STDERR "GPS.mapOverlay before decoding, val = " . $value . "\n";
1192
1193 # TODO: html decode?
1194 $value =~ s@&#091;@[@g;
1195 $value =~ s@&#093;@]@g;
1196 $value =~ s@&quot;@"@g;
1197 ###print STDERR "GPS.mapOverlay after decoding, val = " . $value . "\n";
1198
1199 my $json_array = decode_json $value;
1200
1201 foreach my $shape (@$json_array) {
1202
1203 # Put each available shape description/label into this section's metadata with GPSMapOverlayLabel as metaname.
1204 # Just as for Coordinate meta, don't need to know which shape a label belongs too. This is just so each label
1205 # will be indexed, and therefore can be searched.
1206
1207 my $description = $shape->{"description"};
1208 if($description) {
1209 push (@{$section_ptr->{'metadata'}}, ["GPSMapOverlayLabel", $description]);
1210 ###print STDERR "@@@@############################################ Just added description meta: " . $description . "\n";
1211 }
1212
1213 my $type = $shape->{"type"};
1214 ###print STDERR "Shape type : " . $type . "\n";
1215
1216 if($type eq "circle") {
1217 ###print STDERR "Found a circle:\n" . &printShape($json, $shape);
1218
1219 # work out bounding box
1220 # want the inverse of this useful page:
1221 # https://stackoverflow.com/questions/639695/how-to-convert-latitude-or-longitude-to-meters
1222 # https://www.geodatasource.com/developers/javascript
1223
1224 # Dr Bainbridge wants us to use: https://gis.stackexchange.com/questions/5821/calculating-latitude-longitude-x-miles-from-point
1225 # Since for N,E,S,W the following works out the same as above link, we're using the rule of thumb in the following:
1226 # https://gis.stackexchange.com/questions/2951/algorithm-for-offsetting-a-latitude-longitude-by-some-amount-of-meters
1227 # which states
1228 # "If your displacements aren't too great (less than a few kilometers) and you're not right at the poles,
1229 # use the quick and dirty estimate that 111,111 meters (111.111 km) in the y direction is 1 degree
1230 # (of latitude) and 111,111 * cos(latitude) meters in the x direction is 1 degree (of longitude)."
1231 my $centre_lat = $shape->{"center"}->{"lat"};
1232 my $centre_lng = $shape->{"center"}->{"lng"};
1233 my $radius = $shape->{"radius"}; # in metres!
1234
1235 ###print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
1236
1237 my $lat_north = $centre_lat + ($radius/111111);
1238 my $lat_south = $centre_lat - ($radius/111111);
1239
1240 ###print STDERR "### lat_north: $lat_north\n";
1241 ###print STDERR "### lat_south: $lat_south\n";
1242
1243 # our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
1244 # all expect their angle to be in radians. So need to convert from degree to radians before we can take the cos of it.
1245 my $centre_lat_radians = $self->degreesToRadians($centre_lat);
1246 my $cos_in_radians = cos($centre_lat_radians);
1247 ###print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
1248 my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
1249 my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
1250 ###print STDERR "### lng_east $lng_east\n";
1251 ###print STDERR "### lng_west $lng_west\n";
1252
1253 my $cos_lat = cos($centre_lat);
1254 ###print STDERR "cos $centre_lat is $cos_lat\n";
1255
1256 $self->processCoordinate($section, $lat_north, $lng_east);
1257 $self->processCoordinate($section, $lat_south, $lng_east);
1258 $self->processCoordinate($section, $lat_south, $lng_west);
1259 $self->processCoordinate($section, $lat_north, $lng_west);
1260
1261 }
1262 elsif ($type eq "marker") {
1263 ###print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
1264 ###print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
1265 $self->processCoordinate($section, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});
1266 }
1267 elsif ($type eq "polyline" || $type eq "polygon") {
1268 my $path_array = $shape->{"path"};
1269 foreach my $position (@$path_array) {
1270 $self->processCoordinate($section, $position->{"lat"}, $position->{"lng"});
1271 }
1272 }
1273 elsif ($type eq "rectangle") {
1274
1275 my $bounds = $shape->{"bounds"};
1276 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"east"});
1277 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"east"});
1278 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"west"});
1279 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"west"});
1280 }
1281
1282 } # end for on each shape in GPS.mapOverlay
1283 } # end if(buildcol and text pass)
1284 } # end GPS.mapOverlay meta
1285
1286 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1287}
1288
1289# https://en.wikipedia.org/wiki/Radian
1290sub degreesToRadians
1291{
1292 my $self = shift (@_);
1293 my ($degrees) = @_;
1294
1295 return $degrees * pi /180; # returns radians
1296}
1297
1298sub radiansToDegrees
1299{
1300 my $self = shift (@_);
1301 my ($radians) = @_;
1302
1303 return $radians * 180 / pi; # returns degrees
1304}
1305
1306# Call as:
1307# my $json = JSON->new->allow_nonref;
1308# &printAllShapes($json, $json_array);
1309sub printAllShapes {
1310 my ($json, $json_array) = @_;
1311
1312
1313 #my $pretty_print_shape = $json->pretty->encode( $json_array->[0] );
1314 foreach my $shape (@$json_array) {
1315 my $pretty_print_shape = $json->pretty->encode( $shape );
1316 print STDERR "Shape: $pretty_print_shape\n";
1317 }
1318
1319}
1320
1321# For the (lat, lng) coordinate given,
1322# attaches Coordinate and multiple CoordShort (different precision level) metadata to the doc object
1323sub processCoordinate {
1324 my $self = shift (@_);
1325 my ($section, $latitude, $longitude) = @_;
1326
1327 my $section_ptr = $self->_lookup_section($section);
1328
1329 my $lat_direction = ($latitude =~ m/^-/) ? "S" : "N";
1330 my $lng_direction = ($longitude =~ m/^-/) ? "W" : "E";
1331
1332 # have to store (lat, lng) in pairs, when there are so many coords to store
1333 #push (@{$section_ptr->{'metadata'}}, ["Latitude", $latitude]);
1334 #push (@{$section_ptr->{'metadata'}}, ["Longitude", $longitude]);
1335
1336 push (@{$section_ptr->{'metadata'}}, ["Coordinate", "$latitude $longitude"]); # "$latitude$lat_direction $longitude$lng_direction"
1337
1338 my ($latBeforeDec, $latAfterDec);
1339 my ($lngBeforeDec, $lngAfterDec);
1340
1341 if($latitude !~ m/\./) {
1342 $latBeforeDec = $latitude;
1343 $latAfterDec = "";
1344 } else {
1345 ($latBeforeDec, $latAfterDec) = ($latitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1346 }
1347 if($longitude !~ m/\./) {
1348 $lngBeforeDec = $longitude;
1349 $lngAfterDec = "";
1350 } else {
1351 ($lngBeforeDec, $lngAfterDec) = ($longitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1352 }
1353
1354
1355 my $name = "CoordShort";
1356 push (@{$section_ptr->{'metadata'}}, [$name, "$latBeforeDec$lat_direction $lngBeforeDec$lng_direction"]);
1357
1358 for(my $i = 2; $i <= 4; $i++)
1359 {
1360 my $latDecPlaces = (length($latAfterDec) >= $i) ? substr($latAfterDec, 0, $i) : "";
1361 my $lngDecPlaces = (length($lngAfterDec) >= $i) ? substr($lngAfterDec, 0, $i) : "";
1362
1363 push (@{$section_ptr->{'metadata'}}, [$name,
1364 $latBeforeDec . $lat_direction. $latDecPlaces . " " . $lngBeforeDec . $lng_direction. $lngDecPlaces]);
1365
1366 }
1367
1368 #Only add the metadata if it has not already been added
1369 #my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no apparent side-effects.)
1370}
1371
1372
1373# methods for dealing with text
1374
1375# returns the text for a section
1376sub get_text {
1377 my $self = shift (@_);
1378 my ($section) = @_;
1379
1380 my $section_ptr = $self->_lookup_section($section);
1381 if (!defined $section_ptr) {
1382 print STDERR "doc::get_text couldn't find section " .
1383 "$section\n";
1384 return "";
1385 }
1386
1387 return $section_ptr->{'text'};
1388}
1389
1390# returns the (utf-8 encoded) length of the text for a section
1391sub get_text_length {
1392 my $self = shift (@_);
1393 my ($section) = @_;
1394
1395 my $section_ptr = $self->_lookup_section($section);
1396 if (!defined $section_ptr) {
1397 print STDERR "doc::get_text_length couldn't find section " .
1398 "$section\n";
1399 return 0;
1400 }
1401
1402 return length ($section_ptr->{'text'});
1403}
1404
1405# returns the total length for all the sections
1406sub get_total_text_length {
1407 my $self = shift (@_);
1408
1409 my $section = $self->get_top_section();
1410 my $length = 0;
1411 while (defined $section) {
1412 $length += $self->get_text_length($section);
1413 $section = $self->get_next_section($section);
1414 }
1415 return $length;
1416}
1417
1418sub delete_text {
1419 my $self = shift (@_);
1420 my ($section) = @_;
1421
1422 my $section_ptr = $self->_lookup_section($section);
1423 if (!defined $section_ptr) {
1424 print STDERR "doc::delete_text couldn't find section " .
1425 "$section\n";
1426 return;
1427 }
1428
1429 $section_ptr->{'text'} = "";
1430}
1431
1432# add_text assumes the text is in (extended) ascii form. For
1433# text which has been already converted to the UTF-8 format
1434# use add_utf8_text.
1435sub add_text {
1436 my $self = shift (@_);
1437 my ($section, $text) = @_;
1438
1439 # convert the text to UTF-8 encoded unicode characters
1440 # and add the text
1441 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1442}
1443
1444
1445# add_utf8_text assumes the text to be added has already
1446# been converted to the UTF-8 encoding. For ascii text use
1447# add_text
1448# Pass by value version (internally calls pass by ref version
1449# to avoid code duplication)
1450sub add_utf8_text {
1451 my $self = shift (@_);
1452 my ($section, $text) = @_;
1453
1454 $self->add_utf8_textref($section, \$text);
1455}
1456
1457# Pass by reference version, used by GreenstoneSQLPlugin for fulltext
1458sub add_utf8_textref {
1459 my $self = shift (@_);
1460 my ($section, $text_ref) = @_;
1461
1462 my $section_ptr = $self->_lookup_section($section);
1463 if (!defined $section_ptr) {
1464 print STDERR "doc::add_utf8_textref couldn't find section " .
1465 "$section\n";
1466 return;
1467 }
1468
1469 $section_ptr->{'text'} .= $$text_ref;
1470}
1471
1472# returns the Source meta, which is the utf8 filename generated.
1473# Added a separate method here for convenience
1474sub get_source {
1475 my $self = shift (@_);
1476 return $self->get_metadata_element ($self->get_top_section(), "Source");
1477}
1478
1479# returns the SourceFile meta, which is the url reference to the URL-encoded
1480# version of Source (the utf8 filename). Added a separate method here for convenience
1481sub get_sourcefile {
1482 my $self = shift (@_);
1483 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1484}
1485
1486# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1487# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1488sub get_assocfile_from_sourcefile {
1489 my $self = shift (@_);
1490
1491 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1492 my $top_section = $self->get_top_section();
1493 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1494
1495 # get the actual filename as it exists on the filesystem which this url refers to
1496 $source_file = &unicode::url_to_filename($source_file);
1497 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1498 return $assocfilename;
1499}
1500
1501# methods for dealing with associated files
1502
1503# a file is associated with a document, NOT a section.
1504# if section is defined it is noted in the data structure
1505# only so that files associated from a particular section
1506# may be removed later (using delete_section_assoc_files)
1507sub associate_file {
1508 my $self = shift (@_);
1509 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1510 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1511
1512 # remove all associated files with the same name
1513 $self->delete_assoc_file ($assoc_filename);
1514
1515 # Too harsh a requirement
1516 # Definitely get HTML docs, for example, with some missing
1517 # support files
1518# if (!&util::fd_exists($real_filename)) {
1519# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1520# exit -1;
1521# }
1522
1523# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1524# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1525## my $utf8_filename = Encode::encode("utf8",$filename);
1526
1527 push (@{$self->{'associated_files'}},
1528 [$real_filename, $assoc_filename, $mime_type, $section]);
1529}
1530
1531# returns a list of associated files in the form
1532# [[real_filename, assoc_filename, mimetype], ...]
1533sub get_assoc_files {
1534 my $self = shift (@_);
1535
1536 return $self->{'associated_files'};
1537}
1538
1539# the following two methods used to keep track of original associated files
1540# for incremental building. eg a txt file used by an item file does not end
1541# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1542# file for incremental build
1543sub associate_source_file {
1544 my $self = shift (@_);
1545 my ($full_filename) = @_;
1546
1547 push (@{$self->{'source_assoc_files'}}, $full_filename);
1548
1549}
1550
1551sub get_source_assoc_files {
1552 my $self = shift (@_);
1553
1554 return $self->{'source_assoc_files'};
1555
1556
1557}
1558sub metadata_file {
1559 my $self = shift (@_);
1560 my ($real_filename, $filename) = @_;
1561
1562 push (@{$self->{'metadata_files'}},
1563 [$real_filename, $filename]);
1564}
1565
1566# used for writing out the archiveinf-doc info database, to list all the metadata files
1567sub get_meta_files {
1568 my $self = shift (@_);
1569
1570 return $self->{'metadata_files'};
1571}
1572
1573sub delete_section_assoc_files {
1574 my $self = shift (@_);
1575 my ($section) = @_;
1576
1577 my $i=0;
1578 while ($i < scalar (@{$self->{'associated_files'}})) {
1579 if (defined $self->{'associated_files'}->[$i]->[3] &&
1580 $self->{'associated_files'}->[$i]->[3] eq $section) {
1581 splice (@{$self->{'associated_files'}}, $i, 1);
1582 } else {
1583 $i++;
1584 }
1585 }
1586}
1587
1588sub delete_assoc_file {
1589 my $self = shift (@_);
1590 my ($assoc_filename) = @_;
1591
1592 my $i=0;
1593 while ($i < scalar (@{$self->{'associated_files'}})) {
1594 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1595 splice (@{$self->{'associated_files'}}, $i, 1);
1596 } else {
1597 $i++;
1598 }
1599 }
1600}
1601
1602sub reset_nextsection_ptr {
1603 my $self = shift (@_);
1604 my ($section) = @_;
1605
1606 my $section_ptr = $self->_lookup_section($section);
1607 $section_ptr->{'next_subsection'} = 1;
1608}
1609
16101;
Note: See TracBrowser for help on using the repository browser.