source: main/trunk/greenstone2/perllib/doc.pm@ 33217

Last change on this file since 33217 was 33217, checked in by wy59, 5 years ago
  1. Undoing the change to doc.pm committed in revision 33185: not storing all combinations of precision for Lat and Long in CoordShorts. Instead, Dr Bainbridge fixed up map-scripts.js to ensure that the requested precision (num decimal places) for Lat and Lng in a CoordShort are now the same. The chosen precision for both is set to whichever of the two is more zoomed out, which work out to be whichever of the two has fewer decimal places. 2. Dr Bainbridge and Martin set min-height when mapEnabled to ensure the footer didn't interfere in the map. 3. classifier-scripts' httpRequest() renamed to httpBrowseRequest() and map's getSubClassifier() renamed to httpMapBrowseRequest().
  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 47.5 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use FileUtils;
40use ghtml;
41use File::stat;
42##use hashdoc;
43use docprint;
44use JSON;
45
46# We just need pi from the Trig lib
47# Import constants pi2, pip2, pip4 (2*pi, pi/2, pi/4).
48use Math::Trig ':pi';
49
50# the document type may be indexed_doc, nonindexed_doc, or
51# classification
52
53our $OIDcount = 0;
54
55
56# Declare the Package (level) Variable that will tell us whether we're in the import or buildcol phase.
57# This var will be set by import.pl, buildcol.pl and export.pl to the then appropriate value.
58# https://perlmaven.com/package-variables-and-lexical-variables-in-perl
59# But: https://stackoverflow.com/questions/7542120/perl-how-to-make-variables-from-requiring-script-available-in-required-script
60# https://stackoverflow.com/questions/33230015/how-do-i-use-a-variable-in-another-perl-script
61#$doc::cmd_line_mode = undef;
62# Decided to follow the usage of $OIDcount above to declare the necessary package-level variables 'globally' accessible
63# actually accessible only for those packages that import this one (with 'use doc')
64our $cmd_line_mode = undef;
65
66# rename_method can be 'url', 'none', 'base64'
67sub new {
68 my $class = shift (@_);
69 my ($source_filename, $doc_type, $rename_method) = @_;
70
71
72 my $self = bless {'associated_files'=>[],
73 'subsection_order'=>[],
74 'next_subsection'=>1,
75 'subsections'=>{},
76 'metadata'=>[],
77 'text'=>"",
78 'OIDtype'=>"hash"}, $class;
79
80 # used to set lastmodified here, but this can screw up the HASH ids, so
81 # the docsave processor now calls set_lastmodified
82
83 $self->set_source_path($source_filename);
84
85 if (defined $source_filename) {
86 $source_filename = &util::filename_within_collection($source_filename);
87 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
88 $self->set_source_filename ($source_filename, $rename_method);
89 }
90
91 $self->set_doc_type ($doc_type) if defined $doc_type;
92
93 return $self;
94}
95
96
97sub set_source_path
98{
99 my $self = shift @_;
100 my ($source_filename) = @_;
101
102 if (defined $source_filename) {
103 # On Windows the source_filename can be in terse DOS format
104 # e.g. test~1.txt
105
106 $self->{'terse_source_path'} = $source_filename;
107
108 # Use the FileUtil library methods as they are aware of more special
109 # cases such as HDFS [jmt12]
110 if (&FileUtils::fileExists($source_filename))
111 {
112 # See if we can do better for Windows with a filename
113 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
114 require Win32;
115 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
116 }
117 else {
118 # For Unix-based systems, there is no difference between the two
119 $self->{'source_path'} = $source_filename;
120 }
121 }
122 else {
123 print STDERR "Warning: In doc::set_source_path(), file\n";
124 print STDERR " $source_filename\n";
125 print STDERR " does not exist\n";
126
127 # (default) Set it to whatever we were given
128 $self->{'source_path'} = $source_filename;
129 }
130 }
131 else {
132 # Previous code for setting source_path allowed for
133 # it to be undefined, so continue this practice
134 $self->{'terse_source_path'} = undef;
135 $self->{'source_path'} = undef;
136 }
137}
138
139
140sub get_source_path
141{
142 my $self = shift @_;
143
144 return $self->{'terse_source_path'};
145}
146
147# set lastmodified for OAI purposes, added by GRB, moved by kjdon
148sub set_oailastmodified {
149 my $self = shift (@_);
150
151 my $source_path = $self->{'terse_source_path'};
152
153 if (defined $source_path && (-e $source_path)) {
154 my $current_time = time;
155
156 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
157 $wday, $yday, $isdst) = localtime($current_time);
158
159 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
160
161 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
162 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
163 }
164}
165
166# no longer used for OAI purposes, since lastmodified is not what we want as the
167# Datestamp of a document. This doc metadata may be useful for general purposes.
168sub set_lastmodified {
169 my $self = shift (@_);
170
171 my $source_path = $self->{'terse_source_path'};
172
173 if (defined $source_path && (-e $source_path)) {
174
175 my $file_stat = stat($source_path);
176 my $mtime = $file_stat->mtime;
177 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
178 $wday, $yday, $isdst) = localtime($mtime);
179
180 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
181
182 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
183 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
184 }
185}
186
187# clone the $self object
188sub duplicate {
189 my $self = shift (@_);
190
191 my $newobj = {};
192
193 foreach my $k (keys %$self) {
194 $newobj->{$k} = &clone ($self->{$k});
195 }
196
197 bless $newobj, ref($self);
198 return $newobj;
199}
200
201sub clone {
202 my ($from) = @_;
203 my $type = ref ($from);
204
205 if ($type eq "HASH") {
206 my $to = {};
207 foreach my $key (keys %$from) {
208 $to->{$key} = &clone ($from->{$key});
209 }
210 return $to;
211 } elsif ($type eq "ARRAY") {
212 my $to = [];
213 foreach my $v (@$from) {
214 push (@$to, &clone ($v));
215 }
216 return $to;
217 } else {
218 return $from;
219 }
220}
221
222sub set_OIDtype {
223 my $self = shift (@_);
224 my ($type, $metadata) = @_;
225
226 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
227 $self->{'OIDtype'} = $type;
228 } else {
229 $self->{'OIDtype'} = "hash";
230 }
231
232 if ($type =~ /^assigned$/) {
233 if (defined $metadata) {
234 $self->{'OIDmetadata'} = $metadata;
235 } else {
236 $self->{'OIDmetadata'} = "dc.Identifier";
237 }
238 }
239}
240
241# rename_method can be 'url', 'none', 'base64'
242sub set_source_filename {
243 my $self = shift (@_);
244 my ($source_filename, $rename_method) = @_;
245
246 # Since the gsdlsourcefilename element goes into the doc.xml it has
247 # to be utf8. However, it should also *represent* the source filename
248 # (in the import directory) which may not be utf8 at all.
249 # For instance, if this meta element (gsdlsourcefilename) will be used
250 # by other applications that parse doc.xml in order to locate
251 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
252 # encode the real filename as this is a binary-to-text encoding meaning
253 # that the resulting string is ASCII (utf8). Decoding will give the original.
254
255# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
256
257 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
258 $source_filename = $self->encode_filename($source_filename, $rename_method);
259# my ($srcfilename,$dirname,$suffix)
260# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
261# print STDERR "-> $srcfilename -> ";
262# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
263# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
264# print STDERR "$source_filename\n";
265
266 $self->set_utf8_metadata_element ($self->get_top_section(),
267 "gsdlsourcefilename",
268 $source_filename);
269}
270
271sub encode_filename {
272 my $self = shift (@_);
273 my ($source_filename, $rename_method) = @_;
274
275 my ($srcfilename,$dirname,$suffix)
276 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
277# print STDERR "-> $srcfilename -> ";
278 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
279 $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
280
281 return $source_filename;
282}
283
284sub set_converted_filename {
285 my $self = shift (@_);
286 my ($converted_filename) = @_;
287
288 # we know the converted filename is utf8
289 $self->set_utf8_metadata_element ($self->get_top_section(),
290 "gsdlconvertedfilename",
291 $converted_filename);
292}
293
294# returns the source_filename as it was provided
295sub get_unmodified_source_filename {
296 my $self = shift (@_);
297
298 return $self->{'terse_source_path'};
299}
300
301# returns the source_filename with whatever rename_method was given
302sub get_source_filename {
303 my $self = shift (@_);
304
305 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
306}
307
308
309
310# returns converted filename if available else returns source filename
311sub get_filename_for_hashing {
312 my $self = shift (@_);
313
314 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
315
316 if (!defined $filename) {
317 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
318 # if NULPlug processed file, then don't give a filename
319 if (defined $plugin_name && $plugin_name eq "NULPlug") {
320 $filename = undef;
321 } else { # returns the URL encoded source filename!
322 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
323 }
324 }
325
326 if (!&FileUtils::isFilenameAbsolute($filename)) {
327 $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
328 }
329
330 return $filename;
331}
332
333sub set_doc_type {
334 my $self = shift (@_);
335 my ($doc_type) = @_;
336
337 $self->set_metadata_element ($self->get_top_section(),
338 "gsdldoctype",
339 $doc_type);
340}
341
342# returns the gsdldoctype as it was provided
343# the default of "indexed_doc" is used if no document
344# type was provided
345sub get_doc_type {
346 my $self = shift (@_);
347
348 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
349 return $doc_type if (defined $doc_type);
350 return "indexed_doc";
351}
352
353
354# look up the reference to the a particular section
355sub _lookup_section {
356 my $self = shift (@_);
357 my ($section) = @_;
358
359 my ($num);
360 my $sectionref = $self;
361
362 while (defined $section && $section ne "") {
363
364 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
365
366 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
367
368 $section = "" unless defined $section;
369
370
371 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
372 $sectionref = $sectionref->{'subsections'}->{$num};
373 } else {
374 return undef;
375 }
376 }
377
378 return $sectionref;
379}
380
381# calculate OID by hashing the contents of the document
382sub _calc_OID {
383 my $self = shift (@_);
384 my ($filename) = @_;
385
386
387 my $osexe = &util::get_os_exe();
388
389 my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
390 $ENV{'GSDLOS'},"hashfile$osexe");
391
392 &util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
393 # The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
394
395 # A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
396 # (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
397 # E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
398
399 my $result = "NULL";
400
401
402 if (-e "$hashfile_exe") {
403# $result = `\"$hashfile_exe\" \"$filename\"`;
404# $result = `hashfile$osexe \"$filename\" 2>&1`;
405 $result = `hashfile$osexe \"$filename\"`;
406
407 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
408 } else {
409 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
410 }
411 return "HASH$result";
412}
413
414# methods dealing with OID, not groups of them.
415
416# if $OID is not provided one is calculated
417sub set_OID {
418 my $self = shift (@_);
419 my ($OID) = @_;
420
421 my $use_hash_oid = 0;
422 # if an OID wasn't provided calculate one
423 if (!defined $OID) {
424 $OID = "NULL";
425 if ($self->{'OIDtype'} =~ /^hash/) {
426 $use_hash_oid = 1;
427 } elsif ($self->{'OIDtype'} eq "incremental") {
428 $OID = "D" . $OIDcount;
429 $OIDcount ++;
430 } elsif ($self->{'OIDtype'} eq "filename") {
431 my $filename = $self->get_source_filename();
432 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
433 $OID = &util::tidy_up_oid($OID);
434 } elsif ($self->{'OIDtype'} eq "full_filename") {
435 my $source_filename = $self->get_source_filename();
436 my $dirsep = &util::get_os_dirsep();
437
438 $source_filename =~ s/^import$dirsep//;
439 $source_filename =~ s/$dirsep/-/g;
440 $source_filename =~ s/\./_/g;
441
442 $OID = $source_filename;
443 $OID = &util::tidy_up_oid($OID);
444 } elsif ($self->{'OIDtype'} eq "dirname") {
445 my $filename = $self->get_source_filename();
446 if (defined($filename)) { # && -e $filename) {
447 # get the immediate parent directory
448 $OID = &File::Basename::dirname($filename);
449 if (defined $OID) {
450 $OID = &File::Basename::basename($OID);
451 $OID = &util::tidy_up_oid($OID);
452 } else {
453 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
454 $use_hash_oid = 1;
455 }
456 } else {
457 print STDERR "Failed to find a filename, generating hash id\n";
458 $use_hash_oid = 1;
459 }
460
461 } elsif ($self->{'OIDtype'} eq "assigned") {
462 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
463 if (defined $identifier && $identifier ne "") {
464 $OID = $identifier;
465 $OID = &util::tidy_up_oid($OID);
466 } else {
467 # need a hash id
468 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
469 $use_hash_oid = 1;
470 }
471
472 } else {
473 $use_hash_oid = 1;
474 }
475
476 if ($use_hash_oid) {
477 my $hash_on_file = 1;
478 my $hash_on_ga_xml = 0;
479
480 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
481 $hash_on_file = 0;
482 $hash_on_ga_xml = 1;
483 }
484
485 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
486 $hash_on_file = 0;
487 $hash_on_ga_xml = 0;
488
489 my $source_filename = $self->get_source_filename();
490 my $dirsep = &util::get_os_dirsep();
491
492 $source_filename =~ s/^import$dirsep//;
493 $source_filename =~ s/$dirsep/-/g;
494 $source_filename =~ s/\./_/g;
495
496 # If the filename is very short then (handled naively)
497 # this can cause conjestion in the hash-values
498 # computed, leading documents sharing the same leading
499 # Hex values in the computed has.
500 #
501 # The solution taken here is to replace the name of
502 # the file name a sufficient number of times (up to
503 # the character limit defined in 'rep_limit' and
504 # make that the content that is hashed on
505
506 # *** Think twice before changing the following value
507 # as it will break backward compatability of computed
508 # document HASH values
509
510 my $rep_limit = 256;
511 my $hash_content = undef;
512
513 if (length($source_filename)<$rep_limit) {
514 my $rep_string = "$source_filename|";
515 my $rs_len = length($rep_string);
516
517 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
518
519 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
520 }
521 else {
522 $hash_content = $source_filename;
523 }
524
525 my $filename = &util::get_tmp_filename();
526 if (!open (OUTFILE, ">:utf8", $filename)) {
527 print STDERR "doc::set_OID could not write to $filename\n";
528 } else {
529 print OUTFILE $hash_content;
530 close (OUTFILE);
531 }
532 $OID = $self->_calc_OID ($filename);
533
534 &FileUtils::removeFiles ($filename);
535 }
536
537 if ($hash_on_file) {
538 # "hash" OID - feed file to hashfile.exe
539 my $filename = $self->get_filename_for_hashing();
540
541 # -z: don't want to hash on the file if it is zero size
542 if (defined($filename) && -e $filename && !-z $filename) {
543 $OID = $self->_calc_OID ($filename);
544 } else {
545 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
546 }
547 }
548
549 if ($hash_on_ga_xml) {
550 # In addition being asked to explicity calculate the has based on the GA file,
551 # can also end up coming into this block is doing 'hash_on_file' but the file
552 # itself is of zero bytes (as could be the case with 'doc.nul' file
553
554 my $filename = &util::get_tmp_filename();
555 if (!open (OUTFILE, ">:utf8", $filename)) {
556 print STDERR "doc::set_OID could not write to $filename\n";
557 } else {
558 my $doc_text = &docprint::get_section_xml($self);
559 print OUTFILE $doc_text;
560 close (OUTFILE);
561 }
562 $OID = $self->_calc_OID ($filename);
563 &FileUtils::removeFiles($filename);
564 }
565 }
566 }
567 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
568}
569
570# this uses hashdoc (embedded c thingy) which is faster but still
571# needs a little work to be sufficiently stable
572sub ___set_OID {
573 my $self = shift (@_);
574 my ($OID) = @_;
575
576 # if an OID wasn't provided then calculate hash value based on document
577 if (!defined $OID)
578 {
579 my $hash_text = &docprint::get_section_xml($self);
580 my $hash_len = length($hash_text);
581
582 $OID = &hashdoc::buffer($hash_text,$hash_len);
583 }
584
585 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
586}
587
588# returns the OID for this document
589sub get_OID {
590 my $self = shift (@_);
591 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
592 return $OID if (defined $OID);
593 return "NULL";
594}
595
596sub delete_OID {
597 my $self = shift (@_);
598
599 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
600}
601
602
603# methods for manipulating section names
604
605# returns the name of the top-most section (the top
606# level of the document
607sub get_top_section {
608 my $self = shift (@_);
609
610 return "";
611}
612
613# returns a section
614sub get_parent_section {
615 my $self = shift (@_);
616 my ($section) = @_;
617
618 $section =~ s/(^|\.)\d+$//;
619
620 return $section;
621}
622
623# returns the first child section (or the end child
624# if there isn't any)
625sub get_begin_child {
626 my $self = shift (@_);
627 my ($section) = @_;
628
629 my $section_ptr = $self->_lookup_section($section);
630 return "" unless defined $section_ptr;
631
632 if (defined $section_ptr->{'subsection_order'}->[0]) {
633 return "$section.$section_ptr->{'subsection_order'}->[0]";
634 }
635
636 return $self->get_end_child ($section);
637}
638
639# returns the next child of a parent section
640sub get_next_child {
641 my $self = shift (@_);
642 my ($section) = @_;
643
644 my $parent_section = $self->get_parent_section($section);
645 my $parent_section_ptr = $self->_lookup_section($parent_section);
646 return undef unless defined $parent_section_ptr;
647
648 my ($section_num) = $section =~ /(\d+)$/;
649 return undef unless defined $section_num;
650
651 my $i = 0;
652 my $section_order = $parent_section_ptr->{'subsection_order'};
653 while ($i < scalar(@$section_order)) {
654 last if $section_order->[$i] eq $section_num;
655 $i++;
656 }
657
658 $i++; # the next child
659 if ($i < scalar(@$section_order)) {
660 return $section_order->[$i] if $parent_section eq "";
661 return "$parent_section.$section_order->[$i]";
662 }
663
664 # no more sections in this level
665 return undef;
666}
667
668# returns a reference to a list of children
669sub get_children {
670 my $self = shift (@_);
671 my ($section) = @_;
672
673 my $section_ptr = $self->_lookup_section($section);
674 return [] unless defined $section_ptr;
675
676 my @children = @{$section_ptr->{'subsection_order'}};
677
678 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
679 return \@children;
680}
681
682# returns the child section one past the last one (which
683# is coded as "0")
684sub get_end_child {
685 my $self = shift (@_);
686 my ($section) = @_;
687
688 return $section . ".0" unless $section eq "";
689 return "0";
690}
691
692# returns the next section in book order
693sub get_next_section {
694 my $self = shift (@_);
695 my ($section) = @_;
696
697 return undef unless defined $section;
698
699 my $section_ptr = $self->_lookup_section($section);
700 return undef unless defined $section_ptr;
701
702 # first try to find first child
703 if (defined $section_ptr->{'subsection_order'}->[0]) {
704 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
705 return "$section.$section_ptr->{'subsection_order'}->[0]";
706 }
707
708 do {
709 # try to find sibling
710 my $next_child = $self->get_next_child ($section);
711 return $next_child if (defined $next_child);
712
713 # move up one level
714 $section = $self->get_parent_section ($section);
715 } while $section =~ /\d/;
716
717 return undef;
718}
719
720sub is_leaf_section {
721 my $self = shift (@_);
722 my ($section) = @_;
723
724 my $section_ptr = $self->_lookup_section($section);
725 return 1 unless defined $section_ptr;
726
727 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
728}
729
730# methods for dealing with sections
731
732# returns the name of the inserted section
733sub insert_section {
734 my $self = shift (@_);
735 my ($before_section) = @_;
736
737 # get the child to insert before and its parent section
738 my $parent_section = "";
739 my $before_child = "0";
740 my @before_section = split (/\./, $before_section);
741 if (scalar(@before_section) > 0) {
742 $before_child = pop (@before_section);
743 $parent_section = join (".", @before_section);
744 }
745
746 my $parent_section_ptr = $self->_lookup_section($parent_section);
747 if (!defined $parent_section_ptr) {
748 print STDERR "doc::insert_section couldn't find parent section " .
749 "$parent_section\n";
750 return;
751 }
752
753 # get the next section number
754 my $section_num = $parent_section_ptr->{'next_subsection'}++;
755
756 my $i = 0;
757 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
758 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
759 $i++;
760 }
761
762 # insert the section number into the order list
763 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
764
765 # add this section to the parent section
766 my $section_ptr = {'subsection_order'=>[],
767 'next_subsection'=>1,
768 'subsections'=>{},
769 'metadata'=>[],
770 'text'=>""};
771 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
772
773 # work out the full section number
774 my $section = $parent_section;
775 $section .= "." unless $section eq "";
776 $section .= $section_num;
777
778 return $section;
779}
780
781# creates a pre-named section
782sub create_named_section {
783 my $self = shift (@_);
784 my ($mastersection) = @_;
785
786 my ($num);
787 my $section = $mastersection;
788 my $sectionref = $self;
789
790 while ($section ne "") {
791 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
792 $num =~ s/^0+(\d)/$1/; # remove leading 0s
793 $section = "" unless defined $section;
794
795 if (defined $num) {
796 if (!defined $sectionref->{'subsections'}->{$num}) {
797 push (@{$sectionref->{'subsection_order'}}, $num);
798 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
799 'next_subsection'=>1,
800 'subsections'=>{},
801 'metadata'=>[],
802 'text'=>""};
803 if ($num >= $sectionref->{'next_subsection'}) {
804 $sectionref->{'next_subsection'} = $num + 1;
805 }
806 }
807 $sectionref = $sectionref->{'subsections'}->{$num};
808
809 } else {
810 print STDERR "doc::create_named_section couldn't create section ";
811 print STDERR "$mastersection\n";
812 last;
813 }
814 }
815}
816
817# returns a reference to a list of subsections
818sub list_subsections {
819 my $self = shift (@_);
820 my ($section) = @_;
821
822 my $section_ptr = $self->_lookup_section ($section);
823 if (!defined $section_ptr) {
824 print STDERR "doc::list_subsections couldn't find section $section\n";
825 return [];
826 }
827
828 return [@{$section_ptr->{'subsection_order'}}];
829}
830
831sub delete_section {
832 my $self = shift (@_);
833 my ($section) = @_;
834
835# my $section_ptr = {'subsection_order'=>[],
836# 'next_subsection'=>1,
837# 'subsections'=>{},
838# 'metadata'=>[],
839# 'text'=>""};
840
841 # if this is the top section reset everything
842 if ($section eq "") {
843 $self->{'subsection_order'} = [];
844 $self->{'subsections'} = {};
845 $self->{'metadata'} = [];
846 $self->{'text'} = "";
847 return;
848 }
849
850 # find the parent of the section to delete
851 my $parent_section = "";
852 my $child = "0";
853 my @section = split (/\./, $section);
854 if (scalar(@section) > 0) {
855 $child = pop (@section);
856 $parent_section = join (".", @section);
857 }
858
859 my $parent_section_ptr = $self->_lookup_section($parent_section);
860 if (!defined $parent_section_ptr) {
861 print STDERR "doc::delete_section couldn't find parent section " .
862 "$parent_section\n";
863 return;
864 }
865
866 # remove this section from the subsection_order list
867 my $i = 0;
868 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
869 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
870 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
871 last;
872 }
873 $i++;
874 }
875
876 # remove this section from the subsection hash
877 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
878 undef $parent_section_ptr->{'subsections'}->{$child};
879 }
880}
881
882#--
883# methods for dealing with metadata
884
885# set_metadata_element and get_metadata_element are for metadata
886# which should only have one value. add_meta_data and get_metadata
887# are for metadata which can have more than one value.
888
889# returns the first metadata value which matches field
890
891# This version of get metadata element works much like the one above,
892# except it allows for the namespace portion of a metadata element to
893# be ignored, thus if you are searching for dc.Title, the first piece
894# of matching metadata ending with the name Title (once any namespace
895# is removed) would be returned.
896# 28-11-2003 John Thompson
897sub get_metadata_element {
898 my $self = shift (@_);
899 my ($section, $field, $ignore_namespace) = @_;
900 my ($data);
901
902 $ignore_namespace = 0 unless defined $ignore_namespace;
903
904 my $section_ptr = $self->_lookup_section($section);
905 if (!defined $section_ptr) {
906 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
907 return;
908 }
909
910 # Remove any namespace if we are being told to ignore them
911 if($ignore_namespace) {
912 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
913 }
914
915 foreach $data (@{$section_ptr->{'metadata'}}) {
916
917 my $data_name = $data->[0];
918
919 # Remove any namespace if we are being told to ignore them
920 if($ignore_namespace) {
921 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
922 }
923 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
924 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
925
926 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
927 }
928
929 return undef; # was not found
930}
931
932# returns a list of the form [value1, value2, ...]
933sub get_metadata {
934 my $self = shift (@_);
935 my ($section, $field, $ignore_namespace) = @_;
936 my ($data);
937
938 $ignore_namespace = 0 unless defined $ignore_namespace;
939
940 my $section_ptr = $self->_lookup_section($section);
941 if (!defined $section_ptr) {
942 print STDERR "doc::get_metadata couldn't find section ",
943 $section, "\n";
944 return;
945 }
946
947 # Remove any namespace if we are being told to ignore them
948 if($ignore_namespace) {
949 $field =~ s/^.*\.//;
950 }
951
952 my @metadata = ();
953 foreach $data (@{$section_ptr->{'metadata'}}) {
954
955 my $data_name = $data->[0];
956
957 # Remove any namespace if we are being told to ignore them
958 if($ignore_namespace) {
959 $data_name =~ s/^.*\.//;
960 }
961 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
962 $data_name =~ s/^ex\.([^.]+)$/$1/;
963
964 push (@metadata, $data->[1]) if ($data_name eq $field);
965 }
966
967 return \@metadata;
968}
969
970sub get_metadata_hashmap {
971 my $self = shift (@_);
972 my ($section, $opt_namespace) = @_;
973
974 my $section_ptr = $self->_lookup_section($section);
975 if (!defined $section_ptr) {
976 print STDERR "doc::get_metadata couldn't find section ",
977 $section, "\n";
978 return;
979 }
980
981 my $metadata_hashmap = {};
982 foreach my $data (@{$section_ptr->{'metadata'}}) {
983 my $metaname = $data->[0];
984
985 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
986 if (!defined $metadata_hashmap->{$metaname}) {
987 $metadata_hashmap->{$metaname} = [];
988 }
989 my $metaval_list = $metadata_hashmap->{$metaname};
990 push(@$metaval_list, $data->[1]);
991 }
992 }
993
994 return $metadata_hashmap;
995}
996
997# returns a list of the form [[field,value],[field,value],...]
998sub get_all_metadata {
999 my $self = shift (@_);
1000 my ($section) = @_;
1001
1002 my $section_ptr = $self->_lookup_section($section);
1003 if (!defined $section_ptr) {
1004 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
1005 return;
1006 }
1007
1008 return $section_ptr->{'metadata'};
1009}
1010
1011# $value is optional
1012sub delete_metadata {
1013 my $self = shift (@_);
1014 my ($section, $field, $value) = @_;
1015
1016 my $section_ptr = $self->_lookup_section($section);
1017 if (!defined $section_ptr) {
1018 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
1019 return;
1020 }
1021
1022 my $i = 0;
1023 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1024 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1025 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1026 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1027 } else {
1028 $i++;
1029 }
1030 }
1031}
1032
1033sub delete_all_metadata {
1034 my $self = shift (@_);
1035 my ($section) = @_;
1036
1037 my $section_ptr = $self->_lookup_section($section);
1038 if (!defined $section_ptr) {
1039 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1040 return;
1041 }
1042
1043 $section_ptr->{'metadata'} = [];
1044}
1045
1046sub set_metadata_element {
1047 my $self = shift (@_);
1048 my ($section, $field, $value) = @_;
1049
1050 $self->set_utf8_metadata_element ($section, $field,
1051 &unicode::ascii2utf8(\$value));
1052}
1053
1054# set_utf8_metadata_element assumes the text has already been
1055# converted to the UTF-8 encoding.
1056sub set_utf8_metadata_element {
1057 my $self = shift (@_);
1058 my ($section, $field, $value) = @_;
1059
1060 $self->delete_metadata ($section, $field);
1061 $self->add_utf8_metadata ($section, $field, $value);
1062}
1063
1064
1065# add_metadata assumes the text is in (extended) ascii form. For
1066# text which has already been converted to the UTF-8 format use
1067# add_utf8_metadata.
1068sub add_metadata {
1069 my $self = shift (@_);
1070 my ($section, $field, $value) = @_;
1071
1072 $self->add_utf8_metadata ($section, $field,
1073 &unicode::ascii2utf8(\$value));
1074}
1075
1076sub add_utf8_metadata {
1077 my $self = shift (@_);
1078 my ($section, $field, $value) = @_;
1079
1080 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1081 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1082 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1083
1084 my $section_ptr = $self->_lookup_section($section);
1085 if (!defined $section_ptr) {
1086 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1087 return;
1088 }
1089 if (!defined $value) {
1090 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1091 return;
1092 }
1093 if (!defined $field) {
1094 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1095 return;
1096 }
1097
1098 #print STDERR "###$field=$value\n";
1099
1100 # For now, supress this check. Given that text data read in is now
1101 # Unicode aware, then the following block of code can (ironically enough)
1102 # cause our unicode compliant string to be re-encoded (leading to
1103 # a double-encoded UTF-8 string, which we definitely don't want!).
1104
1105
1106 # double check that the value is utf-8
1107 # if (!&unicode::check_is_utf8($value)) {
1108 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1109 # &unicode::ensure_utf8(\$value);
1110 # print STDERR " Tried converting to utf8: $value\n";
1111 # }
1112
1113 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1114 if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1115 {
1116 my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1117 if (defined $mdprefix) {
1118 # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1119 push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1120 }
1121
1122 my $direction;
1123 if($value =~ m/^-/)
1124 {
1125 $direction = ($field eq "Latitude") ? "S" : "W";
1126 }
1127 else
1128 {
1129 $direction = ($field eq "Latitude") ? "N" : "E";
1130 }
1131
1132 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1133 if(defined $beforeDec && defined $afterDec)
1134 {
1135 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1136 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1137
1138 for(my $i = 2; $i <= 4; $i++)
1139 {
1140 if(length($afterDec) >= $i)
1141 {
1142 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1143 }
1144 }
1145
1146 #Only add the metadata if it has not already been added
1147 my $metaMap = $self->get_metadata_hashmap($section); # metaMap not used and called function has no apparent side-effects,
1148 # but this line appears important for ensuring uniqueness of (Latitude, value) meta
1149 # in the section. Also for LatShort, Longitude, LngShort.
1150 }
1151
1152 if($field =~ m/^(.+\.)?Longitude$/) {
1153 # if we are dealing with Longitude meta, we should 1. have Latitude meta too; 2. already have processed Latitude meta
1154 # in that case, add both Lat and Lng of this section as a Coordinate meta
1155 my $latitude = $self->get_metadata_element ($section, "Latitude");
1156 my $longitude = $value;
1157 # TODO: would like all Longitude meta placed together, followed by all Coordinate meta,
1158 # but the following will add all such coord meta to the doc object and the end of this function will add Longitude meta
1159 $self->processCoordinate($section, $latitude, $longitude);
1160 }
1161 }
1162
1163 elsif($field eq "GPS.mapOverlay") { # then the value is a JSON string
1164
1165 if($cmd_line_mode eq "buildcol") {
1166 #my $metaMap = $self->get_metadata_hashmap($section); ## TODO: Check if necessary to avoid duplication of <Coordinate> meta in index\text\<coll>.jdb
1167
1168 #if(!$metaMap->{'Coordinate'}) {
1169 #print STDERR "@@@@@@@@@@@@@@ cmd line mode (build phase) is now: $doc::cmd_line_mode\n";
1170
1171
1172 print STDERR "GPS.mapOverlay before decoding, val = " . $value . "\n";
1173
1174 # TODO html decode?
1175 $value =~ s@&#091;@[@g;
1176 $value =~ s@&#093;@]@g;
1177 $value =~ s@&quot;@"@g;
1178 print STDERR "GPS.mapOverlay after decoding, val = " . $value . "\n";
1179
1180 my $json_array = decode_json $value;
1181 #my $json = JSON->new->allow_nonref;
1182 #&printAllShapes($json, $json_array);
1183
1184 foreach my $shape (@$json_array) {
1185
1186 my $type = $shape->{"type"};
1187 print STDERR "Type : " . $type . "\n";
1188
1189 if($type eq "circle") {
1190 #print STDERR "Found a circle:\n" . &printShape($json, $shape);
1191
1192 # work out bounding box
1193 # want the inverse of this useful page:
1194 # https://stackoverflow.com/questions/639695/how-to-convert-latitude-or-longitude-to-meters
1195 # https://www.geodatasource.com/developers/javascript
1196
1197 # Dr Bainbridge wants us to use: https://gis.stackexchange.com/questions/5821/calculating-latitude-longitude-x-miles-from-point
1198 # Since for N,E,S,W the following works out the same as above link, we're using the rule of thumb in the following:
1199 # https://gis.stackexchange.com/questions/2951/algorithm-for-offsetting-a-latitude-longitude-by-some-amount-of-meters
1200 # which states
1201 # "If your displacements aren't too great (less than a few kilometers) and you're not right at the poles,
1202 # use the quick and dirty estimate that 111,111 meters (111.111 km) in the y direction is 1 degree
1203 # (of latitude) and 111,111 * cos(latitude) meters in the x direction is 1 degree (of longitude)."
1204 my $centre_lat = $shape->{"center"}->{"lat"};
1205 my $centre_lng = $shape->{"center"}->{"lng"};
1206 my $radius = $shape->{"radius"}; # in metres!
1207
1208 print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
1209
1210 my $lat_north = $centre_lat + ($radius/111111);
1211 my $lat_south = $centre_lat - ($radius/111111);
1212
1213 print STDERR "### lat_north: $lat_north\n";
1214 print STDERR "### lat_south: $lat_south\n";
1215
1216 # our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
1217 # all expect their angle to be in radians. So need to convert from degree to radians before we can take the cos of it.
1218 my $centre_lat_radians = $self->degreesToRadians($centre_lat);
1219 my $cos_in_radians = cos($centre_lat_radians);
1220 print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
1221 my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
1222 my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
1223 print STDERR "### lng_east $lng_east\n";
1224 print STDERR "### lng_west $lng_west\n";
1225
1226 my $cos_lat = cos($centre_lat);
1227 print STDERR "cos $centre_lat is $cos_lat\n";
1228
1229 $self->processCoordinate($section, $lat_north, $lng_east);
1230 $self->processCoordinate($section, $lat_south, $lng_east);
1231 $self->processCoordinate($section, $lat_south, $lng_west);
1232 $self->processCoordinate($section, $lat_north, $lng_west);
1233
1234 }
1235 elsif ($type eq "marker") {
1236 print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
1237 print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
1238 $self->processCoordinate($section, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});
1239 }
1240 elsif ($type eq "polyline" || $type eq "polygon") {
1241 my $path_array = $shape->{"path"};
1242 foreach my $position (@$path_array) {
1243 $self->processCoordinate($section, $position->{"lat"}, $position->{"lng"});
1244 }
1245 }
1246 elsif ($type eq "rectangle") {
1247
1248 my $bounds = $shape->{"bounds"};
1249 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"east"});
1250 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"east"});
1251 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"west"});
1252 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"west"});
1253 }
1254
1255 } # end for on each shape in GPS.mapOverlay
1256 #}
1257 }
1258 } # end GPS.mapOverlay meta
1259
1260 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1261}
1262
1263# https://en.wikipedia.org/wiki/Radian
1264sub degreesToRadians
1265{
1266 my $self = shift (@_);
1267 my ($degrees) = @_;
1268
1269 return $degrees * pi /180; # returns radians
1270}
1271
1272sub radiansToDegrees
1273{
1274 my $self = shift (@_);
1275 my ($radians) = @_;
1276
1277 return $radians * 180 / pi; # returns degrees
1278}
1279
1280sub printAllShapes {
1281 my ($json, $json_array) = @_;
1282
1283
1284 #my $pretty_print_shape = $json->pretty->encode( $json_array->[0] );
1285 foreach my $shape (@$json_array) {
1286 my $pretty_print_shape = $json->pretty->encode( $shape );
1287 print STDERR "Shape: $pretty_print_shape\n";
1288 }
1289
1290}
1291
1292# For the (lat, lng) coordinate given,
1293# attaches Coordinate and multiple CoordShort (different precision level) metadata to the doc object
1294sub processCoordinate {
1295 my $self = shift (@_);
1296 my ($section, $latitude, $longitude) = @_;
1297
1298 my $section_ptr = $self->_lookup_section($section);
1299
1300 my $lat_direction = ($latitude =~ m/^-/) ? "S" : "N";
1301 my $lng_direction = ($longitude =~ m/^-/) ? "W" : "E";
1302
1303 # have to store (lat, lng) in pairs, when there are so many coords to store
1304 #push (@{$section_ptr->{'metadata'}}, ["Latitude", $latitude]);
1305 #push (@{$section_ptr->{'metadata'}}, ["Longitude", $longitude]);
1306
1307 push (@{$section_ptr->{'metadata'}}, ["Coordinate", "$latitude $longitude"]); # "$latitude$lat_direction $longitude$lng_direction"
1308
1309 my ($latBeforeDec, $latAfterDec);
1310 my ($lngBeforeDec, $lngAfterDec);
1311
1312 if($latitude !~ m/\./) {
1313 $latBeforeDec = $latitude;
1314 $latAfterDec = "";
1315 } else {
1316 ($latBeforeDec, $latAfterDec) = ($latitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1317 }
1318 if($longitude !~ m/\./) {
1319 $lngBeforeDec = $longitude;
1320 $lngAfterDec = "";
1321 } else {
1322 ($lngBeforeDec, $lngAfterDec) = ($longitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1323 }
1324
1325
1326 my $name = "CoordShort";
1327 push (@{$section_ptr->{'metadata'}}, [$name, "$latBeforeDec$lat_direction $lngBeforeDec$lng_direction"]);
1328
1329 for(my $i = 2; $i <= 4; $i++)
1330 {
1331 my $latDecPlaces = (length($latAfterDec) >= $i) ? substr($latAfterDec, 0, $i) : "";
1332 my $lngDecPlaces = (length($lngAfterDec) >= $i) ? substr($lngAfterDec, 0, $i) : "";
1333
1334 push (@{$section_ptr->{'metadata'}}, [$name,
1335 $latBeforeDec . $lat_direction. $latDecPlaces . " " . $lngBeforeDec . $lng_direction. $lngDecPlaces]);
1336
1337 }
1338
1339 #Only add the metadata if it has not already been added
1340 #my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no apparent side-effects.)
1341}
1342
1343
1344# methods for dealing with text
1345
1346# returns the text for a section
1347sub get_text {
1348 my $self = shift (@_);
1349 my ($section) = @_;
1350
1351 my $section_ptr = $self->_lookup_section($section);
1352 if (!defined $section_ptr) {
1353 print STDERR "doc::get_text couldn't find section " .
1354 "$section\n";
1355 return "";
1356 }
1357
1358 return $section_ptr->{'text'};
1359}
1360
1361# returns the (utf-8 encoded) length of the text for a section
1362sub get_text_length {
1363 my $self = shift (@_);
1364 my ($section) = @_;
1365
1366 my $section_ptr = $self->_lookup_section($section);
1367 if (!defined $section_ptr) {
1368 print STDERR "doc::get_text_length couldn't find section " .
1369 "$section\n";
1370 return 0;
1371 }
1372
1373 return length ($section_ptr->{'text'});
1374}
1375
1376# returns the total length for all the sections
1377sub get_total_text_length {
1378 my $self = shift (@_);
1379
1380 my $section = $self->get_top_section();
1381 my $length = 0;
1382 while (defined $section) {
1383 $length += $self->get_text_length($section);
1384 $section = $self->get_next_section($section);
1385 }
1386 return $length;
1387}
1388
1389sub delete_text {
1390 my $self = shift (@_);
1391 my ($section) = @_;
1392
1393 my $section_ptr = $self->_lookup_section($section);
1394 if (!defined $section_ptr) {
1395 print STDERR "doc::delete_text couldn't find section " .
1396 "$section\n";
1397 return;
1398 }
1399
1400 $section_ptr->{'text'} = "";
1401}
1402
1403# add_text assumes the text is in (extended) ascii form. For
1404# text which has been already converted to the UTF-8 format
1405# use add_utf8_text.
1406sub add_text {
1407 my $self = shift (@_);
1408 my ($section, $text) = @_;
1409
1410 # convert the text to UTF-8 encoded unicode characters
1411 # and add the text
1412 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1413}
1414
1415
1416# add_utf8_text assumes the text to be added has already
1417# been converted to the UTF-8 encoding. For ascii text use
1418# add_text
1419# Pass by value version (internally calls pass by ref version
1420# to avoid code duplication)
1421sub add_utf8_text {
1422 my $self = shift (@_);
1423 my ($section, $text) = @_;
1424
1425 $self->add_utf8_textref($section, \$text);
1426}
1427
1428# Pass by reference version, used by GreenstoneSQLPlugin for fulltext
1429sub add_utf8_textref {
1430 my $self = shift (@_);
1431 my ($section, $text_ref) = @_;
1432
1433 my $section_ptr = $self->_lookup_section($section);
1434 if (!defined $section_ptr) {
1435 print STDERR "doc::add_utf8_textref couldn't find section " .
1436 "$section\n";
1437 return;
1438 }
1439
1440 $section_ptr->{'text'} .= $$text_ref;
1441}
1442
1443# returns the Source meta, which is the utf8 filename generated.
1444# Added a separate method here for convenience
1445sub get_source {
1446 my $self = shift (@_);
1447 return $self->get_metadata_element ($self->get_top_section(), "Source");
1448}
1449
1450# returns the SourceFile meta, which is the url reference to the URL-encoded
1451# version of Source (the utf8 filename). Added a separate method here for convenience
1452sub get_sourcefile {
1453 my $self = shift (@_);
1454 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1455}
1456
1457# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1458# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1459sub get_assocfile_from_sourcefile {
1460 my $self = shift (@_);
1461
1462 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1463 my $top_section = $self->get_top_section();
1464 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1465
1466 # get the actual filename as it exists on the filesystem which this url refers to
1467 $source_file = &unicode::url_to_filename($source_file);
1468 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1469 return $assocfilename;
1470}
1471
1472# methods for dealing with associated files
1473
1474# a file is associated with a document, NOT a section.
1475# if section is defined it is noted in the data structure
1476# only so that files associated from a particular section
1477# may be removed later (using delete_section_assoc_files)
1478sub associate_file {
1479 my $self = shift (@_);
1480 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1481 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1482
1483 # remove all associated files with the same name
1484 $self->delete_assoc_file ($assoc_filename);
1485
1486 # Too harsh a requirement
1487 # Definitely get HTML docs, for example, with some missing
1488 # support files
1489# if (!&util::fd_exists($real_filename)) {
1490# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1491# exit -1;
1492# }
1493
1494# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1495# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1496## my $utf8_filename = Encode::encode("utf8",$filename);
1497
1498 push (@{$self->{'associated_files'}},
1499 [$real_filename, $assoc_filename, $mime_type, $section]);
1500}
1501
1502# returns a list of associated files in the form
1503# [[real_filename, assoc_filename, mimetype], ...]
1504sub get_assoc_files {
1505 my $self = shift (@_);
1506
1507 return $self->{'associated_files'};
1508}
1509
1510# the following two methods used to keep track of original associated files
1511# for incremental building. eg a txt file used by an item file does not end
1512# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1513# file for incremental build
1514sub associate_source_file {
1515 my $self = shift (@_);
1516 my ($full_filename) = @_;
1517
1518 push (@{$self->{'source_assoc_files'}}, $full_filename);
1519
1520}
1521
1522sub get_source_assoc_files {
1523 my $self = shift (@_);
1524
1525 return $self->{'source_assoc_files'};
1526
1527
1528}
1529sub metadata_file {
1530 my $self = shift (@_);
1531 my ($real_filename, $filename) = @_;
1532
1533 push (@{$self->{'metadata_files'}},
1534 [$real_filename, $filename]);
1535}
1536
1537# used for writing out the archiveinf-doc info database, to list all the metadata files
1538sub get_meta_files {
1539 my $self = shift (@_);
1540
1541 return $self->{'metadata_files'};
1542}
1543
1544sub delete_section_assoc_files {
1545 my $self = shift (@_);
1546 my ($section) = @_;
1547
1548 my $i=0;
1549 while ($i < scalar (@{$self->{'associated_files'}})) {
1550 if (defined $self->{'associated_files'}->[$i]->[3] &&
1551 $self->{'associated_files'}->[$i]->[3] eq $section) {
1552 splice (@{$self->{'associated_files'}}, $i, 1);
1553 } else {
1554 $i++;
1555 }
1556 }
1557}
1558
1559sub delete_assoc_file {
1560 my $self = shift (@_);
1561 my ($assoc_filename) = @_;
1562
1563 my $i=0;
1564 while ($i < scalar (@{$self->{'associated_files'}})) {
1565 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1566 splice (@{$self->{'associated_files'}}, $i, 1);
1567 } else {
1568 $i++;
1569 }
1570 }
1571}
1572
1573sub reset_nextsection_ptr {
1574 my $self = shift (@_);
1575 my ($section) = @_;
1576
1577 my $section_ptr = $self->_lookup_section($section);
1578 $section_ptr->{'next_subsection'} = 1;
1579}
1580
15811;
Note: See TracBrowser for help on using the repository browser.