source: main/trunk/greenstone2/perllib/doc.pm@ 33128

Last change on this file since 33128 was 33128, checked in by wy59, 5 years ago

Improvements to Coordinate support AND bugfixes. BUT not all the fixes may be ideal, many marked with TODO. 1. Now we support an Array of coordinates. At present these are only displayed as Markers, but in future shapes should appear as shapes. 2. Bugfixes include: (a) expanding sections wasn't working when we had hierarchical docs with Coordinate data, because map-scripts 'overrode' the toggleSection function but no longer did any of the doc expanding behaviour that document_scripts.js used to do. This was not a problem with the ImagesGPS collection, simply because that did not have hierarchical/sectionalised documents. (b) Perl: A previous commit output duplicate Coordinates into the index. Now this doesn't happen. Fix works but may not be ideal. 3. Perl: (a) Reserved index names CD, CS for Coordinate and CoordShort. Note however that LAT and LNG were never added to reserve list for index names. (b) Now doc.pm::processCoord() takes a section parameter and works out the section-ptr from that.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 47.5 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use FileUtils;
40use ghtml;
41use File::stat;
42##use hashdoc;
43use docprint;
44use JSON;
45
46# We just need pi from the Trig lib
47# Import constants pi2, pip2, pip4 (2*pi, pi/2, pi/4).
48use Math::Trig ':pi';
49
50# the document type may be indexed_doc, nonindexed_doc, or
51# classification
52
53our $OIDcount = 0;
54
55# rename_method can be 'url', 'none', 'base64'
56sub new {
57 my $class = shift (@_);
58 my ($source_filename, $doc_type, $rename_method) = @_;
59
60
61 my $self = bless {'associated_files'=>[],
62 'subsection_order'=>[],
63 'next_subsection'=>1,
64 'subsections'=>{},
65 'metadata'=>[],
66 'text'=>"",
67 'OIDtype'=>"hash"}, $class;
68
69 # used to set lastmodified here, but this can screw up the HASH ids, so
70 # the docsave processor now calls set_lastmodified
71
72 $self->set_source_path($source_filename);
73
74 if (defined $source_filename) {
75 $source_filename = &util::filename_within_collection($source_filename);
76 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
77 $self->set_source_filename ($source_filename, $rename_method);
78 }
79
80 $self->set_doc_type ($doc_type) if defined $doc_type;
81
82 return $self;
83}
84
85
86sub set_source_path
87{
88 my $self = shift @_;
89 my ($source_filename) = @_;
90
91 if (defined $source_filename) {
92 # On Windows the source_filename can be in terse DOS format
93 # e.g. test~1.txt
94
95 $self->{'terse_source_path'} = $source_filename;
96
97 # Use the FileUtil library methods as they are aware of more special
98 # cases such as HDFS [jmt12]
99 if (&FileUtils::fileExists($source_filename))
100 {
101 # See if we can do better for Windows with a filename
102 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
103 require Win32;
104 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
105 }
106 else {
107 # For Unix-based systems, there is no difference between the two
108 $self->{'source_path'} = $source_filename;
109 }
110 }
111 else {
112 print STDERR "Warning: In doc::set_source_path(), file\n";
113 print STDERR " $source_filename\n";
114 print STDERR " does not exist\n";
115
116 # (default) Set it to whatever we were given
117 $self->{'source_path'} = $source_filename;
118 }
119 }
120 else {
121 # Previous code for setting source_path allowed for
122 # it to be undefined, so continue this practice
123 $self->{'terse_source_path'} = undef;
124 $self->{'source_path'} = undef;
125 }
126}
127
128
129sub get_source_path
130{
131 my $self = shift @_;
132
133 return $self->{'terse_source_path'};
134}
135
136# set lastmodified for OAI purposes, added by GRB, moved by kjdon
137sub set_oailastmodified {
138 my $self = shift (@_);
139
140 my $source_path = $self->{'terse_source_path'};
141
142 if (defined $source_path && (-e $source_path)) {
143 my $current_time = time;
144
145 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
146 $wday, $yday, $isdst) = localtime($current_time);
147
148 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
149
150 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
151 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
152 }
153}
154
155# no longer used for OAI purposes, since lastmodified is not what we want as the
156# Datestamp of a document. This doc metadata may be useful for general purposes.
157sub set_lastmodified {
158 my $self = shift (@_);
159
160 my $source_path = $self->{'terse_source_path'};
161
162 if (defined $source_path && (-e $source_path)) {
163
164 my $file_stat = stat($source_path);
165 my $mtime = $file_stat->mtime;
166 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
167 $wday, $yday, $isdst) = localtime($mtime);
168
169 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
170
171 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
172 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
173 }
174}
175
176# clone the $self object
177sub duplicate {
178 my $self = shift (@_);
179
180 my $newobj = {};
181
182 foreach my $k (keys %$self) {
183 $newobj->{$k} = &clone ($self->{$k});
184 }
185
186 bless $newobj, ref($self);
187 return $newobj;
188}
189
190sub clone {
191 my ($from) = @_;
192 my $type = ref ($from);
193
194 if ($type eq "HASH") {
195 my $to = {};
196 foreach my $key (keys %$from) {
197 $to->{$key} = &clone ($from->{$key});
198 }
199 return $to;
200 } elsif ($type eq "ARRAY") {
201 my $to = [];
202 foreach my $v (@$from) {
203 push (@$to, &clone ($v));
204 }
205 return $to;
206 } else {
207 return $from;
208 }
209}
210
211sub set_OIDtype {
212 my $self = shift (@_);
213 my ($type, $metadata) = @_;
214
215 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
216 $self->{'OIDtype'} = $type;
217 } else {
218 $self->{'OIDtype'} = "hash";
219 }
220
221 if ($type =~ /^assigned$/) {
222 if (defined $metadata) {
223 $self->{'OIDmetadata'} = $metadata;
224 } else {
225 $self->{'OIDmetadata'} = "dc.Identifier";
226 }
227 }
228}
229
230# rename_method can be 'url', 'none', 'base64'
231sub set_source_filename {
232 my $self = shift (@_);
233 my ($source_filename, $rename_method) = @_;
234
235 # Since the gsdlsourcefilename element goes into the doc.xml it has
236 # to be utf8. However, it should also *represent* the source filename
237 # (in the import directory) which may not be utf8 at all.
238 # For instance, if this meta element (gsdlsourcefilename) will be used
239 # by other applications that parse doc.xml in order to locate
240 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
241 # encode the real filename as this is a binary-to-text encoding meaning
242 # that the resulting string is ASCII (utf8). Decoding will give the original.
243
244# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
245
246 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
247 $source_filename = $self->encode_filename($source_filename, $rename_method);
248# my ($srcfilename,$dirname,$suffix)
249# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
250# print STDERR "-> $srcfilename -> ";
251# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
252# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
253# print STDERR "$source_filename\n";
254
255 $self->set_utf8_metadata_element ($self->get_top_section(),
256 "gsdlsourcefilename",
257 $source_filename);
258}
259
260sub encode_filename {
261 my $self = shift (@_);
262 my ($source_filename, $rename_method) = @_;
263
264 my ($srcfilename,$dirname,$suffix)
265 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
266# print STDERR "-> $srcfilename -> ";
267 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
268 $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
269
270 return $source_filename;
271}
272
273sub set_converted_filename {
274 my $self = shift (@_);
275 my ($converted_filename) = @_;
276
277 # we know the converted filename is utf8
278 $self->set_utf8_metadata_element ($self->get_top_section(),
279 "gsdlconvertedfilename",
280 $converted_filename);
281}
282
283# returns the source_filename as it was provided
284sub get_unmodified_source_filename {
285 my $self = shift (@_);
286
287 return $self->{'terse_source_path'};
288}
289
290# returns the source_filename with whatever rename_method was given
291sub get_source_filename {
292 my $self = shift (@_);
293
294 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
295}
296
297
298
299# returns converted filename if available else returns source filename
300sub get_filename_for_hashing {
301 my $self = shift (@_);
302
303 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
304
305 if (!defined $filename) {
306 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
307 # if NULPlug processed file, then don't give a filename
308 if (defined $plugin_name && $plugin_name eq "NULPlug") {
309 $filename = undef;
310 } else { # returns the URL encoded source filename!
311 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
312 }
313 }
314
315 if (!&FileUtils::isFilenameAbsolute($filename)) {
316 $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
317 }
318
319 return $filename;
320}
321
322sub set_doc_type {
323 my $self = shift (@_);
324 my ($doc_type) = @_;
325
326 $self->set_metadata_element ($self->get_top_section(),
327 "gsdldoctype",
328 $doc_type);
329}
330
331# returns the gsdldoctype as it was provided
332# the default of "indexed_doc" is used if no document
333# type was provided
334sub get_doc_type {
335 my $self = shift (@_);
336
337 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
338 return $doc_type if (defined $doc_type);
339 return "indexed_doc";
340}
341
342
343# look up the reference to the a particular section
344sub _lookup_section {
345 my $self = shift (@_);
346 my ($section) = @_;
347
348 my ($num);
349 my $sectionref = $self;
350
351 while (defined $section && $section ne "") {
352
353 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
354
355 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
356
357 $section = "" unless defined $section;
358
359
360 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
361 $sectionref = $sectionref->{'subsections'}->{$num};
362 } else {
363 return undef;
364 }
365 }
366
367 return $sectionref;
368}
369
370# calculate OID by hashing the contents of the document
371sub _calc_OID {
372 my $self = shift (@_);
373 my ($filename) = @_;
374
375
376 my $osexe = &util::get_os_exe();
377
378 my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
379 $ENV{'GSDLOS'},"hashfile$osexe");
380
381 &util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
382 # The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
383
384 # A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
385 # (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
386 # E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
387
388 my $result = "NULL";
389
390
391 if (-e "$hashfile_exe") {
392# $result = `\"$hashfile_exe\" \"$filename\"`;
393# $result = `hashfile$osexe \"$filename\" 2>&1`;
394 $result = `hashfile$osexe \"$filename\"`;
395
396 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
397 } else {
398 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
399 }
400 return "HASH$result";
401}
402
403# methods dealing with OID, not groups of them.
404
405# if $OID is not provided one is calculated
406sub set_OID {
407 my $self = shift (@_);
408 my ($OID) = @_;
409
410 my $use_hash_oid = 0;
411 # if an OID wasn't provided calculate one
412 if (!defined $OID) {
413 $OID = "NULL";
414 if ($self->{'OIDtype'} =~ /^hash/) {
415 $use_hash_oid = 1;
416 } elsif ($self->{'OIDtype'} eq "incremental") {
417 $OID = "D" . $OIDcount;
418 $OIDcount ++;
419 } elsif ($self->{'OIDtype'} eq "filename") {
420 my $filename = $self->get_source_filename();
421 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
422 $OID = &util::tidy_up_oid($OID);
423 } elsif ($self->{'OIDtype'} eq "full_filename") {
424 my $source_filename = $self->get_source_filename();
425 my $dirsep = &util::get_os_dirsep();
426
427 $source_filename =~ s/^import$dirsep//;
428 $source_filename =~ s/$dirsep/-/g;
429 $source_filename =~ s/\./_/g;
430
431 $OID = $source_filename;
432 $OID = &util::tidy_up_oid($OID);
433 } elsif ($self->{'OIDtype'} eq "dirname") {
434 my $filename = $self->get_source_filename();
435 if (defined($filename)) { # && -e $filename) {
436 # get the immediate parent directory
437 $OID = &File::Basename::dirname($filename);
438 if (defined $OID) {
439 $OID = &File::Basename::basename($OID);
440 $OID = &util::tidy_up_oid($OID);
441 } else {
442 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
443 $use_hash_oid = 1;
444 }
445 } else {
446 print STDERR "Failed to find a filename, generating hash id\n";
447 $use_hash_oid = 1;
448 }
449
450 } elsif ($self->{'OIDtype'} eq "assigned") {
451 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
452 if (defined $identifier && $identifier ne "") {
453 $OID = $identifier;
454 $OID = &util::tidy_up_oid($OID);
455 } else {
456 # need a hash id
457 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
458 $use_hash_oid = 1;
459 }
460
461 } else {
462 $use_hash_oid = 1;
463 }
464
465 if ($use_hash_oid) {
466 my $hash_on_file = 1;
467 my $hash_on_ga_xml = 0;
468
469 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
470 $hash_on_file = 0;
471 $hash_on_ga_xml = 1;
472 }
473
474 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
475 $hash_on_file = 0;
476 $hash_on_ga_xml = 0;
477
478 my $source_filename = $self->get_source_filename();
479 my $dirsep = &util::get_os_dirsep();
480
481 $source_filename =~ s/^import$dirsep//;
482 $source_filename =~ s/$dirsep/-/g;
483 $source_filename =~ s/\./_/g;
484
485 # If the filename is very short then (handled naively)
486 # this can cause conjestion in the hash-values
487 # computed, leading documents sharing the same leading
488 # Hex values in the computed has.
489 #
490 # The solution taken here is to replace the name of
491 # the file name a sufficient number of times (up to
492 # the character limit defined in 'rep_limit' and
493 # make that the content that is hashed on
494
495 # *** Think twice before changing the following value
496 # as it will break backward compatability of computed
497 # document HASH values
498
499 my $rep_limit = 256;
500 my $hash_content = undef;
501
502 if (length($source_filename)<$rep_limit) {
503 my $rep_string = "$source_filename|";
504 my $rs_len = length($rep_string);
505
506 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
507
508 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
509 }
510 else {
511 $hash_content = $source_filename;
512 }
513
514 my $filename = &util::get_tmp_filename();
515 if (!open (OUTFILE, ">:utf8", $filename)) {
516 print STDERR "doc::set_OID could not write to $filename\n";
517 } else {
518 print OUTFILE $hash_content;
519 close (OUTFILE);
520 }
521 $OID = $self->_calc_OID ($filename);
522
523 &FileUtils::removeFiles ($filename);
524 }
525
526 if ($hash_on_file) {
527 # "hash" OID - feed file to hashfile.exe
528 my $filename = $self->get_filename_for_hashing();
529
530 # -z: don't want to hash on the file if it is zero size
531 if (defined($filename) && -e $filename && !-z $filename) {
532 $OID = $self->_calc_OID ($filename);
533 } else {
534 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
535 }
536 }
537
538 if ($hash_on_ga_xml) {
539 # In addition being asked to explicity calculate the has based on the GA file,
540 # can also end up coming into this block is doing 'hash_on_file' but the file
541 # itself is of zero bytes (as could be the case with 'doc.nul' file
542
543 my $filename = &util::get_tmp_filename();
544 if (!open (OUTFILE, ">:utf8", $filename)) {
545 print STDERR "doc::set_OID could not write to $filename\n";
546 } else {
547 my $doc_text = &docprint::get_section_xml($self);
548 print OUTFILE $doc_text;
549 close (OUTFILE);
550 }
551 $OID = $self->_calc_OID ($filename);
552 &FileUtils::removeFiles($filename);
553 }
554 }
555 }
556 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
557}
558
559# this uses hashdoc (embedded c thingy) which is faster but still
560# needs a little work to be sufficiently stable
561sub ___set_OID {
562 my $self = shift (@_);
563 my ($OID) = @_;
564
565 # if an OID wasn't provided then calculate hash value based on document
566 if (!defined $OID)
567 {
568 my $hash_text = &docprint::get_section_xml($self);
569 my $hash_len = length($hash_text);
570
571 $OID = &hashdoc::buffer($hash_text,$hash_len);
572 }
573
574 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
575}
576
577# returns the OID for this document
578sub get_OID {
579 my $self = shift (@_);
580 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
581 return $OID if (defined $OID);
582 return "NULL";
583}
584
585sub delete_OID {
586 my $self = shift (@_);
587
588 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
589}
590
591
592# methods for manipulating section names
593
594# returns the name of the top-most section (the top
595# level of the document
596sub get_top_section {
597 my $self = shift (@_);
598
599 return "";
600}
601
602# returns a section
603sub get_parent_section {
604 my $self = shift (@_);
605 my ($section) = @_;
606
607 $section =~ s/(^|\.)\d+$//;
608
609 return $section;
610}
611
612# returns the first child section (or the end child
613# if there isn't any)
614sub get_begin_child {
615 my $self = shift (@_);
616 my ($section) = @_;
617
618 my $section_ptr = $self->_lookup_section($section);
619 return "" unless defined $section_ptr;
620
621 if (defined $section_ptr->{'subsection_order'}->[0]) {
622 return "$section.$section_ptr->{'subsection_order'}->[0]";
623 }
624
625 return $self->get_end_child ($section);
626}
627
628# returns the next child of a parent section
629sub get_next_child {
630 my $self = shift (@_);
631 my ($section) = @_;
632
633 my $parent_section = $self->get_parent_section($section);
634 my $parent_section_ptr = $self->_lookup_section($parent_section);
635 return undef unless defined $parent_section_ptr;
636
637 my ($section_num) = $section =~ /(\d+)$/;
638 return undef unless defined $section_num;
639
640 my $i = 0;
641 my $section_order = $parent_section_ptr->{'subsection_order'};
642 while ($i < scalar(@$section_order)) {
643 last if $section_order->[$i] eq $section_num;
644 $i++;
645 }
646
647 $i++; # the next child
648 if ($i < scalar(@$section_order)) {
649 return $section_order->[$i] if $parent_section eq "";
650 return "$parent_section.$section_order->[$i]";
651 }
652
653 # no more sections in this level
654 return undef;
655}
656
657# returns a reference to a list of children
658sub get_children {
659 my $self = shift (@_);
660 my ($section) = @_;
661
662 my $section_ptr = $self->_lookup_section($section);
663 return [] unless defined $section_ptr;
664
665 my @children = @{$section_ptr->{'subsection_order'}};
666
667 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
668 return \@children;
669}
670
671# returns the child section one past the last one (which
672# is coded as "0")
673sub get_end_child {
674 my $self = shift (@_);
675 my ($section) = @_;
676
677 return $section . ".0" unless $section eq "";
678 return "0";
679}
680
681# returns the next section in book order
682sub get_next_section {
683 my $self = shift (@_);
684 my ($section) = @_;
685
686 return undef unless defined $section;
687
688 my $section_ptr = $self->_lookup_section($section);
689 return undef unless defined $section_ptr;
690
691 # first try to find first child
692 if (defined $section_ptr->{'subsection_order'}->[0]) {
693 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
694 return "$section.$section_ptr->{'subsection_order'}->[0]";
695 }
696
697 do {
698 # try to find sibling
699 my $next_child = $self->get_next_child ($section);
700 return $next_child if (defined $next_child);
701
702 # move up one level
703 $section = $self->get_parent_section ($section);
704 } while $section =~ /\d/;
705
706 return undef;
707}
708
709sub is_leaf_section {
710 my $self = shift (@_);
711 my ($section) = @_;
712
713 my $section_ptr = $self->_lookup_section($section);
714 return 1 unless defined $section_ptr;
715
716 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
717}
718
719# methods for dealing with sections
720
721# returns the name of the inserted section
722sub insert_section {
723 my $self = shift (@_);
724 my ($before_section) = @_;
725
726 # get the child to insert before and its parent section
727 my $parent_section = "";
728 my $before_child = "0";
729 my @before_section = split (/\./, $before_section);
730 if (scalar(@before_section) > 0) {
731 $before_child = pop (@before_section);
732 $parent_section = join (".", @before_section);
733 }
734
735 my $parent_section_ptr = $self->_lookup_section($parent_section);
736 if (!defined $parent_section_ptr) {
737 print STDERR "doc::insert_section couldn't find parent section " .
738 "$parent_section\n";
739 return;
740 }
741
742 # get the next section number
743 my $section_num = $parent_section_ptr->{'next_subsection'}++;
744
745 my $i = 0;
746 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
747 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
748 $i++;
749 }
750
751 # insert the section number into the order list
752 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
753
754 # add this section to the parent section
755 my $section_ptr = {'subsection_order'=>[],
756 'next_subsection'=>1,
757 'subsections'=>{},
758 'metadata'=>[],
759 'text'=>""};
760 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
761
762 # work out the full section number
763 my $section = $parent_section;
764 $section .= "." unless $section eq "";
765 $section .= $section_num;
766
767 return $section;
768}
769
770# creates a pre-named section
771sub create_named_section {
772 my $self = shift (@_);
773 my ($mastersection) = @_;
774
775 my ($num);
776 my $section = $mastersection;
777 my $sectionref = $self;
778
779 while ($section ne "") {
780 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
781 $num =~ s/^0+(\d)/$1/; # remove leading 0s
782 $section = "" unless defined $section;
783
784 if (defined $num) {
785 if (!defined $sectionref->{'subsections'}->{$num}) {
786 push (@{$sectionref->{'subsection_order'}}, $num);
787 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
788 'next_subsection'=>1,
789 'subsections'=>{},
790 'metadata'=>[],
791 'text'=>""};
792 if ($num >= $sectionref->{'next_subsection'}) {
793 $sectionref->{'next_subsection'} = $num + 1;
794 }
795 }
796 $sectionref = $sectionref->{'subsections'}->{$num};
797
798 } else {
799 print STDERR "doc::create_named_section couldn't create section ";
800 print STDERR "$mastersection\n";
801 last;
802 }
803 }
804}
805
806# returns a reference to a list of subsections
807sub list_subsections {
808 my $self = shift (@_);
809 my ($section) = @_;
810
811 my $section_ptr = $self->_lookup_section ($section);
812 if (!defined $section_ptr) {
813 print STDERR "doc::list_subsections couldn't find section $section\n";
814 return [];
815 }
816
817 return [@{$section_ptr->{'subsection_order'}}];
818}
819
820sub delete_section {
821 my $self = shift (@_);
822 my ($section) = @_;
823
824# my $section_ptr = {'subsection_order'=>[],
825# 'next_subsection'=>1,
826# 'subsections'=>{},
827# 'metadata'=>[],
828# 'text'=>""};
829
830 # if this is the top section reset everything
831 if ($section eq "") {
832 $self->{'subsection_order'} = [];
833 $self->{'subsections'} = {};
834 $self->{'metadata'} = [];
835 $self->{'text'} = "";
836 return;
837 }
838
839 # find the parent of the section to delete
840 my $parent_section = "";
841 my $child = "0";
842 my @section = split (/\./, $section);
843 if (scalar(@section) > 0) {
844 $child = pop (@section);
845 $parent_section = join (".", @section);
846 }
847
848 my $parent_section_ptr = $self->_lookup_section($parent_section);
849 if (!defined $parent_section_ptr) {
850 print STDERR "doc::delete_section couldn't find parent section " .
851 "$parent_section\n";
852 return;
853 }
854
855 # remove this section from the subsection_order list
856 my $i = 0;
857 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
858 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
859 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
860 last;
861 }
862 $i++;
863 }
864
865 # remove this section from the subsection hash
866 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
867 undef $parent_section_ptr->{'subsections'}->{$child};
868 }
869}
870
871#--
872# methods for dealing with metadata
873
874# set_metadata_element and get_metadata_element are for metadata
875# which should only have one value. add_meta_data and get_metadata
876# are for metadata which can have more than one value.
877
878# returns the first metadata value which matches field
879
880# This version of get metadata element works much like the one above,
881# except it allows for the namespace portion of a metadata element to
882# be ignored, thus if you are searching for dc.Title, the first piece
883# of matching metadata ending with the name Title (once any namespace
884# is removed) would be returned.
885# 28-11-2003 John Thompson
886sub get_metadata_element {
887 my $self = shift (@_);
888 my ($section, $field, $ignore_namespace) = @_;
889 my ($data);
890
891 $ignore_namespace = 0 unless defined $ignore_namespace;
892
893 my $section_ptr = $self->_lookup_section($section);
894 if (!defined $section_ptr) {
895 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
896 return;
897 }
898
899 # Remove any namespace if we are being told to ignore them
900 if($ignore_namespace) {
901 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
902 }
903
904 foreach $data (@{$section_ptr->{'metadata'}}) {
905
906 my $data_name = $data->[0];
907
908 # Remove any namespace if we are being told to ignore them
909 if($ignore_namespace) {
910 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
911 }
912 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
913 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
914
915 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
916 }
917
918 return undef; # was not found
919}
920
921# returns a list of the form [value1, value2, ...]
922sub get_metadata {
923 my $self = shift (@_);
924 my ($section, $field, $ignore_namespace) = @_;
925 my ($data);
926
927 $ignore_namespace = 0 unless defined $ignore_namespace;
928
929 my $section_ptr = $self->_lookup_section($section);
930 if (!defined $section_ptr) {
931 print STDERR "doc::get_metadata couldn't find section ",
932 $section, "\n";
933 return;
934 }
935
936 # Remove any namespace if we are being told to ignore them
937 if($ignore_namespace) {
938 $field =~ s/^.*\.//;
939 }
940
941 my @metadata = ();
942 foreach $data (@{$section_ptr->{'metadata'}}) {
943
944 my $data_name = $data->[0];
945
946 # Remove any namespace if we are being told to ignore them
947 if($ignore_namespace) {
948 $data_name =~ s/^.*\.//;
949 }
950 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
951 $data_name =~ s/^ex\.([^.]+)$/$1/;
952
953 push (@metadata, $data->[1]) if ($data_name eq $field);
954 }
955
956 return \@metadata;
957}
958
959sub get_metadata_hashmap {
960 my $self = shift (@_);
961 my ($section, $opt_namespace) = @_;
962
963 my $section_ptr = $self->_lookup_section($section);
964 if (!defined $section_ptr) {
965 print STDERR "doc::get_metadata couldn't find section ",
966 $section, "\n";
967 return;
968 }
969
970 my $metadata_hashmap = {};
971 foreach my $data (@{$section_ptr->{'metadata'}}) {
972 my $metaname = $data->[0];
973
974 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
975 if (!defined $metadata_hashmap->{$metaname}) {
976 $metadata_hashmap->{$metaname} = [];
977 }
978 my $metaval_list = $metadata_hashmap->{$metaname};
979 push(@$metaval_list, $data->[1]);
980 }
981 }
982
983 return $metadata_hashmap;
984}
985
986# returns a list of the form [[field,value],[field,value],...]
987sub get_all_metadata {
988 my $self = shift (@_);
989 my ($section) = @_;
990
991 my $section_ptr = $self->_lookup_section($section);
992 if (!defined $section_ptr) {
993 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
994 return;
995 }
996
997 return $section_ptr->{'metadata'};
998}
999
1000# $value is optional
1001sub delete_metadata {
1002 my $self = shift (@_);
1003 my ($section, $field, $value) = @_;
1004
1005 my $section_ptr = $self->_lookup_section($section);
1006 if (!defined $section_ptr) {
1007 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
1008 return;
1009 }
1010
1011 my $i = 0;
1012 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1013 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1014 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1015 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1016 } else {
1017 $i++;
1018 }
1019 }
1020}
1021
1022sub delete_all_metadata {
1023 my $self = shift (@_);
1024 my ($section) = @_;
1025
1026 my $section_ptr = $self->_lookup_section($section);
1027 if (!defined $section_ptr) {
1028 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1029 return;
1030 }
1031
1032 $section_ptr->{'metadata'} = [];
1033}
1034
1035sub set_metadata_element {
1036 my $self = shift (@_);
1037 my ($section, $field, $value) = @_;
1038
1039 $self->set_utf8_metadata_element ($section, $field,
1040 &unicode::ascii2utf8(\$value));
1041}
1042
1043# set_utf8_metadata_element assumes the text has already been
1044# converted to the UTF-8 encoding.
1045sub set_utf8_metadata_element {
1046 my $self = shift (@_);
1047 my ($section, $field, $value) = @_;
1048
1049 $self->delete_metadata ($section, $field);
1050 $self->add_utf8_metadata ($section, $field, $value);
1051}
1052
1053
1054# add_metadata assumes the text is in (extended) ascii form. For
1055# text which has already been converted to the UTF-8 format use
1056# add_utf8_metadata.
1057sub add_metadata {
1058 my $self = shift (@_);
1059 my ($section, $field, $value) = @_;
1060
1061 $self->add_utf8_metadata ($section, $field,
1062 &unicode::ascii2utf8(\$value));
1063}
1064
1065sub add_utf8_metadata {
1066 my $self = shift (@_);
1067 my ($section, $field, $value) = @_;
1068
1069 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1070 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1071 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1072
1073 my $section_ptr = $self->_lookup_section($section);
1074 if (!defined $section_ptr) {
1075 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1076 return;
1077 }
1078 if (!defined $value) {
1079 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1080 return;
1081 }
1082 if (!defined $field) {
1083 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1084 return;
1085 }
1086
1087 #print STDERR "###$field=$value\n";
1088
1089 # For now, supress this check. Given that text data read in is now
1090 # Unicode aware, then the following block of code can (ironically enough)
1091 # cause our unicode compliant string to be re-encoded (leading to
1092 # a double-encoded UTF-8 string, which we definitely don't want!).
1093
1094
1095 # double check that the value is utf-8
1096 # if (!&unicode::check_is_utf8($value)) {
1097 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1098 # &unicode::ensure_utf8(\$value);
1099 # print STDERR " Tried converting to utf8: $value\n";
1100 # }
1101
1102 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1103 if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1104 {
1105 my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1106 if (defined $mdprefix) {
1107 # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1108 push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1109 }
1110
1111 my $direction;
1112 if($value =~ m/^-/)
1113 {
1114 $direction = ($field eq "Latitude") ? "S" : "W";
1115 }
1116 else
1117 {
1118 $direction = ($field eq "Latitude") ? "N" : "E";
1119 }
1120
1121 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1122 if(defined $beforeDec && defined $afterDec)
1123 {
1124 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1125 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1126
1127 for(my $i = 2; $i <= 4; $i++)
1128 {
1129 if(length($afterDec) >= $i)
1130 {
1131 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1132 }
1133 }
1134
1135 #Only add the metadata if it has not already been added
1136 my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no side-effects.)
1137 }
1138
1139 if($field =~ m/^(.+\.)?Longitude$/) {
1140 # if we are dealing with Longitude meta, we should 1. have Latitude meta too; 2. already have processed Latitude meta
1141 # in that case, add both Lat and Lng of this section as a Coordinate meta
1142 my $latitude = $self->get_metadata_element ($section, "Latitude");
1143 # TODO: would like all Longitude info together followed by all Coordinate info, but the following will add all coord info meta and end of this function will add Longitude meta
1144 $self->processCoordinate($section, $latitude, $value); # value is Longitude
1145 }
1146 }
1147
1148 elsif($field eq "GPS.mapOverlay") { # then the value is a JSON string
1149
1150 # TODO:
1151 # If we already have Coordinate meta for this section of the document (as can happen during buildcol.pl),
1152 # let's ASSUME this means we've already processed GPS.mapOverlay meta into Coordinate meta for this section (can have happened during import.pl)
1153 # to avoid adding duplicate Coordinates meta, which then end up duplicated in the index
1154 # Of course, the assumption is not always true! We could have an image with embedded Lat and Lng meta,
1155 # and the same image doc's section could have GPS.mapOverlay meta (from shapes) added via the doc editor.
1156 # This very function would then have converted Lat/Lng into Coordinate meta (just in the if stmt above) and added it to the section.
1157 # And then by the time we process this section's GPS.mapOverlay meta here, we would notice the section has Coordinate meta already,
1158 # and therefore skip converting the GPS.mapOverlay meta into Coordinate meta! What to dooooo?
1159 # So the return statement immediately below is a temporary solution, until we find a better one that will always work.
1160 my $metaMap = $self->get_metadata_hashmap($section);
1161 if($metaMap->{'Coordinate'}) {
1162 return;
1163 }
1164
1165 print STDERR "GPS.mapOverlay before val: " . $value . "\n";
1166
1167 # TODO html decode?
1168 $value =~ s@&#091;@[@g;
1169 $value =~ s@&#093;@]@g;
1170 $value =~ s@&quot;@"@g;
1171 print STDERR "GPS.mapOverlay after val: " . $value . "\n";
1172
1173 my $json_array = decode_json $value;
1174 #my $json = JSON->new->allow_nonref;
1175 #&printAllShapes($json, $json_array);
1176
1177 foreach my $shape (@$json_array) {
1178
1179 my $type = $shape->{"type"};
1180 print STDERR "Type : " . $type . "\n";
1181
1182 if($type eq "circle") {
1183 #print STDERR "Found a circle:\n" . &printShape($json, $shape);
1184
1185 # work out bounding box
1186 # SCARY!
1187 # want the inverse of this useful page:
1188 # https://stackoverflow.com/questions/639695/how-to-convert-latitude-or-longitude-to-meters
1189 # https://www.geodatasource.com/developers/javascript
1190
1191
1192 # for now, just process the circle centre
1193 #my $centre = $shape->{"center"};
1194 #$self->processLatOrLng($section_ptr, "Latitude", $centre->{"lat"});
1195 #$self->processLatOrLng($section_ptr, "Longitude", $centre->{"lng"});
1196
1197
1198 # Dr Bainbridge wants us to use: https://gis.stackexchange.com/questions/5821/calculating-latitude-longitude-x-miles-from-point
1199 # But we're using the rule of thumb here, since for N,E,S,W it works out the same:
1200 # https://gis.stackexchange.com/questions/2951/algorithm-for-offsetting-a-latitude-longitude-by-some-amount-of-meters
1201 # which states
1202 # "If your displacements aren't too great (less than a few kilometers) and you're not right at the poles,
1203 # use the quick and dirty estimate that 111,111 meters (111.111 km) in the y direction is 1 degree
1204 # (of latitude) and 111,111 * cos(latitude) meters in the x direction is 1 degree (of longitude)."
1205 my $centre_lat = $shape->{"center"}->{"lat"};
1206 my $centre_lng = $shape->{"center"}->{"lng"};
1207 my $radius = $shape->{"radius"}; # in metres!
1208
1209 print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
1210
1211 my $lat_north = $centre_lat + ($radius/111111);
1212 my $lat_south = $centre_lat - ($radius/111111);
1213
1214 print STDERR "### lat_north: $lat_north\n";
1215 print STDERR "### lat_south: $lat_south\n";
1216
1217 # our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
1218 # all expect their angle to be in radians. So need to convert from degree to radians before we can take the cose of it.
1219 my $centre_lat_radians = $self->degreesToRadians($centre_lat);
1220 my $cos_in_radians = cos($centre_lat_radians);
1221 print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
1222 my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
1223 my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
1224 print STDERR "### lng_east $lng_east\n";
1225 print STDERR "### lng_west $lng_west\n";
1226
1227
1228
1229 my $cos_lat = cos($centre_lat);
1230 print STDERR "cos $centre_lat is $cos_lat\n";
1231
1232 $self->processCoordinate($section, $lat_north, $lng_east);
1233 $self->processCoordinate($section, $lat_south, $lng_east);
1234 $self->processCoordinate($section, $lat_south, $lng_west);
1235 $self->processCoordinate($section, $lat_north, $lng_west);
1236
1237 }
1238 elsif ($type eq "marker") {
1239 print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
1240 print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
1241 $self->processCoordinate($section, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});
1242 }
1243 elsif ($type eq "polyline" || $type eq "polygon") {
1244 my $path_array = $shape->{"path"};
1245 foreach my $position (@$path_array) {
1246 $self->processCoordinate($section, $position->{"lat"}, $position->{"lng"});
1247 }
1248 }
1249 elsif ($type eq "rectangle") {
1250
1251 my $bounds = $shape->{"bounds"};
1252
1253 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"east"});
1254 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"east"});
1255 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"west"});
1256 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"west"});
1257 }
1258
1259 } # end for on each shape in GPS.mapOverlay
1260
1261 } # end GPS.mapOverlay meta
1262
1263 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1264}
1265
1266# https://en.wikipedia.org/wiki/Radian
1267sub degreesToRadians
1268{
1269 my $self = shift (@_);
1270 my ($degrees) = @_;
1271
1272 return $degrees * pi /180; # returns radians
1273}
1274
1275sub radiansToDegrees
1276{
1277 my $self = shift (@_);
1278 my ($radians) = @_;
1279
1280 return $radians * 180 / pi; # returns degrees
1281}
1282
1283sub printAllShapes {
1284 my ($json, $json_array) = @_;
1285
1286
1287 #my $pretty_print_shape = $json->pretty->encode( $json_array->[0] );
1288 foreach my $shape (@$json_array) {
1289 my $pretty_print_shape = $json->pretty->encode( $shape );
1290 print STDERR "Shape: $pretty_print_shape\n";
1291 #&printShape($shape);
1292 }
1293
1294}
1295
1296sub processCoordinate {
1297 my $self = shift (@_);
1298 my ($section, $latitude, $longitude) = @_;
1299
1300 my $section_ptr = $self->_lookup_section($section);
1301
1302 my $lat_direction = ($latitude =~ m/^-/) ? "S" : "N";
1303 my $lng_direction = ($longitude =~ m/^-/) ? "W" : "E";
1304
1305 # have to store (lat, lng) in pairs, when there are so many coords to store
1306 #push (@{$section_ptr->{'metadata'}}, ["Latitude", $latitude]);
1307 #push (@{$section_ptr->{'metadata'}}, ["Longitude", $longitude]);
1308
1309 push (@{$section_ptr->{'metadata'}}, ["Coordinate", "$latitude $longitude"]); # "$latitude$lat_direction $longitude$lng_direction"
1310
1311 my ($latBeforeDec, $latAfterDec);
1312 my ($lngBeforeDec, $lngAfterDec);
1313
1314 if($latitude !~ m/\./) {
1315 $latBeforeDec = $latitude;
1316 $latAfterDec = "";
1317 } else {
1318 ($latBeforeDec, $latAfterDec) = ($latitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1319 }
1320 if($longitude !~ m/\./) {
1321 $lngBeforeDec = $longitude;
1322 $lngAfterDec = "";
1323 } else {
1324 ($lngBeforeDec, $lngAfterDec) = ($longitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1325 }
1326
1327 #if(defined $beforeDec && defined $afterDec)
1328 #{
1329 my $name = "CoordShort";
1330 push (@{$section_ptr->{'metadata'}}, [$name, "$latBeforeDec$lat_direction $lngBeforeDec$lng_direction"]);
1331
1332 for(my $i = 2; $i <= 4; $i++)
1333 {
1334 my $latDecPlaces = (length($latAfterDec) >= $i) ? substr($latAfterDec, 0, $i) : "";
1335 my $lngDecPlaces = (length($lngAfterDec) >= $i) ? substr($lngAfterDec, 0, $i) : "";
1336
1337 push (@{$section_ptr->{'metadata'}}, [$name,
1338 $latBeforeDec . $lat_direction. $latDecPlaces . " " . $lngBeforeDec . $lng_direction. $lngDecPlaces]);
1339
1340 }
1341
1342 #Only add the metadata if it has not already been added
1343 #my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no side-effects.)
1344 #}
1345
1346
1347}
1348
1349
1350# methods for dealing with text
1351
1352# returns the text for a section
1353sub get_text {
1354 my $self = shift (@_);
1355 my ($section) = @_;
1356
1357 my $section_ptr = $self->_lookup_section($section);
1358 if (!defined $section_ptr) {
1359 print STDERR "doc::get_text couldn't find section " .
1360 "$section\n";
1361 return "";
1362 }
1363
1364 return $section_ptr->{'text'};
1365}
1366
1367# returns the (utf-8 encoded) length of the text for a section
1368sub get_text_length {
1369 my $self = shift (@_);
1370 my ($section) = @_;
1371
1372 my $section_ptr = $self->_lookup_section($section);
1373 if (!defined $section_ptr) {
1374 print STDERR "doc::get_text_length couldn't find section " .
1375 "$section\n";
1376 return 0;
1377 }
1378
1379 return length ($section_ptr->{'text'});
1380}
1381
1382# returns the total length for all the sections
1383sub get_total_text_length {
1384 my $self = shift (@_);
1385
1386 my $section = $self->get_top_section();
1387 my $length = 0;
1388 while (defined $section) {
1389 $length += $self->get_text_length($section);
1390 $section = $self->get_next_section($section);
1391 }
1392 return $length;
1393}
1394
1395sub delete_text {
1396 my $self = shift (@_);
1397 my ($section) = @_;
1398
1399 my $section_ptr = $self->_lookup_section($section);
1400 if (!defined $section_ptr) {
1401 print STDERR "doc::delete_text couldn't find section " .
1402 "$section\n";
1403 return;
1404 }
1405
1406 $section_ptr->{'text'} = "";
1407}
1408
1409# add_text assumes the text is in (extended) ascii form. For
1410# text which has been already converted to the UTF-8 format
1411# use add_utf8_text.
1412sub add_text {
1413 my $self = shift (@_);
1414 my ($section, $text) = @_;
1415
1416 # convert the text to UTF-8 encoded unicode characters
1417 # and add the text
1418 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1419}
1420
1421
1422# add_utf8_text assumes the text to be added has already
1423# been converted to the UTF-8 encoding. For ascii text use
1424# add_text
1425# Pass by value version (internally calls pass by ref version
1426# to avoid code duplication)
1427sub add_utf8_text {
1428 my $self = shift (@_);
1429 my ($section, $text) = @_;
1430
1431 $self->add_utf8_textref($section, \$text);
1432}
1433
1434# Pass by reference version, used by GreenstoneSQLPlugin for fulltext
1435sub add_utf8_textref {
1436 my $self = shift (@_);
1437 my ($section, $text_ref) = @_;
1438
1439 my $section_ptr = $self->_lookup_section($section);
1440 if (!defined $section_ptr) {
1441 print STDERR "doc::add_utf8_textref couldn't find section " .
1442 "$section\n";
1443 return;
1444 }
1445
1446 $section_ptr->{'text'} .= $$text_ref;
1447}
1448
1449# returns the Source meta, which is the utf8 filename generated.
1450# Added a separate method here for convenience
1451sub get_source {
1452 my $self = shift (@_);
1453 return $self->get_metadata_element ($self->get_top_section(), "Source");
1454}
1455
1456# returns the SourceFile meta, which is the url reference to the URL-encoded
1457# version of Source (the utf8 filename). Added a separate method here for convenience
1458sub get_sourcefile {
1459 my $self = shift (@_);
1460 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1461}
1462
1463# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1464# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1465sub get_assocfile_from_sourcefile {
1466 my $self = shift (@_);
1467
1468 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1469 my $top_section = $self->get_top_section();
1470 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1471
1472 # get the actual filename as it exists on the filesystem which this url refers to
1473 $source_file = &unicode::url_to_filename($source_file);
1474 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1475 return $assocfilename;
1476}
1477
1478# methods for dealing with associated files
1479
1480# a file is associated with a document, NOT a section.
1481# if section is defined it is noted in the data structure
1482# only so that files associated from a particular section
1483# may be removed later (using delete_section_assoc_files)
1484sub associate_file {
1485 my $self = shift (@_);
1486 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1487 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1488
1489 # remove all associated files with the same name
1490 $self->delete_assoc_file ($assoc_filename);
1491
1492 # Too harsh a requirement
1493 # Definitely get HTML docs, for example, with some missing
1494 # support files
1495# if (!&util::fd_exists($real_filename)) {
1496# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1497# exit -1;
1498# }
1499
1500# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1501# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1502## my $utf8_filename = Encode::encode("utf8",$filename);
1503
1504 push (@{$self->{'associated_files'}},
1505 [$real_filename, $assoc_filename, $mime_type, $section]);
1506}
1507
1508# returns a list of associated files in the form
1509# [[real_filename, assoc_filename, mimetype], ...]
1510sub get_assoc_files {
1511 my $self = shift (@_);
1512
1513 return $self->{'associated_files'};
1514}
1515
1516# the following two methods used to keep track of original associated files
1517# for incremental building. eg a txt file used by an item file does not end
1518# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1519# file for incremental build
1520sub associate_source_file {
1521 my $self = shift (@_);
1522 my ($full_filename) = @_;
1523
1524 push (@{$self->{'source_assoc_files'}}, $full_filename);
1525
1526}
1527
1528sub get_source_assoc_files {
1529 my $self = shift (@_);
1530
1531 return $self->{'source_assoc_files'};
1532
1533
1534}
1535sub metadata_file {
1536 my $self = shift (@_);
1537 my ($real_filename, $filename) = @_;
1538
1539 push (@{$self->{'metadata_files'}},
1540 [$real_filename, $filename]);
1541}
1542
1543# used for writing out the archiveinf-doc info database, to list all the metadata files
1544sub get_meta_files {
1545 my $self = shift (@_);
1546
1547 return $self->{'metadata_files'};
1548}
1549
1550sub delete_section_assoc_files {
1551 my $self = shift (@_);
1552 my ($section) = @_;
1553
1554 my $i=0;
1555 while ($i < scalar (@{$self->{'associated_files'}})) {
1556 if (defined $self->{'associated_files'}->[$i]->[3] &&
1557 $self->{'associated_files'}->[$i]->[3] eq $section) {
1558 splice (@{$self->{'associated_files'}}, $i, 1);
1559 } else {
1560 $i++;
1561 }
1562 }
1563}
1564
1565sub delete_assoc_file {
1566 my $self = shift (@_);
1567 my ($assoc_filename) = @_;
1568
1569 my $i=0;
1570 while ($i < scalar (@{$self->{'associated_files'}})) {
1571 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1572 splice (@{$self->{'associated_files'}}, $i, 1);
1573 } else {
1574 $i++;
1575 }
1576 }
1577}
1578
1579sub reset_nextsection_ptr {
1580 my $self = shift (@_);
1581 my ($section) = @_;
1582
1583 my $section_ptr = $self->_lookup_section($section);
1584 $section_ptr->{'next_subsection'} = 1;
1585}
1586
15871;
Note: See TracBrowser for help on using the repository browser.