source: main/trunk/greenstone2/perllib/doc.pm@ 33139

Last change on this file since 33139 was 33139, checked in by wy59, 5 years ago

Having 'fixed' the duplication issue of Coordinate data in the index, the side-effect of the fix was that GPS.mapOverlay meta never went into the index after that. Correction to the original 'fix' to get GPS.mapOverlay meta back into the index

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 47.5 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use FileUtils;
40use ghtml;
41use File::stat;
42##use hashdoc;
43use docprint;
44use JSON;
45
46# We just need pi from the Trig lib
47# Import constants pi2, pip2, pip4 (2*pi, pi/2, pi/4).
48use Math::Trig ':pi';
49
50# the document type may be indexed_doc, nonindexed_doc, or
51# classification
52
53our $OIDcount = 0;
54
55# rename_method can be 'url', 'none', 'base64'
56sub new {
57 my $class = shift (@_);
58 my ($source_filename, $doc_type, $rename_method) = @_;
59
60
61 my $self = bless {'associated_files'=>[],
62 'subsection_order'=>[],
63 'next_subsection'=>1,
64 'subsections'=>{},
65 'metadata'=>[],
66 'text'=>"",
67 'OIDtype'=>"hash"}, $class;
68
69 # used to set lastmodified here, but this can screw up the HASH ids, so
70 # the docsave processor now calls set_lastmodified
71
72 $self->set_source_path($source_filename);
73
74 if (defined $source_filename) {
75 $source_filename = &util::filename_within_collection($source_filename);
76 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
77 $self->set_source_filename ($source_filename, $rename_method);
78 }
79
80 $self->set_doc_type ($doc_type) if defined $doc_type;
81
82 return $self;
83}
84
85
86sub set_source_path
87{
88 my $self = shift @_;
89 my ($source_filename) = @_;
90
91 if (defined $source_filename) {
92 # On Windows the source_filename can be in terse DOS format
93 # e.g. test~1.txt
94
95 $self->{'terse_source_path'} = $source_filename;
96
97 # Use the FileUtil library methods as they are aware of more special
98 # cases such as HDFS [jmt12]
99 if (&FileUtils::fileExists($source_filename))
100 {
101 # See if we can do better for Windows with a filename
102 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
103 require Win32;
104 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
105 }
106 else {
107 # For Unix-based systems, there is no difference between the two
108 $self->{'source_path'} = $source_filename;
109 }
110 }
111 else {
112 print STDERR "Warning: In doc::set_source_path(), file\n";
113 print STDERR " $source_filename\n";
114 print STDERR " does not exist\n";
115
116 # (default) Set it to whatever we were given
117 $self->{'source_path'} = $source_filename;
118 }
119 }
120 else {
121 # Previous code for setting source_path allowed for
122 # it to be undefined, so continue this practice
123 $self->{'terse_source_path'} = undef;
124 $self->{'source_path'} = undef;
125 }
126}
127
128
129sub get_source_path
130{
131 my $self = shift @_;
132
133 return $self->{'terse_source_path'};
134}
135
136# set lastmodified for OAI purposes, added by GRB, moved by kjdon
137sub set_oailastmodified {
138 my $self = shift (@_);
139
140 my $source_path = $self->{'terse_source_path'};
141
142 if (defined $source_path && (-e $source_path)) {
143 my $current_time = time;
144
145 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
146 $wday, $yday, $isdst) = localtime($current_time);
147
148 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
149
150 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
151 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
152 }
153}
154
155# no longer used for OAI purposes, since lastmodified is not what we want as the
156# Datestamp of a document. This doc metadata may be useful for general purposes.
157sub set_lastmodified {
158 my $self = shift (@_);
159
160 my $source_path = $self->{'terse_source_path'};
161
162 if (defined $source_path && (-e $source_path)) {
163
164 my $file_stat = stat($source_path);
165 my $mtime = $file_stat->mtime;
166 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
167 $wday, $yday, $isdst) = localtime($mtime);
168
169 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
170
171 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
172 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
173 }
174}
175
176# clone the $self object
177sub duplicate {
178 my $self = shift (@_);
179
180 my $newobj = {};
181
182 foreach my $k (keys %$self) {
183 $newobj->{$k} = &clone ($self->{$k});
184 }
185
186 bless $newobj, ref($self);
187 return $newobj;
188}
189
190sub clone {
191 my ($from) = @_;
192 my $type = ref ($from);
193
194 if ($type eq "HASH") {
195 my $to = {};
196 foreach my $key (keys %$from) {
197 $to->{$key} = &clone ($from->{$key});
198 }
199 return $to;
200 } elsif ($type eq "ARRAY") {
201 my $to = [];
202 foreach my $v (@$from) {
203 push (@$to, &clone ($v));
204 }
205 return $to;
206 } else {
207 return $from;
208 }
209}
210
211sub set_OIDtype {
212 my $self = shift (@_);
213 my ($type, $metadata) = @_;
214
215 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
216 $self->{'OIDtype'} = $type;
217 } else {
218 $self->{'OIDtype'} = "hash";
219 }
220
221 if ($type =~ /^assigned$/) {
222 if (defined $metadata) {
223 $self->{'OIDmetadata'} = $metadata;
224 } else {
225 $self->{'OIDmetadata'} = "dc.Identifier";
226 }
227 }
228}
229
230# rename_method can be 'url', 'none', 'base64'
231sub set_source_filename {
232 my $self = shift (@_);
233 my ($source_filename, $rename_method) = @_;
234
235 # Since the gsdlsourcefilename element goes into the doc.xml it has
236 # to be utf8. However, it should also *represent* the source filename
237 # (in the import directory) which may not be utf8 at all.
238 # For instance, if this meta element (gsdlsourcefilename) will be used
239 # by other applications that parse doc.xml in order to locate
240 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
241 # encode the real filename as this is a binary-to-text encoding meaning
242 # that the resulting string is ASCII (utf8). Decoding will give the original.
243
244# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
245
246 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
247 $source_filename = $self->encode_filename($source_filename, $rename_method);
248# my ($srcfilename,$dirname,$suffix)
249# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
250# print STDERR "-> $srcfilename -> ";
251# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
252# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
253# print STDERR "$source_filename\n";
254
255 $self->set_utf8_metadata_element ($self->get_top_section(),
256 "gsdlsourcefilename",
257 $source_filename);
258}
259
260sub encode_filename {
261 my $self = shift (@_);
262 my ($source_filename, $rename_method) = @_;
263
264 my ($srcfilename,$dirname,$suffix)
265 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
266# print STDERR "-> $srcfilename -> ";
267 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
268 $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
269
270 return $source_filename;
271}
272
273sub set_converted_filename {
274 my $self = shift (@_);
275 my ($converted_filename) = @_;
276
277 # we know the converted filename is utf8
278 $self->set_utf8_metadata_element ($self->get_top_section(),
279 "gsdlconvertedfilename",
280 $converted_filename);
281}
282
283# returns the source_filename as it was provided
284sub get_unmodified_source_filename {
285 my $self = shift (@_);
286
287 return $self->{'terse_source_path'};
288}
289
290# returns the source_filename with whatever rename_method was given
291sub get_source_filename {
292 my $self = shift (@_);
293
294 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
295}
296
297
298
299# returns converted filename if available else returns source filename
300sub get_filename_for_hashing {
301 my $self = shift (@_);
302
303 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
304
305 if (!defined $filename) {
306 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
307 # if NULPlug processed file, then don't give a filename
308 if (defined $plugin_name && $plugin_name eq "NULPlug") {
309 $filename = undef;
310 } else { # returns the URL encoded source filename!
311 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
312 }
313 }
314
315 if (!&FileUtils::isFilenameAbsolute($filename)) {
316 $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
317 }
318
319 return $filename;
320}
321
322sub set_doc_type {
323 my $self = shift (@_);
324 my ($doc_type) = @_;
325
326 $self->set_metadata_element ($self->get_top_section(),
327 "gsdldoctype",
328 $doc_type);
329}
330
331# returns the gsdldoctype as it was provided
332# the default of "indexed_doc" is used if no document
333# type was provided
334sub get_doc_type {
335 my $self = shift (@_);
336
337 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
338 return $doc_type if (defined $doc_type);
339 return "indexed_doc";
340}
341
342
343# look up the reference to the a particular section
344sub _lookup_section {
345 my $self = shift (@_);
346 my ($section) = @_;
347
348 my ($num);
349 my $sectionref = $self;
350
351 while (defined $section && $section ne "") {
352
353 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
354
355 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
356
357 $section = "" unless defined $section;
358
359
360 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
361 $sectionref = $sectionref->{'subsections'}->{$num};
362 } else {
363 return undef;
364 }
365 }
366
367 return $sectionref;
368}
369
370# calculate OID by hashing the contents of the document
371sub _calc_OID {
372 my $self = shift (@_);
373 my ($filename) = @_;
374
375
376 my $osexe = &util::get_os_exe();
377
378 my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
379 $ENV{'GSDLOS'},"hashfile$osexe");
380
381 &util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
382 # The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
383
384 # A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
385 # (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
386 # E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
387
388 my $result = "NULL";
389
390
391 if (-e "$hashfile_exe") {
392# $result = `\"$hashfile_exe\" \"$filename\"`;
393# $result = `hashfile$osexe \"$filename\" 2>&1`;
394 $result = `hashfile$osexe \"$filename\"`;
395
396 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
397 } else {
398 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
399 }
400 return "HASH$result";
401}
402
403# methods dealing with OID, not groups of them.
404
405# if $OID is not provided one is calculated
406sub set_OID {
407 my $self = shift (@_);
408 my ($OID) = @_;
409
410 my $use_hash_oid = 0;
411 # if an OID wasn't provided calculate one
412 if (!defined $OID) {
413 $OID = "NULL";
414 if ($self->{'OIDtype'} =~ /^hash/) {
415 $use_hash_oid = 1;
416 } elsif ($self->{'OIDtype'} eq "incremental") {
417 $OID = "D" . $OIDcount;
418 $OIDcount ++;
419 } elsif ($self->{'OIDtype'} eq "filename") {
420 my $filename = $self->get_source_filename();
421 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
422 $OID = &util::tidy_up_oid($OID);
423 } elsif ($self->{'OIDtype'} eq "full_filename") {
424 my $source_filename = $self->get_source_filename();
425 my $dirsep = &util::get_os_dirsep();
426
427 $source_filename =~ s/^import$dirsep//;
428 $source_filename =~ s/$dirsep/-/g;
429 $source_filename =~ s/\./_/g;
430
431 $OID = $source_filename;
432 $OID = &util::tidy_up_oid($OID);
433 } elsif ($self->{'OIDtype'} eq "dirname") {
434 my $filename = $self->get_source_filename();
435 if (defined($filename)) { # && -e $filename) {
436 # get the immediate parent directory
437 $OID = &File::Basename::dirname($filename);
438 if (defined $OID) {
439 $OID = &File::Basename::basename($OID);
440 $OID = &util::tidy_up_oid($OID);
441 } else {
442 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
443 $use_hash_oid = 1;
444 }
445 } else {
446 print STDERR "Failed to find a filename, generating hash id\n";
447 $use_hash_oid = 1;
448 }
449
450 } elsif ($self->{'OIDtype'} eq "assigned") {
451 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
452 if (defined $identifier && $identifier ne "") {
453 $OID = $identifier;
454 $OID = &util::tidy_up_oid($OID);
455 } else {
456 # need a hash id
457 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
458 $use_hash_oid = 1;
459 }
460
461 } else {
462 $use_hash_oid = 1;
463 }
464
465 if ($use_hash_oid) {
466 my $hash_on_file = 1;
467 my $hash_on_ga_xml = 0;
468
469 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
470 $hash_on_file = 0;
471 $hash_on_ga_xml = 1;
472 }
473
474 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
475 $hash_on_file = 0;
476 $hash_on_ga_xml = 0;
477
478 my $source_filename = $self->get_source_filename();
479 my $dirsep = &util::get_os_dirsep();
480
481 $source_filename =~ s/^import$dirsep//;
482 $source_filename =~ s/$dirsep/-/g;
483 $source_filename =~ s/\./_/g;
484
485 # If the filename is very short then (handled naively)
486 # this can cause conjestion in the hash-values
487 # computed, leading documents sharing the same leading
488 # Hex values in the computed has.
489 #
490 # The solution taken here is to replace the name of
491 # the file name a sufficient number of times (up to
492 # the character limit defined in 'rep_limit' and
493 # make that the content that is hashed on
494
495 # *** Think twice before changing the following value
496 # as it will break backward compatability of computed
497 # document HASH values
498
499 my $rep_limit = 256;
500 my $hash_content = undef;
501
502 if (length($source_filename)<$rep_limit) {
503 my $rep_string = "$source_filename|";
504 my $rs_len = length($rep_string);
505
506 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
507
508 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
509 }
510 else {
511 $hash_content = $source_filename;
512 }
513
514 my $filename = &util::get_tmp_filename();
515 if (!open (OUTFILE, ">:utf8", $filename)) {
516 print STDERR "doc::set_OID could not write to $filename\n";
517 } else {
518 print OUTFILE $hash_content;
519 close (OUTFILE);
520 }
521 $OID = $self->_calc_OID ($filename);
522
523 &FileUtils::removeFiles ($filename);
524 }
525
526 if ($hash_on_file) {
527 # "hash" OID - feed file to hashfile.exe
528 my $filename = $self->get_filename_for_hashing();
529
530 # -z: don't want to hash on the file if it is zero size
531 if (defined($filename) && -e $filename && !-z $filename) {
532 $OID = $self->_calc_OID ($filename);
533 } else {
534 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
535 }
536 }
537
538 if ($hash_on_ga_xml) {
539 # In addition being asked to explicity calculate the has based on the GA file,
540 # can also end up coming into this block is doing 'hash_on_file' but the file
541 # itself is of zero bytes (as could be the case with 'doc.nul' file
542
543 my $filename = &util::get_tmp_filename();
544 if (!open (OUTFILE, ">:utf8", $filename)) {
545 print STDERR "doc::set_OID could not write to $filename\n";
546 } else {
547 my $doc_text = &docprint::get_section_xml($self);
548 print OUTFILE $doc_text;
549 close (OUTFILE);
550 }
551 $OID = $self->_calc_OID ($filename);
552 &FileUtils::removeFiles($filename);
553 }
554 }
555 }
556 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
557}
558
559# this uses hashdoc (embedded c thingy) which is faster but still
560# needs a little work to be sufficiently stable
561sub ___set_OID {
562 my $self = shift (@_);
563 my ($OID) = @_;
564
565 # if an OID wasn't provided then calculate hash value based on document
566 if (!defined $OID)
567 {
568 my $hash_text = &docprint::get_section_xml($self);
569 my $hash_len = length($hash_text);
570
571 $OID = &hashdoc::buffer($hash_text,$hash_len);
572 }
573
574 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
575}
576
577# returns the OID for this document
578sub get_OID {
579 my $self = shift (@_);
580 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
581 return $OID if (defined $OID);
582 return "NULL";
583}
584
585sub delete_OID {
586 my $self = shift (@_);
587
588 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
589}
590
591
592# methods for manipulating section names
593
594# returns the name of the top-most section (the top
595# level of the document
596sub get_top_section {
597 my $self = shift (@_);
598
599 return "";
600}
601
602# returns a section
603sub get_parent_section {
604 my $self = shift (@_);
605 my ($section) = @_;
606
607 $section =~ s/(^|\.)\d+$//;
608
609 return $section;
610}
611
612# returns the first child section (or the end child
613# if there isn't any)
614sub get_begin_child {
615 my $self = shift (@_);
616 my ($section) = @_;
617
618 my $section_ptr = $self->_lookup_section($section);
619 return "" unless defined $section_ptr;
620
621 if (defined $section_ptr->{'subsection_order'}->[0]) {
622 return "$section.$section_ptr->{'subsection_order'}->[0]";
623 }
624
625 return $self->get_end_child ($section);
626}
627
628# returns the next child of a parent section
629sub get_next_child {
630 my $self = shift (@_);
631 my ($section) = @_;
632
633 my $parent_section = $self->get_parent_section($section);
634 my $parent_section_ptr = $self->_lookup_section($parent_section);
635 return undef unless defined $parent_section_ptr;
636
637 my ($section_num) = $section =~ /(\d+)$/;
638 return undef unless defined $section_num;
639
640 my $i = 0;
641 my $section_order = $parent_section_ptr->{'subsection_order'};
642 while ($i < scalar(@$section_order)) {
643 last if $section_order->[$i] eq $section_num;
644 $i++;
645 }
646
647 $i++; # the next child
648 if ($i < scalar(@$section_order)) {
649 return $section_order->[$i] if $parent_section eq "";
650 return "$parent_section.$section_order->[$i]";
651 }
652
653 # no more sections in this level
654 return undef;
655}
656
657# returns a reference to a list of children
658sub get_children {
659 my $self = shift (@_);
660 my ($section) = @_;
661
662 my $section_ptr = $self->_lookup_section($section);
663 return [] unless defined $section_ptr;
664
665 my @children = @{$section_ptr->{'subsection_order'}};
666
667 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
668 return \@children;
669}
670
671# returns the child section one past the last one (which
672# is coded as "0")
673sub get_end_child {
674 my $self = shift (@_);
675 my ($section) = @_;
676
677 return $section . ".0" unless $section eq "";
678 return "0";
679}
680
681# returns the next section in book order
682sub get_next_section {
683 my $self = shift (@_);
684 my ($section) = @_;
685
686 return undef unless defined $section;
687
688 my $section_ptr = $self->_lookup_section($section);
689 return undef unless defined $section_ptr;
690
691 # first try to find first child
692 if (defined $section_ptr->{'subsection_order'}->[0]) {
693 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
694 return "$section.$section_ptr->{'subsection_order'}->[0]";
695 }
696
697 do {
698 # try to find sibling
699 my $next_child = $self->get_next_child ($section);
700 return $next_child if (defined $next_child);
701
702 # move up one level
703 $section = $self->get_parent_section ($section);
704 } while $section =~ /\d/;
705
706 return undef;
707}
708
709sub is_leaf_section {
710 my $self = shift (@_);
711 my ($section) = @_;
712
713 my $section_ptr = $self->_lookup_section($section);
714 return 1 unless defined $section_ptr;
715
716 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
717}
718
719# methods for dealing with sections
720
721# returns the name of the inserted section
722sub insert_section {
723 my $self = shift (@_);
724 my ($before_section) = @_;
725
726 # get the child to insert before and its parent section
727 my $parent_section = "";
728 my $before_child = "0";
729 my @before_section = split (/\./, $before_section);
730 if (scalar(@before_section) > 0) {
731 $before_child = pop (@before_section);
732 $parent_section = join (".", @before_section);
733 }
734
735 my $parent_section_ptr = $self->_lookup_section($parent_section);
736 if (!defined $parent_section_ptr) {
737 print STDERR "doc::insert_section couldn't find parent section " .
738 "$parent_section\n";
739 return;
740 }
741
742 # get the next section number
743 my $section_num = $parent_section_ptr->{'next_subsection'}++;
744
745 my $i = 0;
746 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
747 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
748 $i++;
749 }
750
751 # insert the section number into the order list
752 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
753
754 # add this section to the parent section
755 my $section_ptr = {'subsection_order'=>[],
756 'next_subsection'=>1,
757 'subsections'=>{},
758 'metadata'=>[],
759 'text'=>""};
760 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
761
762 # work out the full section number
763 my $section = $parent_section;
764 $section .= "." unless $section eq "";
765 $section .= $section_num;
766
767 return $section;
768}
769
770# creates a pre-named section
771sub create_named_section {
772 my $self = shift (@_);
773 my ($mastersection) = @_;
774
775 my ($num);
776 my $section = $mastersection;
777 my $sectionref = $self;
778
779 while ($section ne "") {
780 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
781 $num =~ s/^0+(\d)/$1/; # remove leading 0s
782 $section = "" unless defined $section;
783
784 if (defined $num) {
785 if (!defined $sectionref->{'subsections'}->{$num}) {
786 push (@{$sectionref->{'subsection_order'}}, $num);
787 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
788 'next_subsection'=>1,
789 'subsections'=>{},
790 'metadata'=>[],
791 'text'=>""};
792 if ($num >= $sectionref->{'next_subsection'}) {
793 $sectionref->{'next_subsection'} = $num + 1;
794 }
795 }
796 $sectionref = $sectionref->{'subsections'}->{$num};
797
798 } else {
799 print STDERR "doc::create_named_section couldn't create section ";
800 print STDERR "$mastersection\n";
801 last;
802 }
803 }
804}
805
806# returns a reference to a list of subsections
807sub list_subsections {
808 my $self = shift (@_);
809 my ($section) = @_;
810
811 my $section_ptr = $self->_lookup_section ($section);
812 if (!defined $section_ptr) {
813 print STDERR "doc::list_subsections couldn't find section $section\n";
814 return [];
815 }
816
817 return [@{$section_ptr->{'subsection_order'}}];
818}
819
820sub delete_section {
821 my $self = shift (@_);
822 my ($section) = @_;
823
824# my $section_ptr = {'subsection_order'=>[],
825# 'next_subsection'=>1,
826# 'subsections'=>{},
827# 'metadata'=>[],
828# 'text'=>""};
829
830 # if this is the top section reset everything
831 if ($section eq "") {
832 $self->{'subsection_order'} = [];
833 $self->{'subsections'} = {};
834 $self->{'metadata'} = [];
835 $self->{'text'} = "";
836 return;
837 }
838
839 # find the parent of the section to delete
840 my $parent_section = "";
841 my $child = "0";
842 my @section = split (/\./, $section);
843 if (scalar(@section) > 0) {
844 $child = pop (@section);
845 $parent_section = join (".", @section);
846 }
847
848 my $parent_section_ptr = $self->_lookup_section($parent_section);
849 if (!defined $parent_section_ptr) {
850 print STDERR "doc::delete_section couldn't find parent section " .
851 "$parent_section\n";
852 return;
853 }
854
855 # remove this section from the subsection_order list
856 my $i = 0;
857 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
858 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
859 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
860 last;
861 }
862 $i++;
863 }
864
865 # remove this section from the subsection hash
866 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
867 undef $parent_section_ptr->{'subsections'}->{$child};
868 }
869}
870
871#--
872# methods for dealing with metadata
873
874# set_metadata_element and get_metadata_element are for metadata
875# which should only have one value. add_meta_data and get_metadata
876# are for metadata which can have more than one value.
877
878# returns the first metadata value which matches field
879
880# This version of get metadata element works much like the one above,
881# except it allows for the namespace portion of a metadata element to
882# be ignored, thus if you are searching for dc.Title, the first piece
883# of matching metadata ending with the name Title (once any namespace
884# is removed) would be returned.
885# 28-11-2003 John Thompson
886sub get_metadata_element {
887 my $self = shift (@_);
888 my ($section, $field, $ignore_namespace) = @_;
889 my ($data);
890
891 $ignore_namespace = 0 unless defined $ignore_namespace;
892
893 my $section_ptr = $self->_lookup_section($section);
894 if (!defined $section_ptr) {
895 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
896 return;
897 }
898
899 # Remove any namespace if we are being told to ignore them
900 if($ignore_namespace) {
901 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
902 }
903
904 foreach $data (@{$section_ptr->{'metadata'}}) {
905
906 my $data_name = $data->[0];
907
908 # Remove any namespace if we are being told to ignore them
909 if($ignore_namespace) {
910 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
911 }
912 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
913 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
914
915 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
916 }
917
918 return undef; # was not found
919}
920
921# returns a list of the form [value1, value2, ...]
922sub get_metadata {
923 my $self = shift (@_);
924 my ($section, $field, $ignore_namespace) = @_;
925 my ($data);
926
927 $ignore_namespace = 0 unless defined $ignore_namespace;
928
929 my $section_ptr = $self->_lookup_section($section);
930 if (!defined $section_ptr) {
931 print STDERR "doc::get_metadata couldn't find section ",
932 $section, "\n";
933 return;
934 }
935
936 # Remove any namespace if we are being told to ignore them
937 if($ignore_namespace) {
938 $field =~ s/^.*\.//;
939 }
940
941 my @metadata = ();
942 foreach $data (@{$section_ptr->{'metadata'}}) {
943
944 my $data_name = $data->[0];
945
946 # Remove any namespace if we are being told to ignore them
947 if($ignore_namespace) {
948 $data_name =~ s/^.*\.//;
949 }
950 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
951 $data_name =~ s/^ex\.([^.]+)$/$1/;
952
953 push (@metadata, $data->[1]) if ($data_name eq $field);
954 }
955
956 return \@metadata;
957}
958
959sub get_metadata_hashmap {
960 my $self = shift (@_);
961 my ($section, $opt_namespace) = @_;
962
963 my $section_ptr = $self->_lookup_section($section);
964 if (!defined $section_ptr) {
965 print STDERR "doc::get_metadata couldn't find section ",
966 $section, "\n";
967 return;
968 }
969
970 my $metadata_hashmap = {};
971 foreach my $data (@{$section_ptr->{'metadata'}}) {
972 my $metaname = $data->[0];
973
974 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
975 if (!defined $metadata_hashmap->{$metaname}) {
976 $metadata_hashmap->{$metaname} = [];
977 }
978 my $metaval_list = $metadata_hashmap->{$metaname};
979 push(@$metaval_list, $data->[1]);
980 }
981 }
982
983 return $metadata_hashmap;
984}
985
986# returns a list of the form [[field,value],[field,value],...]
987sub get_all_metadata {
988 my $self = shift (@_);
989 my ($section) = @_;
990
991 my $section_ptr = $self->_lookup_section($section);
992 if (!defined $section_ptr) {
993 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
994 return;
995 }
996
997 return $section_ptr->{'metadata'};
998}
999
1000# $value is optional
1001sub delete_metadata {
1002 my $self = shift (@_);
1003 my ($section, $field, $value) = @_;
1004
1005 my $section_ptr = $self->_lookup_section($section);
1006 if (!defined $section_ptr) {
1007 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
1008 return;
1009 }
1010
1011 my $i = 0;
1012 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1013 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1014 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1015 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1016 } else {
1017 $i++;
1018 }
1019 }
1020}
1021
1022sub delete_all_metadata {
1023 my $self = shift (@_);
1024 my ($section) = @_;
1025
1026 my $section_ptr = $self->_lookup_section($section);
1027 if (!defined $section_ptr) {
1028 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1029 return;
1030 }
1031
1032 $section_ptr->{'metadata'} = [];
1033}
1034
1035sub set_metadata_element {
1036 my $self = shift (@_);
1037 my ($section, $field, $value) = @_;
1038
1039 $self->set_utf8_metadata_element ($section, $field,
1040 &unicode::ascii2utf8(\$value));
1041}
1042
1043# set_utf8_metadata_element assumes the text has already been
1044# converted to the UTF-8 encoding.
1045sub set_utf8_metadata_element {
1046 my $self = shift (@_);
1047 my ($section, $field, $value) = @_;
1048
1049 $self->delete_metadata ($section, $field);
1050 $self->add_utf8_metadata ($section, $field, $value);
1051}
1052
1053
1054# add_metadata assumes the text is in (extended) ascii form. For
1055# text which has already been converted to the UTF-8 format use
1056# add_utf8_metadata.
1057sub add_metadata {
1058 my $self = shift (@_);
1059 my ($section, $field, $value) = @_;
1060
1061 $self->add_utf8_metadata ($section, $field,
1062 &unicode::ascii2utf8(\$value));
1063}
1064
1065sub add_utf8_metadata {
1066 my $self = shift (@_);
1067 my ($section, $field, $value) = @_;
1068
1069 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1070 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1071 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1072
1073 my $section_ptr = $self->_lookup_section($section);
1074 if (!defined $section_ptr) {
1075 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1076 return;
1077 }
1078 if (!defined $value) {
1079 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1080 return;
1081 }
1082 if (!defined $field) {
1083 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1084 return;
1085 }
1086
1087 #print STDERR "###$field=$value\n";
1088
1089 # For now, supress this check. Given that text data read in is now
1090 # Unicode aware, then the following block of code can (ironically enough)
1091 # cause our unicode compliant string to be re-encoded (leading to
1092 # a double-encoded UTF-8 string, which we definitely don't want!).
1093
1094
1095 # double check that the value is utf-8
1096 # if (!&unicode::check_is_utf8($value)) {
1097 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1098 # &unicode::ensure_utf8(\$value);
1099 # print STDERR " Tried converting to utf8: $value\n";
1100 # }
1101
1102 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1103 if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1104 {
1105 my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1106 if (defined $mdprefix) {
1107 # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1108 push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1109 }
1110
1111 my $direction;
1112 if($value =~ m/^-/)
1113 {
1114 $direction = ($field eq "Latitude") ? "S" : "W";
1115 }
1116 else
1117 {
1118 $direction = ($field eq "Latitude") ? "N" : "E";
1119 }
1120
1121 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1122 if(defined $beforeDec && defined $afterDec)
1123 {
1124 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1125 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1126
1127 for(my $i = 2; $i <= 4; $i++)
1128 {
1129 if(length($afterDec) >= $i)
1130 {
1131 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1132 }
1133 }
1134
1135 #Only add the metadata if it has not already been added
1136 my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no side-effects.)
1137 }
1138
1139 if($field =~ m/^(.+\.)?Longitude$/) {
1140 # if we are dealing with Longitude meta, we should 1. have Latitude meta too; 2. already have processed Latitude meta
1141 # in that case, add both Lat and Lng of this section as a Coordinate meta
1142 my $latitude = $self->get_metadata_element ($section, "Latitude");
1143 # TODO: would like all Longitude info together followed by all Coordinate info, but the following will add all coord info meta and end of this function will add Longitude meta
1144 $self->processCoordinate($section, $latitude, $value); # value is Longitude
1145 }
1146 }
1147
1148 elsif($field eq "GPS.mapOverlay") { # then the value is a JSON string
1149
1150 # TODO:
1151 # If we already have Coordinate meta for this section of the document (as can happen during buildcol.pl),
1152 # let's ASSUME this means we've already processed GPS.mapOverlay meta into Coordinate meta for this section (can have happened during import.pl)
1153 # to avoid adding duplicate Coordinates meta, which then end up duplicated in the index
1154 # Of course, the assumption is not always true! We could have an image with embedded Lat and Lng meta,
1155 # and the same image doc's section could have GPS.mapOverlay meta (from shapes) added via the doc editor.
1156 # This very function would then have converted Lat/Lng into Coordinate meta (just in the if stmt above) and added it to the section.
1157 # And then by the time we process this section's GPS.mapOverlay meta here, we would notice the section has Coordinate meta already,
1158 # and therefore skip converting the GPS.mapOverlay meta into Coordinate meta! What to dooooo?
1159 # So the return statement immediately below is a temporary solution, until we find a better one that will always work.
1160 my $metaMap = $self->get_metadata_hashmap($section);
1161 if(!$metaMap->{'Coordinate'}) {
1162
1163 print STDERR "GPS.mapOverlay before val: " . $value . "\n";
1164
1165 # TODO html decode?
1166 $value =~ s@&#091;@[@g;
1167 $value =~ s@&#093;@]@g;
1168 $value =~ s@&quot;@"@g;
1169 print STDERR "GPS.mapOverlay after val: " . $value . "\n";
1170
1171 my $json_array = decode_json $value;
1172 #my $json = JSON->new->allow_nonref;
1173 #&printAllShapes($json, $json_array);
1174
1175 foreach my $shape (@$json_array) {
1176
1177 my $type = $shape->{"type"};
1178 print STDERR "Type : " . $type . "\n";
1179
1180 if($type eq "circle") {
1181 #print STDERR "Found a circle:\n" . &printShape($json, $shape);
1182
1183 # work out bounding box
1184 # SCARY!
1185 # want the inverse of this useful page:
1186 # https://stackoverflow.com/questions/639695/how-to-convert-latitude-or-longitude-to-meters
1187 # https://www.geodatasource.com/developers/javascript
1188
1189
1190 # for now, just process the circle centre
1191 #my $centre = $shape->{"center"};
1192 #$self->processLatOrLng($section_ptr, "Latitude", $centre->{"lat"});
1193 #$self->processLatOrLng($section_ptr, "Longitude", $centre->{"lng"});
1194
1195
1196 # Dr Bainbridge wants us to use: https://gis.stackexchange.com/questions/5821/calculating-latitude-longitude-x-miles-from-point
1197 # But we're using the rule of thumb here, since for N,E,S,W it works out the same:
1198 # https://gis.stackexchange.com/questions/2951/algorithm-for-offsetting-a-latitude-longitude-by-some-amount-of-meters
1199 # which states
1200 # "If your displacements aren't too great (less than a few kilometers) and you're not right at the poles,
1201 # use the quick and dirty estimate that 111,111 meters (111.111 km) in the y direction is 1 degree
1202 # (of latitude) and 111,111 * cos(latitude) meters in the x direction is 1 degree (of longitude)."
1203 my $centre_lat = $shape->{"center"}->{"lat"};
1204 my $centre_lng = $shape->{"center"}->{"lng"};
1205 my $radius = $shape->{"radius"}; # in metres!
1206
1207 print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
1208
1209 my $lat_north = $centre_lat + ($radius/111111);
1210 my $lat_south = $centre_lat - ($radius/111111);
1211
1212 print STDERR "### lat_north: $lat_north\n";
1213 print STDERR "### lat_south: $lat_south\n";
1214
1215 # our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
1216 # all expect their angle to be in radians. So need to convert from degree to radians before we can take the cose of it.
1217 my $centre_lat_radians = $self->degreesToRadians($centre_lat);
1218 my $cos_in_radians = cos($centre_lat_radians);
1219 print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
1220 my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
1221 my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
1222 print STDERR "### lng_east $lng_east\n";
1223 print STDERR "### lng_west $lng_west\n";
1224
1225
1226
1227 my $cos_lat = cos($centre_lat);
1228 print STDERR "cos $centre_lat is $cos_lat\n";
1229
1230 $self->processCoordinate($section, $lat_north, $lng_east);
1231 $self->processCoordinate($section, $lat_south, $lng_east);
1232 $self->processCoordinate($section, $lat_south, $lng_west);
1233 $self->processCoordinate($section, $lat_north, $lng_west);
1234
1235 }
1236 elsif ($type eq "marker") {
1237 print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
1238 print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
1239 $self->processCoordinate($section, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});
1240 }
1241 elsif ($type eq "polyline" || $type eq "polygon") {
1242 my $path_array = $shape->{"path"};
1243 foreach my $position (@$path_array) {
1244 $self->processCoordinate($section, $position->{"lat"}, $position->{"lng"});
1245 }
1246 }
1247 elsif ($type eq "rectangle") {
1248
1249 my $bounds = $shape->{"bounds"};
1250
1251 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"east"});
1252 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"east"});
1253 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"west"});
1254 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"west"});
1255 }
1256
1257 } # end for on each shape in GPS.mapOverlay
1258 }
1259 } # end GPS.mapOverlay meta
1260
1261 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1262}
1263
1264# https://en.wikipedia.org/wiki/Radian
1265sub degreesToRadians
1266{
1267 my $self = shift (@_);
1268 my ($degrees) = @_;
1269
1270 return $degrees * pi /180; # returns radians
1271}
1272
1273sub radiansToDegrees
1274{
1275 my $self = shift (@_);
1276 my ($radians) = @_;
1277
1278 return $radians * 180 / pi; # returns degrees
1279}
1280
1281sub printAllShapes {
1282 my ($json, $json_array) = @_;
1283
1284
1285 #my $pretty_print_shape = $json->pretty->encode( $json_array->[0] );
1286 foreach my $shape (@$json_array) {
1287 my $pretty_print_shape = $json->pretty->encode( $shape );
1288 print STDERR "Shape: $pretty_print_shape\n";
1289 #&printShape($shape);
1290 }
1291
1292}
1293
1294sub processCoordinate {
1295 my $self = shift (@_);
1296 my ($section, $latitude, $longitude) = @_;
1297
1298 my $section_ptr = $self->_lookup_section($section);
1299
1300 my $lat_direction = ($latitude =~ m/^-/) ? "S" : "N";
1301 my $lng_direction = ($longitude =~ m/^-/) ? "W" : "E";
1302
1303 # have to store (lat, lng) in pairs, when there are so many coords to store
1304 #push (@{$section_ptr->{'metadata'}}, ["Latitude", $latitude]);
1305 #push (@{$section_ptr->{'metadata'}}, ["Longitude", $longitude]);
1306
1307 push (@{$section_ptr->{'metadata'}}, ["Coordinate", "$latitude $longitude"]); # "$latitude$lat_direction $longitude$lng_direction"
1308
1309 my ($latBeforeDec, $latAfterDec);
1310 my ($lngBeforeDec, $lngAfterDec);
1311
1312 if($latitude !~ m/\./) {
1313 $latBeforeDec = $latitude;
1314 $latAfterDec = "";
1315 } else {
1316 ($latBeforeDec, $latAfterDec) = ($latitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1317 }
1318 if($longitude !~ m/\./) {
1319 $lngBeforeDec = $longitude;
1320 $lngAfterDec = "";
1321 } else {
1322 ($lngBeforeDec, $lngAfterDec) = ($longitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1323 }
1324
1325 #if(defined $beforeDec && defined $afterDec)
1326 #{
1327 my $name = "CoordShort";
1328 push (@{$section_ptr->{'metadata'}}, [$name, "$latBeforeDec$lat_direction $lngBeforeDec$lng_direction"]);
1329
1330 for(my $i = 2; $i <= 4; $i++)
1331 {
1332 my $latDecPlaces = (length($latAfterDec) >= $i) ? substr($latAfterDec, 0, $i) : "";
1333 my $lngDecPlaces = (length($lngAfterDec) >= $i) ? substr($lngAfterDec, 0, $i) : "";
1334
1335 push (@{$section_ptr->{'metadata'}}, [$name,
1336 $latBeforeDec . $lat_direction. $latDecPlaces . " " . $lngBeforeDec . $lng_direction. $lngDecPlaces]);
1337
1338 }
1339
1340 #Only add the metadata if it has not already been added
1341 #my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no side-effects.)
1342 #}
1343
1344
1345}
1346
1347
1348# methods for dealing with text
1349
1350# returns the text for a section
1351sub get_text {
1352 my $self = shift (@_);
1353 my ($section) = @_;
1354
1355 my $section_ptr = $self->_lookup_section($section);
1356 if (!defined $section_ptr) {
1357 print STDERR "doc::get_text couldn't find section " .
1358 "$section\n";
1359 return "";
1360 }
1361
1362 return $section_ptr->{'text'};
1363}
1364
1365# returns the (utf-8 encoded) length of the text for a section
1366sub get_text_length {
1367 my $self = shift (@_);
1368 my ($section) = @_;
1369
1370 my $section_ptr = $self->_lookup_section($section);
1371 if (!defined $section_ptr) {
1372 print STDERR "doc::get_text_length couldn't find section " .
1373 "$section\n";
1374 return 0;
1375 }
1376
1377 return length ($section_ptr->{'text'});
1378}
1379
1380# returns the total length for all the sections
1381sub get_total_text_length {
1382 my $self = shift (@_);
1383
1384 my $section = $self->get_top_section();
1385 my $length = 0;
1386 while (defined $section) {
1387 $length += $self->get_text_length($section);
1388 $section = $self->get_next_section($section);
1389 }
1390 return $length;
1391}
1392
1393sub delete_text {
1394 my $self = shift (@_);
1395 my ($section) = @_;
1396
1397 my $section_ptr = $self->_lookup_section($section);
1398 if (!defined $section_ptr) {
1399 print STDERR "doc::delete_text couldn't find section " .
1400 "$section\n";
1401 return;
1402 }
1403
1404 $section_ptr->{'text'} = "";
1405}
1406
1407# add_text assumes the text is in (extended) ascii form. For
1408# text which has been already converted to the UTF-8 format
1409# use add_utf8_text.
1410sub add_text {
1411 my $self = shift (@_);
1412 my ($section, $text) = @_;
1413
1414 # convert the text to UTF-8 encoded unicode characters
1415 # and add the text
1416 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1417}
1418
1419
1420# add_utf8_text assumes the text to be added has already
1421# been converted to the UTF-8 encoding. For ascii text use
1422# add_text
1423# Pass by value version (internally calls pass by ref version
1424# to avoid code duplication)
1425sub add_utf8_text {
1426 my $self = shift (@_);
1427 my ($section, $text) = @_;
1428
1429 $self->add_utf8_textref($section, \$text);
1430}
1431
1432# Pass by reference version, used by GreenstoneSQLPlugin for fulltext
1433sub add_utf8_textref {
1434 my $self = shift (@_);
1435 my ($section, $text_ref) = @_;
1436
1437 my $section_ptr = $self->_lookup_section($section);
1438 if (!defined $section_ptr) {
1439 print STDERR "doc::add_utf8_textref couldn't find section " .
1440 "$section\n";
1441 return;
1442 }
1443
1444 $section_ptr->{'text'} .= $$text_ref;
1445}
1446
1447# returns the Source meta, which is the utf8 filename generated.
1448# Added a separate method here for convenience
1449sub get_source {
1450 my $self = shift (@_);
1451 return $self->get_metadata_element ($self->get_top_section(), "Source");
1452}
1453
1454# returns the SourceFile meta, which is the url reference to the URL-encoded
1455# version of Source (the utf8 filename). Added a separate method here for convenience
1456sub get_sourcefile {
1457 my $self = shift (@_);
1458 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1459}
1460
1461# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1462# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1463sub get_assocfile_from_sourcefile {
1464 my $self = shift (@_);
1465
1466 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1467 my $top_section = $self->get_top_section();
1468 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1469
1470 # get the actual filename as it exists on the filesystem which this url refers to
1471 $source_file = &unicode::url_to_filename($source_file);
1472 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1473 return $assocfilename;
1474}
1475
1476# methods for dealing with associated files
1477
1478# a file is associated with a document, NOT a section.
1479# if section is defined it is noted in the data structure
1480# only so that files associated from a particular section
1481# may be removed later (using delete_section_assoc_files)
1482sub associate_file {
1483 my $self = shift (@_);
1484 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1485 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1486
1487 # remove all associated files with the same name
1488 $self->delete_assoc_file ($assoc_filename);
1489
1490 # Too harsh a requirement
1491 # Definitely get HTML docs, for example, with some missing
1492 # support files
1493# if (!&util::fd_exists($real_filename)) {
1494# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1495# exit -1;
1496# }
1497
1498# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1499# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1500## my $utf8_filename = Encode::encode("utf8",$filename);
1501
1502 push (@{$self->{'associated_files'}},
1503 [$real_filename, $assoc_filename, $mime_type, $section]);
1504}
1505
1506# returns a list of associated files in the form
1507# [[real_filename, assoc_filename, mimetype], ...]
1508sub get_assoc_files {
1509 my $self = shift (@_);
1510
1511 return $self->{'associated_files'};
1512}
1513
1514# the following two methods used to keep track of original associated files
1515# for incremental building. eg a txt file used by an item file does not end
1516# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1517# file for incremental build
1518sub associate_source_file {
1519 my $self = shift (@_);
1520 my ($full_filename) = @_;
1521
1522 push (@{$self->{'source_assoc_files'}}, $full_filename);
1523
1524}
1525
1526sub get_source_assoc_files {
1527 my $self = shift (@_);
1528
1529 return $self->{'source_assoc_files'};
1530
1531
1532}
1533sub metadata_file {
1534 my $self = shift (@_);
1535 my ($real_filename, $filename) = @_;
1536
1537 push (@{$self->{'metadata_files'}},
1538 [$real_filename, $filename]);
1539}
1540
1541# used for writing out the archiveinf-doc info database, to list all the metadata files
1542sub get_meta_files {
1543 my $self = shift (@_);
1544
1545 return $self->{'metadata_files'};
1546}
1547
1548sub delete_section_assoc_files {
1549 my $self = shift (@_);
1550 my ($section) = @_;
1551
1552 my $i=0;
1553 while ($i < scalar (@{$self->{'associated_files'}})) {
1554 if (defined $self->{'associated_files'}->[$i]->[3] &&
1555 $self->{'associated_files'}->[$i]->[3] eq $section) {
1556 splice (@{$self->{'associated_files'}}, $i, 1);
1557 } else {
1558 $i++;
1559 }
1560 }
1561}
1562
1563sub delete_assoc_file {
1564 my $self = shift (@_);
1565 my ($assoc_filename) = @_;
1566
1567 my $i=0;
1568 while ($i < scalar (@{$self->{'associated_files'}})) {
1569 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1570 splice (@{$self->{'associated_files'}}, $i, 1);
1571 } else {
1572 $i++;
1573 }
1574 }
1575}
1576
1577sub reset_nextsection_ptr {
1578 my $self = shift (@_);
1579 my ($section) = @_;
1580
1581 my $section_ptr = $self->_lookup_section($section);
1582 $section_ptr->{'next_subsection'} = 1;
1583}
1584
15851;
Note: See TracBrowser for help on using the repository browser.