source: main/trunk/greenstone2/perllib/doc.pm@ 33126

Last change on this file since 33126 was 33126, checked in by wy59, 5 years ago

Incremental changes following previous commit. This time the code is more backwards compatible with older collections that contain only Latitude and Longitude meta. In such cases, we now ADDITIONALLY add Coordinate (and Coordshort) meta, while still also outputting Lat and Lng meta. The JS and XSL code however now prefers to work with Coordinate meta where present.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 46.4 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use FileUtils;
40use ghtml;
41use File::stat;
42##use hashdoc;
43use docprint;
44use JSON;
45
46# We just need pi from the Trig lib
47# Import constants pi2, pip2, pip4 (2*pi, pi/2, pi/4).
48use Math::Trig ':pi';
49
50# the document type may be indexed_doc, nonindexed_doc, or
51# classification
52
53our $OIDcount = 0;
54
55# rename_method can be 'url', 'none', 'base64'
56sub new {
57 my $class = shift (@_);
58 my ($source_filename, $doc_type, $rename_method) = @_;
59
60
61 my $self = bless {'associated_files'=>[],
62 'subsection_order'=>[],
63 'next_subsection'=>1,
64 'subsections'=>{},
65 'metadata'=>[],
66 'text'=>"",
67 'OIDtype'=>"hash"}, $class;
68
69 # used to set lastmodified here, but this can screw up the HASH ids, so
70 # the docsave processor now calls set_lastmodified
71
72 $self->set_source_path($source_filename);
73
74 if (defined $source_filename) {
75 $source_filename = &util::filename_within_collection($source_filename);
76 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
77 $self->set_source_filename ($source_filename, $rename_method);
78 }
79
80 $self->set_doc_type ($doc_type) if defined $doc_type;
81
82 return $self;
83}
84
85
86sub set_source_path
87{
88 my $self = shift @_;
89 my ($source_filename) = @_;
90
91 if (defined $source_filename) {
92 # On Windows the source_filename can be in terse DOS format
93 # e.g. test~1.txt
94
95 $self->{'terse_source_path'} = $source_filename;
96
97 # Use the FileUtil library methods as they are aware of more special
98 # cases such as HDFS [jmt12]
99 if (&FileUtils::fileExists($source_filename))
100 {
101 # See if we can do better for Windows with a filename
102 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
103 require Win32;
104 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
105 }
106 else {
107 # For Unix-based systems, there is no difference between the two
108 $self->{'source_path'} = $source_filename;
109 }
110 }
111 else {
112 print STDERR "Warning: In doc::set_source_path(), file\n";
113 print STDERR " $source_filename\n";
114 print STDERR " does not exist\n";
115
116 # (default) Set it to whatever we were given
117 $self->{'source_path'} = $source_filename;
118 }
119 }
120 else {
121 # Previous code for setting source_path allowed for
122 # it to be undefined, so continue this practice
123 $self->{'terse_source_path'} = undef;
124 $self->{'source_path'} = undef;
125 }
126}
127
128
129sub get_source_path
130{
131 my $self = shift @_;
132
133 return $self->{'terse_source_path'};
134}
135
136# set lastmodified for OAI purposes, added by GRB, moved by kjdon
137sub set_oailastmodified {
138 my $self = shift (@_);
139
140 my $source_path = $self->{'terse_source_path'};
141
142 if (defined $source_path && (-e $source_path)) {
143 my $current_time = time;
144
145 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
146 $wday, $yday, $isdst) = localtime($current_time);
147
148 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
149
150 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
151 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
152 }
153}
154
155# no longer used for OAI purposes, since lastmodified is not what we want as the
156# Datestamp of a document. This doc metadata may be useful for general purposes.
157sub set_lastmodified {
158 my $self = shift (@_);
159
160 my $source_path = $self->{'terse_source_path'};
161
162 if (defined $source_path && (-e $source_path)) {
163
164 my $file_stat = stat($source_path);
165 my $mtime = $file_stat->mtime;
166 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
167 $wday, $yday, $isdst) = localtime($mtime);
168
169 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
170
171 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
172 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
173 }
174}
175
176# clone the $self object
177sub duplicate {
178 my $self = shift (@_);
179
180 my $newobj = {};
181
182 foreach my $k (keys %$self) {
183 $newobj->{$k} = &clone ($self->{$k});
184 }
185
186 bless $newobj, ref($self);
187 return $newobj;
188}
189
190sub clone {
191 my ($from) = @_;
192 my $type = ref ($from);
193
194 if ($type eq "HASH") {
195 my $to = {};
196 foreach my $key (keys %$from) {
197 $to->{$key} = &clone ($from->{$key});
198 }
199 return $to;
200 } elsif ($type eq "ARRAY") {
201 my $to = [];
202 foreach my $v (@$from) {
203 push (@$to, &clone ($v));
204 }
205 return $to;
206 } else {
207 return $from;
208 }
209}
210
211sub set_OIDtype {
212 my $self = shift (@_);
213 my ($type, $metadata) = @_;
214
215 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
216 $self->{'OIDtype'} = $type;
217 } else {
218 $self->{'OIDtype'} = "hash";
219 }
220
221 if ($type =~ /^assigned$/) {
222 if (defined $metadata) {
223 $self->{'OIDmetadata'} = $metadata;
224 } else {
225 $self->{'OIDmetadata'} = "dc.Identifier";
226 }
227 }
228}
229
230# rename_method can be 'url', 'none', 'base64'
231sub set_source_filename {
232 my $self = shift (@_);
233 my ($source_filename, $rename_method) = @_;
234
235 # Since the gsdlsourcefilename element goes into the doc.xml it has
236 # to be utf8. However, it should also *represent* the source filename
237 # (in the import directory) which may not be utf8 at all.
238 # For instance, if this meta element (gsdlsourcefilename) will be used
239 # by other applications that parse doc.xml in order to locate
240 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
241 # encode the real filename as this is a binary-to-text encoding meaning
242 # that the resulting string is ASCII (utf8). Decoding will give the original.
243
244# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
245
246 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
247 $source_filename = $self->encode_filename($source_filename, $rename_method);
248# my ($srcfilename,$dirname,$suffix)
249# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
250# print STDERR "-> $srcfilename -> ";
251# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
252# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
253# print STDERR "$source_filename\n";
254
255 $self->set_utf8_metadata_element ($self->get_top_section(),
256 "gsdlsourcefilename",
257 $source_filename);
258}
259
260sub encode_filename {
261 my $self = shift (@_);
262 my ($source_filename, $rename_method) = @_;
263
264 my ($srcfilename,$dirname,$suffix)
265 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
266# print STDERR "-> $srcfilename -> ";
267 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
268 $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
269
270 return $source_filename;
271}
272
273sub set_converted_filename {
274 my $self = shift (@_);
275 my ($converted_filename) = @_;
276
277 # we know the converted filename is utf8
278 $self->set_utf8_metadata_element ($self->get_top_section(),
279 "gsdlconvertedfilename",
280 $converted_filename);
281}
282
283# returns the source_filename as it was provided
284sub get_unmodified_source_filename {
285 my $self = shift (@_);
286
287 return $self->{'terse_source_path'};
288}
289
290# returns the source_filename with whatever rename_method was given
291sub get_source_filename {
292 my $self = shift (@_);
293
294 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
295}
296
297
298
299# returns converted filename if available else returns source filename
300sub get_filename_for_hashing {
301 my $self = shift (@_);
302
303 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
304
305 if (!defined $filename) {
306 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
307 # if NULPlug processed file, then don't give a filename
308 if (defined $plugin_name && $plugin_name eq "NULPlug") {
309 $filename = undef;
310 } else { # returns the URL encoded source filename!
311 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
312 }
313 }
314
315 if (!&FileUtils::isFilenameAbsolute($filename)) {
316 $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
317 }
318
319 return $filename;
320}
321
322sub set_doc_type {
323 my $self = shift (@_);
324 my ($doc_type) = @_;
325
326 $self->set_metadata_element ($self->get_top_section(),
327 "gsdldoctype",
328 $doc_type);
329}
330
331# returns the gsdldoctype as it was provided
332# the default of "indexed_doc" is used if no document
333# type was provided
334sub get_doc_type {
335 my $self = shift (@_);
336
337 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
338 return $doc_type if (defined $doc_type);
339 return "indexed_doc";
340}
341
342
343# look up the reference to the a particular section
344sub _lookup_section {
345 my $self = shift (@_);
346 my ($section) = @_;
347
348 my ($num);
349 my $sectionref = $self;
350
351 while (defined $section && $section ne "") {
352
353 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
354
355 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
356
357 $section = "" unless defined $section;
358
359
360 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
361 $sectionref = $sectionref->{'subsections'}->{$num};
362 } else {
363 return undef;
364 }
365 }
366
367 return $sectionref;
368}
369
370# calculate OID by hashing the contents of the document
371sub _calc_OID {
372 my $self = shift (@_);
373 my ($filename) = @_;
374
375
376 my $osexe = &util::get_os_exe();
377
378 my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
379 $ENV{'GSDLOS'},"hashfile$osexe");
380
381 &util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
382 # The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
383
384 # A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
385 # (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
386 # E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
387
388 my $result = "NULL";
389
390
391 if (-e "$hashfile_exe") {
392# $result = `\"$hashfile_exe\" \"$filename\"`;
393# $result = `hashfile$osexe \"$filename\" 2>&1`;
394 $result = `hashfile$osexe \"$filename\"`;
395
396 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
397 } else {
398 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
399 }
400 return "HASH$result";
401}
402
403# methods dealing with OID, not groups of them.
404
405# if $OID is not provided one is calculated
406sub set_OID {
407 my $self = shift (@_);
408 my ($OID) = @_;
409
410 my $use_hash_oid = 0;
411 # if an OID wasn't provided calculate one
412 if (!defined $OID) {
413 $OID = "NULL";
414 if ($self->{'OIDtype'} =~ /^hash/) {
415 $use_hash_oid = 1;
416 } elsif ($self->{'OIDtype'} eq "incremental") {
417 $OID = "D" . $OIDcount;
418 $OIDcount ++;
419 } elsif ($self->{'OIDtype'} eq "filename") {
420 my $filename = $self->get_source_filename();
421 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
422 $OID = &util::tidy_up_oid($OID);
423 } elsif ($self->{'OIDtype'} eq "full_filename") {
424 my $source_filename = $self->get_source_filename();
425 my $dirsep = &util::get_os_dirsep();
426
427 $source_filename =~ s/^import$dirsep//;
428 $source_filename =~ s/$dirsep/-/g;
429 $source_filename =~ s/\./_/g;
430
431 $OID = $source_filename;
432 $OID = &util::tidy_up_oid($OID);
433 } elsif ($self->{'OIDtype'} eq "dirname") {
434 my $filename = $self->get_source_filename();
435 if (defined($filename)) { # && -e $filename) {
436 # get the immediate parent directory
437 $OID = &File::Basename::dirname($filename);
438 if (defined $OID) {
439 $OID = &File::Basename::basename($OID);
440 $OID = &util::tidy_up_oid($OID);
441 } else {
442 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
443 $use_hash_oid = 1;
444 }
445 } else {
446 print STDERR "Failed to find a filename, generating hash id\n";
447 $use_hash_oid = 1;
448 }
449
450 } elsif ($self->{'OIDtype'} eq "assigned") {
451 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
452 if (defined $identifier && $identifier ne "") {
453 $OID = $identifier;
454 $OID = &util::tidy_up_oid($OID);
455 } else {
456 # need a hash id
457 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
458 $use_hash_oid = 1;
459 }
460
461 } else {
462 $use_hash_oid = 1;
463 }
464
465 if ($use_hash_oid) {
466 my $hash_on_file = 1;
467 my $hash_on_ga_xml = 0;
468
469 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
470 $hash_on_file = 0;
471 $hash_on_ga_xml = 1;
472 }
473
474 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
475 $hash_on_file = 0;
476 $hash_on_ga_xml = 0;
477
478 my $source_filename = $self->get_source_filename();
479 my $dirsep = &util::get_os_dirsep();
480
481 $source_filename =~ s/^import$dirsep//;
482 $source_filename =~ s/$dirsep/-/g;
483 $source_filename =~ s/\./_/g;
484
485 # If the filename is very short then (handled naively)
486 # this can cause conjestion in the hash-values
487 # computed, leading documents sharing the same leading
488 # Hex values in the computed has.
489 #
490 # The solution taken here is to replace the name of
491 # the file name a sufficient number of times (up to
492 # the character limit defined in 'rep_limit' and
493 # make that the content that is hashed on
494
495 # *** Think twice before changing the following value
496 # as it will break backward compatability of computed
497 # document HASH values
498
499 my $rep_limit = 256;
500 my $hash_content = undef;
501
502 if (length($source_filename)<$rep_limit) {
503 my $rep_string = "$source_filename|";
504 my $rs_len = length($rep_string);
505
506 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
507
508 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
509 }
510 else {
511 $hash_content = $source_filename;
512 }
513
514 my $filename = &util::get_tmp_filename();
515 if (!open (OUTFILE, ">:utf8", $filename)) {
516 print STDERR "doc::set_OID could not write to $filename\n";
517 } else {
518 print OUTFILE $hash_content;
519 close (OUTFILE);
520 }
521 $OID = $self->_calc_OID ($filename);
522
523 &FileUtils::removeFiles ($filename);
524 }
525
526 if ($hash_on_file) {
527 # "hash" OID - feed file to hashfile.exe
528 my $filename = $self->get_filename_for_hashing();
529
530 # -z: don't want to hash on the file if it is zero size
531 if (defined($filename) && -e $filename && !-z $filename) {
532 $OID = $self->_calc_OID ($filename);
533 } else {
534 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
535 }
536 }
537
538 if ($hash_on_ga_xml) {
539 # In addition being asked to explicity calculate the has based on the GA file,
540 # can also end up coming into this block is doing 'hash_on_file' but the file
541 # itself is of zero bytes (as could be the case with 'doc.nul' file
542
543 my $filename = &util::get_tmp_filename();
544 if (!open (OUTFILE, ">:utf8", $filename)) {
545 print STDERR "doc::set_OID could not write to $filename\n";
546 } else {
547 my $doc_text = &docprint::get_section_xml($self);
548 print OUTFILE $doc_text;
549 close (OUTFILE);
550 }
551 $OID = $self->_calc_OID ($filename);
552 &FileUtils::removeFiles($filename);
553 }
554 }
555 }
556 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
557}
558
559# this uses hashdoc (embedded c thingy) which is faster but still
560# needs a little work to be sufficiently stable
561sub ___set_OID {
562 my $self = shift (@_);
563 my ($OID) = @_;
564
565 # if an OID wasn't provided then calculate hash value based on document
566 if (!defined $OID)
567 {
568 my $hash_text = &docprint::get_section_xml($self);
569 my $hash_len = length($hash_text);
570
571 $OID = &hashdoc::buffer($hash_text,$hash_len);
572 }
573
574 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
575}
576
577# returns the OID for this document
578sub get_OID {
579 my $self = shift (@_);
580 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
581 return $OID if (defined $OID);
582 return "NULL";
583}
584
585sub delete_OID {
586 my $self = shift (@_);
587
588 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
589}
590
591
592# methods for manipulating section names
593
594# returns the name of the top-most section (the top
595# level of the document
596sub get_top_section {
597 my $self = shift (@_);
598
599 return "";
600}
601
602# returns a section
603sub get_parent_section {
604 my $self = shift (@_);
605 my ($section) = @_;
606
607 $section =~ s/(^|\.)\d+$//;
608
609 return $section;
610}
611
612# returns the first child section (or the end child
613# if there isn't any)
614sub get_begin_child {
615 my $self = shift (@_);
616 my ($section) = @_;
617
618 my $section_ptr = $self->_lookup_section($section);
619 return "" unless defined $section_ptr;
620
621 if (defined $section_ptr->{'subsection_order'}->[0]) {
622 return "$section.$section_ptr->{'subsection_order'}->[0]";
623 }
624
625 return $self->get_end_child ($section);
626}
627
628# returns the next child of a parent section
629sub get_next_child {
630 my $self = shift (@_);
631 my ($section) = @_;
632
633 my $parent_section = $self->get_parent_section($section);
634 my $parent_section_ptr = $self->_lookup_section($parent_section);
635 return undef unless defined $parent_section_ptr;
636
637 my ($section_num) = $section =~ /(\d+)$/;
638 return undef unless defined $section_num;
639
640 my $i = 0;
641 my $section_order = $parent_section_ptr->{'subsection_order'};
642 while ($i < scalar(@$section_order)) {
643 last if $section_order->[$i] eq $section_num;
644 $i++;
645 }
646
647 $i++; # the next child
648 if ($i < scalar(@$section_order)) {
649 return $section_order->[$i] if $parent_section eq "";
650 return "$parent_section.$section_order->[$i]";
651 }
652
653 # no more sections in this level
654 return undef;
655}
656
657# returns a reference to a list of children
658sub get_children {
659 my $self = shift (@_);
660 my ($section) = @_;
661
662 my $section_ptr = $self->_lookup_section($section);
663 return [] unless defined $section_ptr;
664
665 my @children = @{$section_ptr->{'subsection_order'}};
666
667 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
668 return \@children;
669}
670
671# returns the child section one past the last one (which
672# is coded as "0")
673sub get_end_child {
674 my $self = shift (@_);
675 my ($section) = @_;
676
677 return $section . ".0" unless $section eq "";
678 return "0";
679}
680
681# returns the next section in book order
682sub get_next_section {
683 my $self = shift (@_);
684 my ($section) = @_;
685
686 return undef unless defined $section;
687
688 my $section_ptr = $self->_lookup_section($section);
689 return undef unless defined $section_ptr;
690
691 # first try to find first child
692 if (defined $section_ptr->{'subsection_order'}->[0]) {
693 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
694 return "$section.$section_ptr->{'subsection_order'}->[0]";
695 }
696
697 do {
698 # try to find sibling
699 my $next_child = $self->get_next_child ($section);
700 return $next_child if (defined $next_child);
701
702 # move up one level
703 $section = $self->get_parent_section ($section);
704 } while $section =~ /\d/;
705
706 return undef;
707}
708
709sub is_leaf_section {
710 my $self = shift (@_);
711 my ($section) = @_;
712
713 my $section_ptr = $self->_lookup_section($section);
714 return 1 unless defined $section_ptr;
715
716 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
717}
718
719# methods for dealing with sections
720
721# returns the name of the inserted section
722sub insert_section {
723 my $self = shift (@_);
724 my ($before_section) = @_;
725
726 # get the child to insert before and its parent section
727 my $parent_section = "";
728 my $before_child = "0";
729 my @before_section = split (/\./, $before_section);
730 if (scalar(@before_section) > 0) {
731 $before_child = pop (@before_section);
732 $parent_section = join (".", @before_section);
733 }
734
735 my $parent_section_ptr = $self->_lookup_section($parent_section);
736 if (!defined $parent_section_ptr) {
737 print STDERR "doc::insert_section couldn't find parent section " .
738 "$parent_section\n";
739 return;
740 }
741
742 # get the next section number
743 my $section_num = $parent_section_ptr->{'next_subsection'}++;
744
745 my $i = 0;
746 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
747 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
748 $i++;
749 }
750
751 # insert the section number into the order list
752 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
753
754 # add this section to the parent section
755 my $section_ptr = {'subsection_order'=>[],
756 'next_subsection'=>1,
757 'subsections'=>{},
758 'metadata'=>[],
759 'text'=>""};
760 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
761
762 # work out the full section number
763 my $section = $parent_section;
764 $section .= "." unless $section eq "";
765 $section .= $section_num;
766
767 return $section;
768}
769
770# creates a pre-named section
771sub create_named_section {
772 my $self = shift (@_);
773 my ($mastersection) = @_;
774
775 my ($num);
776 my $section = $mastersection;
777 my $sectionref = $self;
778
779 while ($section ne "") {
780 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
781 $num =~ s/^0+(\d)/$1/; # remove leading 0s
782 $section = "" unless defined $section;
783
784 if (defined $num) {
785 if (!defined $sectionref->{'subsections'}->{$num}) {
786 push (@{$sectionref->{'subsection_order'}}, $num);
787 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
788 'next_subsection'=>1,
789 'subsections'=>{},
790 'metadata'=>[],
791 'text'=>""};
792 if ($num >= $sectionref->{'next_subsection'}) {
793 $sectionref->{'next_subsection'} = $num + 1;
794 }
795 }
796 $sectionref = $sectionref->{'subsections'}->{$num};
797
798 } else {
799 print STDERR "doc::create_named_section couldn't create section ";
800 print STDERR "$mastersection\n";
801 last;
802 }
803 }
804}
805
806# returns a reference to a list of subsections
807sub list_subsections {
808 my $self = shift (@_);
809 my ($section) = @_;
810
811 my $section_ptr = $self->_lookup_section ($section);
812 if (!defined $section_ptr) {
813 print STDERR "doc::list_subsections couldn't find section $section\n";
814 return [];
815 }
816
817 return [@{$section_ptr->{'subsection_order'}}];
818}
819
820sub delete_section {
821 my $self = shift (@_);
822 my ($section) = @_;
823
824# my $section_ptr = {'subsection_order'=>[],
825# 'next_subsection'=>1,
826# 'subsections'=>{},
827# 'metadata'=>[],
828# 'text'=>""};
829
830 # if this is the top section reset everything
831 if ($section eq "") {
832 $self->{'subsection_order'} = [];
833 $self->{'subsections'} = {};
834 $self->{'metadata'} = [];
835 $self->{'text'} = "";
836 return;
837 }
838
839 # find the parent of the section to delete
840 my $parent_section = "";
841 my $child = "0";
842 my @section = split (/\./, $section);
843 if (scalar(@section) > 0) {
844 $child = pop (@section);
845 $parent_section = join (".", @section);
846 }
847
848 my $parent_section_ptr = $self->_lookup_section($parent_section);
849 if (!defined $parent_section_ptr) {
850 print STDERR "doc::delete_section couldn't find parent section " .
851 "$parent_section\n";
852 return;
853 }
854
855 # remove this section from the subsection_order list
856 my $i = 0;
857 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
858 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
859 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
860 last;
861 }
862 $i++;
863 }
864
865 # remove this section from the subsection hash
866 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
867 undef $parent_section_ptr->{'subsections'}->{$child};
868 }
869}
870
871#--
872# methods for dealing with metadata
873
874# set_metadata_element and get_metadata_element are for metadata
875# which should only have one value. add_meta_data and get_metadata
876# are for metadata which can have more than one value.
877
878# returns the first metadata value which matches field
879
880# This version of get metadata element works much like the one above,
881# except it allows for the namespace portion of a metadata element to
882# be ignored, thus if you are searching for dc.Title, the first piece
883# of matching metadata ending with the name Title (once any namespace
884# is removed) would be returned.
885# 28-11-2003 John Thompson
886sub get_metadata_element {
887 my $self = shift (@_);
888 my ($section, $field, $ignore_namespace) = @_;
889 my ($data);
890
891 $ignore_namespace = 0 unless defined $ignore_namespace;
892
893 my $section_ptr = $self->_lookup_section($section);
894 if (!defined $section_ptr) {
895 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
896 return;
897 }
898
899 # Remove any namespace if we are being told to ignore them
900 if($ignore_namespace) {
901 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
902 }
903
904 foreach $data (@{$section_ptr->{'metadata'}}) {
905
906 my $data_name = $data->[0];
907
908 # Remove any namespace if we are being told to ignore them
909 if($ignore_namespace) {
910 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
911 }
912 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
913 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
914
915 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
916 }
917
918 return undef; # was not found
919}
920
921# returns a list of the form [value1, value2, ...]
922sub get_metadata {
923 my $self = shift (@_);
924 my ($section, $field, $ignore_namespace) = @_;
925 my ($data);
926
927 $ignore_namespace = 0 unless defined $ignore_namespace;
928
929 my $section_ptr = $self->_lookup_section($section);
930 if (!defined $section_ptr) {
931 print STDERR "doc::get_metadata couldn't find section ",
932 $section, "\n";
933 return;
934 }
935
936 # Remove any namespace if we are being told to ignore them
937 if($ignore_namespace) {
938 $field =~ s/^.*\.//;
939 }
940
941 my @metadata = ();
942 foreach $data (@{$section_ptr->{'metadata'}}) {
943
944 my $data_name = $data->[0];
945
946 # Remove any namespace if we are being told to ignore them
947 if($ignore_namespace) {
948 $data_name =~ s/^.*\.//;
949 }
950 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
951 $data_name =~ s/^ex\.([^.]+)$/$1/;
952
953 push (@metadata, $data->[1]) if ($data_name eq $field);
954 }
955
956 return \@metadata;
957}
958
959sub get_metadata_hashmap {
960 my $self = shift (@_);
961 my ($section, $opt_namespace) = @_;
962
963 my $section_ptr = $self->_lookup_section($section);
964 if (!defined $section_ptr) {
965 print STDERR "doc::get_metadata couldn't find section ",
966 $section, "\n";
967 return;
968 }
969
970 my $metadata_hashmap = {};
971 foreach my $data (@{$section_ptr->{'metadata'}}) {
972 my $metaname = $data->[0];
973
974 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
975 if (!defined $metadata_hashmap->{$metaname}) {
976 $metadata_hashmap->{$metaname} = [];
977 }
978 my $metaval_list = $metadata_hashmap->{$metaname};
979 push(@$metaval_list, $data->[1]);
980 }
981 }
982
983 return $metadata_hashmap;
984}
985
986# returns a list of the form [[field,value],[field,value],...]
987sub get_all_metadata {
988 my $self = shift (@_);
989 my ($section) = @_;
990
991 my $section_ptr = $self->_lookup_section($section);
992 if (!defined $section_ptr) {
993 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
994 return;
995 }
996
997 return $section_ptr->{'metadata'};
998}
999
1000# $value is optional
1001sub delete_metadata {
1002 my $self = shift (@_);
1003 my ($section, $field, $value) = @_;
1004
1005 my $section_ptr = $self->_lookup_section($section);
1006 if (!defined $section_ptr) {
1007 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
1008 return;
1009 }
1010
1011 my $i = 0;
1012 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1013 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1014 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1015 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1016 } else {
1017 $i++;
1018 }
1019 }
1020}
1021
1022sub delete_all_metadata {
1023 my $self = shift (@_);
1024 my ($section) = @_;
1025
1026 my $section_ptr = $self->_lookup_section($section);
1027 if (!defined $section_ptr) {
1028 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1029 return;
1030 }
1031
1032 $section_ptr->{'metadata'} = [];
1033}
1034
1035sub set_metadata_element {
1036 my $self = shift (@_);
1037 my ($section, $field, $value) = @_;
1038
1039 $self->set_utf8_metadata_element ($section, $field,
1040 &unicode::ascii2utf8(\$value));
1041}
1042
1043# set_utf8_metadata_element assumes the text has already been
1044# converted to the UTF-8 encoding.
1045sub set_utf8_metadata_element {
1046 my $self = shift (@_);
1047 my ($section, $field, $value) = @_;
1048
1049 $self->delete_metadata ($section, $field);
1050 $self->add_utf8_metadata ($section, $field, $value);
1051}
1052
1053
1054# add_metadata assumes the text is in (extended) ascii form. For
1055# text which has already been converted to the UTF-8 format use
1056# add_utf8_metadata.
1057sub add_metadata {
1058 my $self = shift (@_);
1059 my ($section, $field, $value) = @_;
1060
1061 $self->add_utf8_metadata ($section, $field,
1062 &unicode::ascii2utf8(\$value));
1063}
1064
1065sub add_utf8_metadata {
1066 my $self = shift (@_);
1067 my ($section, $field, $value) = @_;
1068
1069 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1070 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1071 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1072
1073 my $section_ptr = $self->_lookup_section($section);
1074 if (!defined $section_ptr) {
1075 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1076 return;
1077 }
1078 if (!defined $value) {
1079 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1080 return;
1081 }
1082 if (!defined $field) {
1083 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1084 return;
1085 }
1086
1087 #print STDERR "###$field=$value\n";
1088
1089 # For now, supress this check. Given that text data read in is now
1090 # Unicode aware, then the following block of code can (ironically enough)
1091 # cause our unicode compliant string to be re-encoded (leading to
1092 # a double-encoded UTF-8 string, which we definitely don't want!).
1093
1094
1095 # double check that the value is utf-8
1096 # if (!&unicode::check_is_utf8($value)) {
1097 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1098 # &unicode::ensure_utf8(\$value);
1099 # print STDERR " Tried converting to utf8: $value\n";
1100 # }
1101
1102 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1103 if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1104 {
1105 my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1106 if (defined $mdprefix) {
1107 # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1108 push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1109 }
1110
1111 my $direction;
1112 if($value =~ m/^-/)
1113 {
1114 $direction = ($field eq "Latitude") ? "S" : "W";
1115 }
1116 else
1117 {
1118 $direction = ($field eq "Latitude") ? "N" : "E";
1119 }
1120
1121 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1122 if(defined $beforeDec && defined $afterDec)
1123 {
1124 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1125 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1126
1127 for(my $i = 2; $i <= 4; $i++)
1128 {
1129 if(length($afterDec) >= $i)
1130 {
1131 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1132 }
1133 }
1134
1135 #Only add the metadata if it has not already been added
1136 my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no side-effects.)
1137 }
1138
1139 if($field =~ m/^(.+\.)?Longitude$/) {
1140 # if we are dealing with Longitude meta, we should 1. have Latitude meta; 2. already have processed Latitude meta
1141 # in that case, add both Lat and Lng of this section as a Coordinate meta
1142 my $latitude = $self->get_metadata_element ($section, "Latitude");
1143 # TODO: would like all Longitude info together followed by all Coordinate info, but the following will add all coord info meta and end of this function will add Longitude meta
1144 $self->processCoordinate($section_ptr, $latitude, $value); # value is Longitude
1145 }
1146 }
1147
1148 elsif($field eq "GPS.mapOverlay") { # then the value is a JSON string
1149 print STDERR "GPS.mapOverlay before val: " . $value . "\n";
1150
1151 # TODO html decode?
1152 $value =~ s@&#091;@[@g;
1153 $value =~ s@&#093;@]@g;
1154 $value =~ s@&quot;@"@g;
1155 print STDERR "GPS.mapOverlay after val: " . $value . "\n";
1156
1157 my $json_array = decode_json $value;
1158 #my $json = JSON->new->allow_nonref;
1159 #&printAllShapes($json, $json_array);
1160
1161 foreach my $shape (@$json_array) {
1162
1163 my $type = $shape->{"type"};
1164 print STDERR "Type : " . $type . "\n";
1165
1166 if($type eq "circle") {
1167 #print STDERR "Found a circle:\n" . &printShape($json, $shape);
1168
1169 # work out bounding box
1170 # SCARY!
1171 # want the inverse of this useful page:
1172 # https://stackoverflow.com/questions/639695/how-to-convert-latitude-or-longitude-to-meters
1173 # https://www.geodatasource.com/developers/javascript
1174
1175
1176 # for now, just process the circle centre
1177 #my $centre = $shape->{"center"};
1178 #$self->processLatOrLng($section_ptr, "Latitude", $centre->{"lat"});
1179 #$self->processLatOrLng($section_ptr, "Longitude", $centre->{"lng"});
1180
1181
1182 # Dr Bainbridge wants us to use: https://gis.stackexchange.com/questions/5821/calculating-latitude-longitude-x-miles-from-point
1183 # But we're using the rule of thumb here, since for N,E,S,W it works out the same:
1184 # https://gis.stackexchange.com/questions/2951/algorithm-for-offsetting-a-latitude-longitude-by-some-amount-of-meters
1185 # which states
1186 # "If your displacements aren't too great (less than a few kilometers) and you're not right at the poles,
1187 # use the quick and dirty estimate that 111,111 meters (111.111 km) in the y direction is 1 degree
1188 # (of latitude) and 111,111 * cos(latitude) meters in the x direction is 1 degree (of longitude)."
1189 my $centre_lat = $shape->{"center"}->{"lat"};
1190 my $centre_lng = $shape->{"center"}->{"lng"};
1191 my $radius = $shape->{"radius"}; # in metres!
1192
1193 print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
1194
1195 my $lat_north = $centre_lat + ($radius/111111);
1196 my $lat_south = $centre_lat - ($radius/111111);
1197
1198 print STDERR "### lat_north: $lat_north\n";
1199 print STDERR "### lat_south: $lat_south\n";
1200
1201 # our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
1202 # all expect their angle to be in radians. So need to convert from degree to radians before we can take the cose of it.
1203 my $centre_lat_radians = $self->degreesToRadians($centre_lat);
1204 my $cos_in_radians = cos($centre_lat_radians);
1205 print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
1206 my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
1207 my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
1208 print STDERR "### lng_east $lng_east\n";
1209 print STDERR "### lng_west $lng_west\n";
1210
1211
1212
1213 my $cos_lat = cos($centre_lat);
1214 print STDERR "cos $centre_lat is $cos_lat\n";
1215
1216 $self->processCoordinate($section_ptr, $lat_north, $lng_east);
1217 $self->processCoordinate($section_ptr, $lat_south, $lng_east);
1218 $self->processCoordinate($section_ptr, $lat_south, $lng_west);
1219 $self->processCoordinate($section_ptr, $lat_north, $lng_west);
1220
1221 }
1222 elsif ($type eq "marker") {
1223 print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
1224 print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
1225 $self->processCoordinate($section_ptr, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});
1226 }
1227 elsif ($type eq "polyline" || $type eq "polygon") {
1228 my $path_array = $shape->{"path"};
1229 foreach my $position (@$path_array) {
1230 $self->processCoordinate($section_ptr, $position->{"lat"}, $position->{"lng"});
1231 }
1232 }
1233 elsif ($type eq "rectangle") {
1234
1235 my $bounds = $shape->{"bounds"};
1236
1237 $self->processCoordinate($section_ptr, $bounds->{"north"}, $bounds->{"east"});
1238 $self->processCoordinate($section_ptr, $bounds->{"south"}, $bounds->{"east"});
1239 $self->processCoordinate($section_ptr, $bounds->{"south"}, $bounds->{"west"});
1240 $self->processCoordinate($section_ptr, $bounds->{"north"}, $bounds->{"west"});
1241 }
1242
1243 } # end for on each shape in GPS.mapOverlay
1244
1245 } # end GPS.mapOverlay meta
1246
1247 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1248}
1249
1250# https://en.wikipedia.org/wiki/Radian
1251sub degreesToRadians
1252{
1253 my $self = shift (@_);
1254 my ($degrees) = @_;
1255
1256 return $degrees * pi /180; # returns radians
1257}
1258
1259sub radiansToDegrees
1260{
1261 my $self = shift (@_);
1262 my ($radians) = @_;
1263
1264 return $radians * 180 / pi; # returns degrees
1265}
1266
1267sub printAllShapes {
1268 my ($json, $json_array) = @_;
1269
1270
1271 #my $pretty_print_shape = $json->pretty->encode( $json_array->[0] );
1272 foreach my $shape (@$json_array) {
1273 my $pretty_print_shape = $json->pretty->encode( $shape );
1274 print STDERR "Shape: $pretty_print_shape\n";
1275 #&printShape($shape);
1276 }
1277
1278}
1279
1280sub processCoordinate {
1281 my $self = shift (@_);
1282 my ($section_ptr, $latitude, $longitude) = @_;
1283
1284 my $lat_direction = ($latitude =~ m/^-/) ? "S" : "N";
1285 my $lng_direction = ($longitude =~ m/^-/) ? "W" : "E";
1286
1287 # have to store (lat, lng) in pairs, when there are so many coords to store
1288 #push (@{$section_ptr->{'metadata'}}, ["Latitude", $latitude]);
1289 #push (@{$section_ptr->{'metadata'}}, ["Longitude", $longitude]);
1290
1291 push (@{$section_ptr->{'metadata'}}, ["Coordinate", "$latitude $longitude"]); # "$latitude$lat_direction $longitude$lng_direction"
1292
1293 my ($latBeforeDec, $latAfterDec);
1294 my ($lngBeforeDec, $lngAfterDec);
1295
1296 if($latitude !~ m/\./) {
1297 $latBeforeDec = $latitude;
1298 $latAfterDec = "";
1299 } else {
1300 ($latBeforeDec, $latAfterDec) = ($latitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1301 }
1302 if($longitude !~ m/\./) {
1303 $lngBeforeDec = $longitude;
1304 $lngAfterDec = "";
1305 } else {
1306 ($lngBeforeDec, $lngAfterDec) = ($longitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1307 }
1308
1309 #if(defined $beforeDec && defined $afterDec)
1310 #{
1311 my $name = "CoordShort";
1312 push (@{$section_ptr->{'metadata'}}, [$name, "$latBeforeDec$lat_direction $lngBeforeDec$lng_direction"]);
1313
1314 for(my $i = 2; $i <= 4; $i++)
1315 {
1316 my $latDecPlaces = (length($latAfterDec) >= $i) ? substr($latAfterDec, 0, $i) : "";
1317 my $lngDecPlaces = (length($lngAfterDec) >= $i) ? substr($lngAfterDec, 0, $i) : "";
1318
1319 push (@{$section_ptr->{'metadata'}}, [$name,
1320 $latBeforeDec . $lat_direction. $latDecPlaces . " " . $lngBeforeDec . $lng_direction. $lngDecPlaces]);
1321
1322 }
1323
1324 #Only add the metadata if it has not already been added
1325 #my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no side-effects.)
1326 #}
1327
1328
1329}
1330
1331
1332# methods for dealing with text
1333
1334# returns the text for a section
1335sub get_text {
1336 my $self = shift (@_);
1337 my ($section) = @_;
1338
1339 my $section_ptr = $self->_lookup_section($section);
1340 if (!defined $section_ptr) {
1341 print STDERR "doc::get_text couldn't find section " .
1342 "$section\n";
1343 return "";
1344 }
1345
1346 return $section_ptr->{'text'};
1347}
1348
1349# returns the (utf-8 encoded) length of the text for a section
1350sub get_text_length {
1351 my $self = shift (@_);
1352 my ($section) = @_;
1353
1354 my $section_ptr = $self->_lookup_section($section);
1355 if (!defined $section_ptr) {
1356 print STDERR "doc::get_text_length couldn't find section " .
1357 "$section\n";
1358 return 0;
1359 }
1360
1361 return length ($section_ptr->{'text'});
1362}
1363
1364# returns the total length for all the sections
1365sub get_total_text_length {
1366 my $self = shift (@_);
1367
1368 my $section = $self->get_top_section();
1369 my $length = 0;
1370 while (defined $section) {
1371 $length += $self->get_text_length($section);
1372 $section = $self->get_next_section($section);
1373 }
1374 return $length;
1375}
1376
1377sub delete_text {
1378 my $self = shift (@_);
1379 my ($section) = @_;
1380
1381 my $section_ptr = $self->_lookup_section($section);
1382 if (!defined $section_ptr) {
1383 print STDERR "doc::delete_text couldn't find section " .
1384 "$section\n";
1385 return;
1386 }
1387
1388 $section_ptr->{'text'} = "";
1389}
1390
1391# add_text assumes the text is in (extended) ascii form. For
1392# text which has been already converted to the UTF-8 format
1393# use add_utf8_text.
1394sub add_text {
1395 my $self = shift (@_);
1396 my ($section, $text) = @_;
1397
1398 # convert the text to UTF-8 encoded unicode characters
1399 # and add the text
1400 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1401}
1402
1403
1404# add_utf8_text assumes the text to be added has already
1405# been converted to the UTF-8 encoding. For ascii text use
1406# add_text
1407# Pass by value version (internally calls pass by ref version
1408# to avoid code duplication)
1409sub add_utf8_text {
1410 my $self = shift (@_);
1411 my ($section, $text) = @_;
1412
1413 $self->add_utf8_textref($section, \$text);
1414}
1415
1416# Pass by reference version, used by GreenstoneSQLPlugin for fulltext
1417sub add_utf8_textref {
1418 my $self = shift (@_);
1419 my ($section, $text_ref) = @_;
1420
1421 my $section_ptr = $self->_lookup_section($section);
1422 if (!defined $section_ptr) {
1423 print STDERR "doc::add_utf8_textref couldn't find section " .
1424 "$section\n";
1425 return;
1426 }
1427
1428 $section_ptr->{'text'} .= $$text_ref;
1429}
1430
1431# returns the Source meta, which is the utf8 filename generated.
1432# Added a separate method here for convenience
1433sub get_source {
1434 my $self = shift (@_);
1435 return $self->get_metadata_element ($self->get_top_section(), "Source");
1436}
1437
1438# returns the SourceFile meta, which is the url reference to the URL-encoded
1439# version of Source (the utf8 filename). Added a separate method here for convenience
1440sub get_sourcefile {
1441 my $self = shift (@_);
1442 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1443}
1444
1445# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1446# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1447sub get_assocfile_from_sourcefile {
1448 my $self = shift (@_);
1449
1450 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1451 my $top_section = $self->get_top_section();
1452 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1453
1454 # get the actual filename as it exists on the filesystem which this url refers to
1455 $source_file = &unicode::url_to_filename($source_file);
1456 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1457 return $assocfilename;
1458}
1459
1460# methods for dealing with associated files
1461
1462# a file is associated with a document, NOT a section.
1463# if section is defined it is noted in the data structure
1464# only so that files associated from a particular section
1465# may be removed later (using delete_section_assoc_files)
1466sub associate_file {
1467 my $self = shift (@_);
1468 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1469 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1470
1471 # remove all associated files with the same name
1472 $self->delete_assoc_file ($assoc_filename);
1473
1474 # Too harsh a requirement
1475 # Definitely get HTML docs, for example, with some missing
1476 # support files
1477# if (!&util::fd_exists($real_filename)) {
1478# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1479# exit -1;
1480# }
1481
1482# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1483# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1484## my $utf8_filename = Encode::encode("utf8",$filename);
1485
1486 push (@{$self->{'associated_files'}},
1487 [$real_filename, $assoc_filename, $mime_type, $section]);
1488}
1489
1490# returns a list of associated files in the form
1491# [[real_filename, assoc_filename, mimetype], ...]
1492sub get_assoc_files {
1493 my $self = shift (@_);
1494
1495 return $self->{'associated_files'};
1496}
1497
1498# the following two methods used to keep track of original associated files
1499# for incremental building. eg a txt file used by an item file does not end
1500# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1501# file for incremental build
1502sub associate_source_file {
1503 my $self = shift (@_);
1504 my ($full_filename) = @_;
1505
1506 push (@{$self->{'source_assoc_files'}}, $full_filename);
1507
1508}
1509
1510sub get_source_assoc_files {
1511 my $self = shift (@_);
1512
1513 return $self->{'source_assoc_files'};
1514
1515
1516}
1517sub metadata_file {
1518 my $self = shift (@_);
1519 my ($real_filename, $filename) = @_;
1520
1521 push (@{$self->{'metadata_files'}},
1522 [$real_filename, $filename]);
1523}
1524
1525# used for writing out the archiveinf-doc info database, to list all the metadata files
1526sub get_meta_files {
1527 my $self = shift (@_);
1528
1529 return $self->{'metadata_files'};
1530}
1531
1532sub delete_section_assoc_files {
1533 my $self = shift (@_);
1534 my ($section) = @_;
1535
1536 my $i=0;
1537 while ($i < scalar (@{$self->{'associated_files'}})) {
1538 if (defined $self->{'associated_files'}->[$i]->[3] &&
1539 $self->{'associated_files'}->[$i]->[3] eq $section) {
1540 splice (@{$self->{'associated_files'}}, $i, 1);
1541 } else {
1542 $i++;
1543 }
1544 }
1545}
1546
1547sub delete_assoc_file {
1548 my $self = shift (@_);
1549 my ($assoc_filename) = @_;
1550
1551 my $i=0;
1552 while ($i < scalar (@{$self->{'associated_files'}})) {
1553 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1554 splice (@{$self->{'associated_files'}}, $i, 1);
1555 } else {
1556 $i++;
1557 }
1558 }
1559}
1560
1561sub reset_nextsection_ptr {
1562 my $self = shift (@_);
1563 my ($section) = @_;
1564
1565 my $section_ptr = $self->_lookup_section($section);
1566 $section_ptr->{'next_subsection'} = 1;
1567}
1568
15691;
Note: See TracBrowser for help on using the repository browser.