source: main/trunk/greenstone2/perllib/doc.pm@ 33171

Last change on this file since 33171 was 33171, checked in by wy59, 5 years ago

First part of commit: following Dr Bainbridge's suggestion for how best to determine what phase of building we're in from within doc.pm. Needs a package level variable, used existing coding pattern in the file that used the our keyword.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 48.5 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use strict;
37use unicode;
38use util;
39use FileUtils;
40use ghtml;
41use File::stat;
42##use hashdoc;
43use docprint;
44use JSON;
45
46# We just need pi from the Trig lib
47# Import constants pi2, pip2, pip4 (2*pi, pi/2, pi/4).
48use Math::Trig ':pi';
49
50# the document type may be indexed_doc, nonindexed_doc, or
51# classification
52
53our $OIDcount = 0;
54
55
56# Declare the Package (level) Variable that will tell us whether we're in the import or buildcol phase.
57# This var will be set by import.pl, buildcol.pl and export.pl to the then appropriate value.
58# https://perlmaven.com/package-variables-and-lexical-variables-in-perl
59# But: https://stackoverflow.com/questions/7542120/perl-how-to-make-variables-from-requiring-script-available-in-required-script
60# https://stackoverflow.com/questions/33230015/how-do-i-use-a-variable-in-another-perl-script
61#$doc::cmd_line_mode = undef;
62# Decided to follow the usage of $OIDcount above to declare the necessary package-level variables 'globally' accessible
63# actually accessible only for those packages that import this one (with 'use doc')
64our $cmd_line_mode = undef;
65
66# rename_method can be 'url', 'none', 'base64'
67sub new {
68 my $class = shift (@_);
69 my ($source_filename, $doc_type, $rename_method) = @_;
70
71
72 my $self = bless {'associated_files'=>[],
73 'subsection_order'=>[],
74 'next_subsection'=>1,
75 'subsections'=>{},
76 'metadata'=>[],
77 'text'=>"",
78 'OIDtype'=>"hash"}, $class;
79
80 # used to set lastmodified here, but this can screw up the HASH ids, so
81 # the docsave processor now calls set_lastmodified
82
83 $self->set_source_path($source_filename);
84
85 if (defined $source_filename) {
86 $source_filename = &util::filename_within_collection($source_filename);
87 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
88 $self->set_source_filename ($source_filename, $rename_method);
89 }
90
91 $self->set_doc_type ($doc_type) if defined $doc_type;
92
93 return $self;
94}
95
96
97sub set_source_path
98{
99 my $self = shift @_;
100 my ($source_filename) = @_;
101
102 if (defined $source_filename) {
103 # On Windows the source_filename can be in terse DOS format
104 # e.g. test~1.txt
105
106 $self->{'terse_source_path'} = $source_filename;
107
108 # Use the FileUtil library methods as they are aware of more special
109 # cases such as HDFS [jmt12]
110 if (&FileUtils::fileExists($source_filename))
111 {
112 # See if we can do better for Windows with a filename
113 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
114 require Win32;
115 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
116 }
117 else {
118 # For Unix-based systems, there is no difference between the two
119 $self->{'source_path'} = $source_filename;
120 }
121 }
122 else {
123 print STDERR "Warning: In doc::set_source_path(), file\n";
124 print STDERR " $source_filename\n";
125 print STDERR " does not exist\n";
126
127 # (default) Set it to whatever we were given
128 $self->{'source_path'} = $source_filename;
129 }
130 }
131 else {
132 # Previous code for setting source_path allowed for
133 # it to be undefined, so continue this practice
134 $self->{'terse_source_path'} = undef;
135 $self->{'source_path'} = undef;
136 }
137}
138
139
140sub get_source_path
141{
142 my $self = shift @_;
143
144 return $self->{'terse_source_path'};
145}
146
147# set lastmodified for OAI purposes, added by GRB, moved by kjdon
148sub set_oailastmodified {
149 my $self = shift (@_);
150
151 my $source_path = $self->{'terse_source_path'};
152
153 if (defined $source_path && (-e $source_path)) {
154 my $current_time = time;
155
156 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
157 $wday, $yday, $isdst) = localtime($current_time);
158
159 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
160
161 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
162 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
163 }
164}
165
166# no longer used for OAI purposes, since lastmodified is not what we want as the
167# Datestamp of a document. This doc metadata may be useful for general purposes.
168sub set_lastmodified {
169 my $self = shift (@_);
170
171 my $source_path = $self->{'terse_source_path'};
172
173 if (defined $source_path && (-e $source_path)) {
174
175 my $file_stat = stat($source_path);
176 my $mtime = $file_stat->mtime;
177 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
178 $wday, $yday, $isdst) = localtime($mtime);
179
180 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
181
182 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
183 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
184 }
185}
186
187# clone the $self object
188sub duplicate {
189 my $self = shift (@_);
190
191 my $newobj = {};
192
193 foreach my $k (keys %$self) {
194 $newobj->{$k} = &clone ($self->{$k});
195 }
196
197 bless $newobj, ref($self);
198 return $newobj;
199}
200
201sub clone {
202 my ($from) = @_;
203 my $type = ref ($from);
204
205 if ($type eq "HASH") {
206 my $to = {};
207 foreach my $key (keys %$from) {
208 $to->{$key} = &clone ($from->{$key});
209 }
210 return $to;
211 } elsif ($type eq "ARRAY") {
212 my $to = [];
213 foreach my $v (@$from) {
214 push (@$to, &clone ($v));
215 }
216 return $to;
217 } else {
218 return $from;
219 }
220}
221
222sub set_OIDtype {
223 my $self = shift (@_);
224 my ($type, $metadata) = @_;
225
226 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|hash_on_full_filename|incremental|filename|dirname|full_filename|assigned)$/) {
227 $self->{'OIDtype'} = $type;
228 } else {
229 $self->{'OIDtype'} = "hash";
230 }
231
232 if ($type =~ /^assigned$/) {
233 if (defined $metadata) {
234 $self->{'OIDmetadata'} = $metadata;
235 } else {
236 $self->{'OIDmetadata'} = "dc.Identifier";
237 }
238 }
239}
240
241# rename_method can be 'url', 'none', 'base64'
242sub set_source_filename {
243 my $self = shift (@_);
244 my ($source_filename, $rename_method) = @_;
245
246 # Since the gsdlsourcefilename element goes into the doc.xml it has
247 # to be utf8. However, it should also *represent* the source filename
248 # (in the import directory) which may not be utf8 at all.
249 # For instance, if this meta element (gsdlsourcefilename) will be used
250 # by other applications that parse doc.xml in order to locate
251 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
252 # encode the real filename as this is a binary-to-text encoding meaning
253 # that the resulting string is ASCII (utf8). Decoding will give the original.
254
255# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
256
257 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
258 $source_filename = $self->encode_filename($source_filename, $rename_method);
259# my ($srcfilename,$dirname,$suffix)
260# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
261# print STDERR "-> $srcfilename -> ";
262# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
263# $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
264# print STDERR "$source_filename\n";
265
266 $self->set_utf8_metadata_element ($self->get_top_section(),
267 "gsdlsourcefilename",
268 $source_filename);
269}
270
271sub encode_filename {
272 my $self = shift (@_);
273 my ($source_filename, $rename_method) = @_;
274
275 my ($srcfilename,$dirname,$suffix)
276 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
277# print STDERR "-> $srcfilename -> ";
278 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
279 $source_filename = &FileUtils::filenameConcatenate($dirname, $srcfilename);
280
281 return $source_filename;
282}
283
284sub set_converted_filename {
285 my $self = shift (@_);
286 my ($converted_filename) = @_;
287
288 # we know the converted filename is utf8
289 $self->set_utf8_metadata_element ($self->get_top_section(),
290 "gsdlconvertedfilename",
291 $converted_filename);
292}
293
294# returns the source_filename as it was provided
295sub get_unmodified_source_filename {
296 my $self = shift (@_);
297
298 return $self->{'terse_source_path'};
299}
300
301# returns the source_filename with whatever rename_method was given
302sub get_source_filename {
303 my $self = shift (@_);
304
305 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
306}
307
308
309
310# returns converted filename if available else returns source filename
311sub get_filename_for_hashing {
312 my $self = shift (@_);
313
314 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
315
316 if (!defined $filename) {
317 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
318 # if NULPlug processed file, then don't give a filename
319 if (defined $plugin_name && $plugin_name eq "NULPlug") {
320 $filename = undef;
321 } else { # returns the URL encoded source filename!
322 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
323 }
324 }
325
326 if (!&FileUtils::isFilenameAbsolute($filename)) {
327 $filename = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'},$filename);
328 }
329
330 return $filename;
331}
332
333sub set_doc_type {
334 my $self = shift (@_);
335 my ($doc_type) = @_;
336
337 $self->set_metadata_element ($self->get_top_section(),
338 "gsdldoctype",
339 $doc_type);
340}
341
342# returns the gsdldoctype as it was provided
343# the default of "indexed_doc" is used if no document
344# type was provided
345sub get_doc_type {
346 my $self = shift (@_);
347
348 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
349 return $doc_type if (defined $doc_type);
350 return "indexed_doc";
351}
352
353
354# look up the reference to the a particular section
355sub _lookup_section {
356 my $self = shift (@_);
357 my ($section) = @_;
358
359 my ($num);
360 my $sectionref = $self;
361
362 while (defined $section && $section ne "") {
363
364 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
365
366 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
367
368 $section = "" unless defined $section;
369
370
371 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
372 $sectionref = $sectionref->{'subsections'}->{$num};
373 } else {
374 return undef;
375 }
376 }
377
378 return $sectionref;
379}
380
381# calculate OID by hashing the contents of the document
382sub _calc_OID {
383 my $self = shift (@_);
384 my ($filename) = @_;
385
386
387 my $osexe = &util::get_os_exe();
388
389 my $hashfile_exe = &FileUtils::filenameConcatenate($ENV{'GSDLHOME'},"bin",
390 $ENV{'GSDLOS'},"hashfile$osexe");
391
392 &util::set_gnomelib_env(); # gnomelib_env (particularly lib/libiconv2.dylib) required to run the hashfile executable on Mac Lions
393 # The subroutine will set the gnomelib env once for each subshell launched, by first testing if GEXTGNOME is not already set
394
395 # A different way to set the gnomelib env would be to do it more locally: exporting the necessary vars
396 # (specifically DYLD/LD_LIB_PATH) for gnome_lib as part of the command executed.
397 # E.g. $result=`export LD_LIBRARY_PATH=../ext/gnome-lib/darwin/lib; hashfile...`
398
399 my $result = "NULL";
400
401
402 if (-e "$hashfile_exe") {
403# $result = `\"$hashfile_exe\" \"$filename\"`;
404# $result = `hashfile$osexe \"$filename\" 2>&1`;
405 $result = `hashfile$osexe \"$filename\"`;
406
407 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
408 } else {
409 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
410 }
411 return "HASH$result";
412}
413
414# methods dealing with OID, not groups of them.
415
416# if $OID is not provided one is calculated
417sub set_OID {
418 my $self = shift (@_);
419 my ($OID) = @_;
420
421 my $use_hash_oid = 0;
422 # if an OID wasn't provided calculate one
423 if (!defined $OID) {
424 $OID = "NULL";
425 if ($self->{'OIDtype'} =~ /^hash/) {
426 $use_hash_oid = 1;
427 } elsif ($self->{'OIDtype'} eq "incremental") {
428 $OID = "D" . $OIDcount;
429 $OIDcount ++;
430 } elsif ($self->{'OIDtype'} eq "filename") {
431 my $filename = $self->get_source_filename();
432 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
433 $OID = &util::tidy_up_oid($OID);
434 } elsif ($self->{'OIDtype'} eq "full_filename") {
435 my $source_filename = $self->get_source_filename();
436 my $dirsep = &util::get_os_dirsep();
437
438 $source_filename =~ s/^import$dirsep//;
439 $source_filename =~ s/$dirsep/-/g;
440 $source_filename =~ s/\./_/g;
441
442 $OID = $source_filename;
443 $OID = &util::tidy_up_oid($OID);
444 } elsif ($self->{'OIDtype'} eq "dirname") {
445 my $filename = $self->get_source_filename();
446 if (defined($filename)) { # && -e $filename) {
447 # get the immediate parent directory
448 $OID = &File::Basename::dirname($filename);
449 if (defined $OID) {
450 $OID = &File::Basename::basename($OID);
451 $OID = &util::tidy_up_oid($OID);
452 } else {
453 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
454 $use_hash_oid = 1;
455 }
456 } else {
457 print STDERR "Failed to find a filename, generating hash id\n";
458 $use_hash_oid = 1;
459 }
460
461 } elsif ($self->{'OIDtype'} eq "assigned") {
462 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
463 if (defined $identifier && $identifier ne "") {
464 $OID = $identifier;
465 $OID = &util::tidy_up_oid($OID);
466 } else {
467 # need a hash id
468 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
469 $use_hash_oid = 1;
470 }
471
472 } else {
473 $use_hash_oid = 1;
474 }
475
476 if ($use_hash_oid) {
477 my $hash_on_file = 1;
478 my $hash_on_ga_xml = 0;
479
480 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
481 $hash_on_file = 0;
482 $hash_on_ga_xml = 1;
483 }
484
485 if ($self->{'OIDtype'} eq "hash_on_full_filename") {
486 $hash_on_file = 0;
487 $hash_on_ga_xml = 0;
488
489 my $source_filename = $self->get_source_filename();
490 my $dirsep = &util::get_os_dirsep();
491
492 $source_filename =~ s/^import$dirsep//;
493 $source_filename =~ s/$dirsep/-/g;
494 $source_filename =~ s/\./_/g;
495
496 # If the filename is very short then (handled naively)
497 # this can cause conjestion in the hash-values
498 # computed, leading documents sharing the same leading
499 # Hex values in the computed has.
500 #
501 # The solution taken here is to replace the name of
502 # the file name a sufficient number of times (up to
503 # the character limit defined in 'rep_limit' and
504 # make that the content that is hashed on
505
506 # *** Think twice before changing the following value
507 # as it will break backward compatability of computed
508 # document HASH values
509
510 my $rep_limit = 256;
511 my $hash_content = undef;
512
513 if (length($source_filename)<$rep_limit) {
514 my $rep_string = "$source_filename|";
515 my $rs_len = length($rep_string);
516
517 my $clone_times = int(($rep_limit-1)/$rs_len) +1;
518
519 $hash_content = substr($rep_string x $clone_times, 0, $rep_limit);
520 }
521 else {
522 $hash_content = $source_filename;
523 }
524
525 my $filename = &util::get_tmp_filename();
526 if (!open (OUTFILE, ">:utf8", $filename)) {
527 print STDERR "doc::set_OID could not write to $filename\n";
528 } else {
529 print OUTFILE $hash_content;
530 close (OUTFILE);
531 }
532 $OID = $self->_calc_OID ($filename);
533
534 &FileUtils::removeFiles ($filename);
535 }
536
537 if ($hash_on_file) {
538 # "hash" OID - feed file to hashfile.exe
539 my $filename = $self->get_filename_for_hashing();
540
541 # -z: don't want to hash on the file if it is zero size
542 if (defined($filename) && -e $filename && !-z $filename) {
543 $OID = $self->_calc_OID ($filename);
544 } else {
545 $hash_on_ga_xml = 1; # switch to back-up plan, and hash on GA file instead
546 }
547 }
548
549 if ($hash_on_ga_xml) {
550 # In addition being asked to explicity calculate the has based on the GA file,
551 # can also end up coming into this block is doing 'hash_on_file' but the file
552 # itself is of zero bytes (as could be the case with 'doc.nul' file
553
554 my $filename = &util::get_tmp_filename();
555 if (!open (OUTFILE, ">:utf8", $filename)) {
556 print STDERR "doc::set_OID could not write to $filename\n";
557 } else {
558 my $doc_text = &docprint::get_section_xml($self);
559 print OUTFILE $doc_text;
560 close (OUTFILE);
561 }
562 $OID = $self->_calc_OID ($filename);
563 &FileUtils::removeFiles($filename);
564 }
565 }
566 }
567 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
568}
569
570# this uses hashdoc (embedded c thingy) which is faster but still
571# needs a little work to be sufficiently stable
572sub ___set_OID {
573 my $self = shift (@_);
574 my ($OID) = @_;
575
576 # if an OID wasn't provided then calculate hash value based on document
577 if (!defined $OID)
578 {
579 my $hash_text = &docprint::get_section_xml($self);
580 my $hash_len = length($hash_text);
581
582 $OID = &hashdoc::buffer($hash_text,$hash_len);
583 }
584
585 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
586}
587
588# returns the OID for this document
589sub get_OID {
590 my $self = shift (@_);
591 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
592 return $OID if (defined $OID);
593 return "NULL";
594}
595
596sub delete_OID {
597 my $self = shift (@_);
598
599 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
600}
601
602
603# methods for manipulating section names
604
605# returns the name of the top-most section (the top
606# level of the document
607sub get_top_section {
608 my $self = shift (@_);
609
610 return "";
611}
612
613# returns a section
614sub get_parent_section {
615 my $self = shift (@_);
616 my ($section) = @_;
617
618 $section =~ s/(^|\.)\d+$//;
619
620 return $section;
621}
622
623# returns the first child section (or the end child
624# if there isn't any)
625sub get_begin_child {
626 my $self = shift (@_);
627 my ($section) = @_;
628
629 my $section_ptr = $self->_lookup_section($section);
630 return "" unless defined $section_ptr;
631
632 if (defined $section_ptr->{'subsection_order'}->[0]) {
633 return "$section.$section_ptr->{'subsection_order'}->[0]";
634 }
635
636 return $self->get_end_child ($section);
637}
638
639# returns the next child of a parent section
640sub get_next_child {
641 my $self = shift (@_);
642 my ($section) = @_;
643
644 my $parent_section = $self->get_parent_section($section);
645 my $parent_section_ptr = $self->_lookup_section($parent_section);
646 return undef unless defined $parent_section_ptr;
647
648 my ($section_num) = $section =~ /(\d+)$/;
649 return undef unless defined $section_num;
650
651 my $i = 0;
652 my $section_order = $parent_section_ptr->{'subsection_order'};
653 while ($i < scalar(@$section_order)) {
654 last if $section_order->[$i] eq $section_num;
655 $i++;
656 }
657
658 $i++; # the next child
659 if ($i < scalar(@$section_order)) {
660 return $section_order->[$i] if $parent_section eq "";
661 return "$parent_section.$section_order->[$i]";
662 }
663
664 # no more sections in this level
665 return undef;
666}
667
668# returns a reference to a list of children
669sub get_children {
670 my $self = shift (@_);
671 my ($section) = @_;
672
673 my $section_ptr = $self->_lookup_section($section);
674 return [] unless defined $section_ptr;
675
676 my @children = @{$section_ptr->{'subsection_order'}};
677
678 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
679 return \@children;
680}
681
682# returns the child section one past the last one (which
683# is coded as "0")
684sub get_end_child {
685 my $self = shift (@_);
686 my ($section) = @_;
687
688 return $section . ".0" unless $section eq "";
689 return "0";
690}
691
692# returns the next section in book order
693sub get_next_section {
694 my $self = shift (@_);
695 my ($section) = @_;
696
697 return undef unless defined $section;
698
699 my $section_ptr = $self->_lookup_section($section);
700 return undef unless defined $section_ptr;
701
702 # first try to find first child
703 if (defined $section_ptr->{'subsection_order'}->[0]) {
704 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
705 return "$section.$section_ptr->{'subsection_order'}->[0]";
706 }
707
708 do {
709 # try to find sibling
710 my $next_child = $self->get_next_child ($section);
711 return $next_child if (defined $next_child);
712
713 # move up one level
714 $section = $self->get_parent_section ($section);
715 } while $section =~ /\d/;
716
717 return undef;
718}
719
720sub is_leaf_section {
721 my $self = shift (@_);
722 my ($section) = @_;
723
724 my $section_ptr = $self->_lookup_section($section);
725 return 1 unless defined $section_ptr;
726
727 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
728}
729
730# methods for dealing with sections
731
732# returns the name of the inserted section
733sub insert_section {
734 my $self = shift (@_);
735 my ($before_section) = @_;
736
737 # get the child to insert before and its parent section
738 my $parent_section = "";
739 my $before_child = "0";
740 my @before_section = split (/\./, $before_section);
741 if (scalar(@before_section) > 0) {
742 $before_child = pop (@before_section);
743 $parent_section = join (".", @before_section);
744 }
745
746 my $parent_section_ptr = $self->_lookup_section($parent_section);
747 if (!defined $parent_section_ptr) {
748 print STDERR "doc::insert_section couldn't find parent section " .
749 "$parent_section\n";
750 return;
751 }
752
753 # get the next section number
754 my $section_num = $parent_section_ptr->{'next_subsection'}++;
755
756 my $i = 0;
757 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
758 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
759 $i++;
760 }
761
762 # insert the section number into the order list
763 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
764
765 # add this section to the parent section
766 my $section_ptr = {'subsection_order'=>[],
767 'next_subsection'=>1,
768 'subsections'=>{},
769 'metadata'=>[],
770 'text'=>""};
771 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
772
773 # work out the full section number
774 my $section = $parent_section;
775 $section .= "." unless $section eq "";
776 $section .= $section_num;
777
778 return $section;
779}
780
781# creates a pre-named section
782sub create_named_section {
783 my $self = shift (@_);
784 my ($mastersection) = @_;
785
786 my ($num);
787 my $section = $mastersection;
788 my $sectionref = $self;
789
790 while ($section ne "") {
791 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
792 $num =~ s/^0+(\d)/$1/; # remove leading 0s
793 $section = "" unless defined $section;
794
795 if (defined $num) {
796 if (!defined $sectionref->{'subsections'}->{$num}) {
797 push (@{$sectionref->{'subsection_order'}}, $num);
798 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
799 'next_subsection'=>1,
800 'subsections'=>{},
801 'metadata'=>[],
802 'text'=>""};
803 if ($num >= $sectionref->{'next_subsection'}) {
804 $sectionref->{'next_subsection'} = $num + 1;
805 }
806 }
807 $sectionref = $sectionref->{'subsections'}->{$num};
808
809 } else {
810 print STDERR "doc::create_named_section couldn't create section ";
811 print STDERR "$mastersection\n";
812 last;
813 }
814 }
815}
816
817# returns a reference to a list of subsections
818sub list_subsections {
819 my $self = shift (@_);
820 my ($section) = @_;
821
822 my $section_ptr = $self->_lookup_section ($section);
823 if (!defined $section_ptr) {
824 print STDERR "doc::list_subsections couldn't find section $section\n";
825 return [];
826 }
827
828 return [@{$section_ptr->{'subsection_order'}}];
829}
830
831sub delete_section {
832 my $self = shift (@_);
833 my ($section) = @_;
834
835# my $section_ptr = {'subsection_order'=>[],
836# 'next_subsection'=>1,
837# 'subsections'=>{},
838# 'metadata'=>[],
839# 'text'=>""};
840
841 # if this is the top section reset everything
842 if ($section eq "") {
843 $self->{'subsection_order'} = [];
844 $self->{'subsections'} = {};
845 $self->{'metadata'} = [];
846 $self->{'text'} = "";
847 return;
848 }
849
850 # find the parent of the section to delete
851 my $parent_section = "";
852 my $child = "0";
853 my @section = split (/\./, $section);
854 if (scalar(@section) > 0) {
855 $child = pop (@section);
856 $parent_section = join (".", @section);
857 }
858
859 my $parent_section_ptr = $self->_lookup_section($parent_section);
860 if (!defined $parent_section_ptr) {
861 print STDERR "doc::delete_section couldn't find parent section " .
862 "$parent_section\n";
863 return;
864 }
865
866 # remove this section from the subsection_order list
867 my $i = 0;
868 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
869 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
870 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
871 last;
872 }
873 $i++;
874 }
875
876 # remove this section from the subsection hash
877 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
878 undef $parent_section_ptr->{'subsections'}->{$child};
879 }
880}
881
882#--
883# methods for dealing with metadata
884
885# set_metadata_element and get_metadata_element are for metadata
886# which should only have one value. add_meta_data and get_metadata
887# are for metadata which can have more than one value.
888
889# returns the first metadata value which matches field
890
891# This version of get metadata element works much like the one above,
892# except it allows for the namespace portion of a metadata element to
893# be ignored, thus if you are searching for dc.Title, the first piece
894# of matching metadata ending with the name Title (once any namespace
895# is removed) would be returned.
896# 28-11-2003 John Thompson
897sub get_metadata_element {
898 my $self = shift (@_);
899 my ($section, $field, $ignore_namespace) = @_;
900 my ($data);
901
902 $ignore_namespace = 0 unless defined $ignore_namespace;
903
904 my $section_ptr = $self->_lookup_section($section);
905 if (!defined $section_ptr) {
906 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
907 return;
908 }
909
910 # Remove any namespace if we are being told to ignore them
911 if($ignore_namespace) {
912 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
913 }
914
915 foreach $data (@{$section_ptr->{'metadata'}}) {
916
917 my $data_name = $data->[0];
918
919 # Remove any namespace if we are being told to ignore them
920 if($ignore_namespace) {
921 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
922 }
923 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
924 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
925
926 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
927 }
928
929 return undef; # was not found
930}
931
932# returns a list of the form [value1, value2, ...]
933sub get_metadata {
934 my $self = shift (@_);
935 my ($section, $field, $ignore_namespace) = @_;
936 my ($data);
937
938 $ignore_namespace = 0 unless defined $ignore_namespace;
939
940 my $section_ptr = $self->_lookup_section($section);
941 if (!defined $section_ptr) {
942 print STDERR "doc::get_metadata couldn't find section ",
943 $section, "\n";
944 return;
945 }
946
947 # Remove any namespace if we are being told to ignore them
948 if($ignore_namespace) {
949 $field =~ s/^.*\.//;
950 }
951
952 my @metadata = ();
953 foreach $data (@{$section_ptr->{'metadata'}}) {
954
955 my $data_name = $data->[0];
956
957 # Remove any namespace if we are being told to ignore them
958 if($ignore_namespace) {
959 $data_name =~ s/^.*\.//;
960 }
961 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
962 $data_name =~ s/^ex\.([^.]+)$/$1/;
963
964 push (@metadata, $data->[1]) if ($data_name eq $field);
965 }
966
967 return \@metadata;
968}
969
970sub get_metadata_hashmap {
971 my $self = shift (@_);
972 my ($section, $opt_namespace) = @_;
973
974 my $section_ptr = $self->_lookup_section($section);
975 if (!defined $section_ptr) {
976 print STDERR "doc::get_metadata couldn't find section ",
977 $section, "\n";
978 return;
979 }
980
981 my $metadata_hashmap = {};
982 foreach my $data (@{$section_ptr->{'metadata'}}) {
983 my $metaname = $data->[0];
984
985 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
986 if (!defined $metadata_hashmap->{$metaname}) {
987 $metadata_hashmap->{$metaname} = [];
988 }
989 my $metaval_list = $metadata_hashmap->{$metaname};
990 push(@$metaval_list, $data->[1]);
991 }
992 }
993
994 return $metadata_hashmap;
995}
996
997# returns a list of the form [[field,value],[field,value],...]
998sub get_all_metadata {
999 my $self = shift (@_);
1000 my ($section) = @_;
1001
1002 my $section_ptr = $self->_lookup_section($section);
1003 if (!defined $section_ptr) {
1004 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
1005 return;
1006 }
1007
1008 return $section_ptr->{'metadata'};
1009}
1010
1011# $value is optional
1012sub delete_metadata {
1013 my $self = shift (@_);
1014 my ($section, $field, $value) = @_;
1015
1016 my $section_ptr = $self->_lookup_section($section);
1017 if (!defined $section_ptr) {
1018 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
1019 return;
1020 }
1021
1022 my $i = 0;
1023 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1024 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1025 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1026 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1027 } else {
1028 $i++;
1029 }
1030 }
1031}
1032
1033sub delete_all_metadata {
1034 my $self = shift (@_);
1035 my ($section) = @_;
1036
1037 my $section_ptr = $self->_lookup_section($section);
1038 if (!defined $section_ptr) {
1039 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1040 return;
1041 }
1042
1043 $section_ptr->{'metadata'} = [];
1044}
1045
1046sub set_metadata_element {
1047 my $self = shift (@_);
1048 my ($section, $field, $value) = @_;
1049
1050 $self->set_utf8_metadata_element ($section, $field,
1051 &unicode::ascii2utf8(\$value));
1052}
1053
1054# set_utf8_metadata_element assumes the text has already been
1055# converted to the UTF-8 encoding.
1056sub set_utf8_metadata_element {
1057 my $self = shift (@_);
1058 my ($section, $field, $value) = @_;
1059
1060 $self->delete_metadata ($section, $field);
1061 $self->add_utf8_metadata ($section, $field, $value);
1062}
1063
1064
1065# add_metadata assumes the text is in (extended) ascii form. For
1066# text which has already been converted to the UTF-8 format use
1067# add_utf8_metadata.
1068sub add_metadata {
1069 my $self = shift (@_);
1070 my ($section, $field, $value) = @_;
1071
1072 $self->add_utf8_metadata ($section, $field,
1073 &unicode::ascii2utf8(\$value));
1074}
1075
1076sub add_utf8_metadata {
1077 my $self = shift (@_);
1078 my ($section, $field, $value) = @_;
1079
1080 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
1081 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
1082 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
1083
1084 my $section_ptr = $self->_lookup_section($section);
1085 if (!defined $section_ptr) {
1086 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1087 return;
1088 }
1089 if (!defined $value) {
1090 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1091 return;
1092 }
1093 if (!defined $field) {
1094 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1095 return;
1096 }
1097
1098 #print STDERR "###$field=$value\n";
1099
1100 # For now, supress this check. Given that text data read in is now
1101 # Unicode aware, then the following block of code can (ironically enough)
1102 # cause our unicode compliant string to be re-encoded (leading to
1103 # a double-encoded UTF-8 string, which we definitely don't want!).
1104
1105
1106 # double check that the value is utf-8
1107 # if (!&unicode::check_is_utf8($value)) {
1108 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1109 # &unicode::ensure_utf8(\$value);
1110 # print STDERR " Tried converting to utf8: $value\n";
1111 # }
1112
1113 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1114 if ($field =~ m/^(.+\.)?Latitude$/ || $field =~ m/^(.+\.)?Longitude$/)
1115 {
1116 my ($mdprefix,$metaname) = ($field =~ m/(.+)\.(.+)$/);
1117 if (defined $mdprefix) {
1118 # Add in a version of Latitude/Longitude without the metadata namespace prefix to keep Runtime happy
1119 push (@{$section_ptr->{'metadata'}}, [$metaname, $value]);
1120 }
1121
1122 my $direction;
1123 if($value =~ m/^-/)
1124 {
1125 $direction = ($field eq "Latitude") ? "S" : "W";
1126 }
1127 else
1128 {
1129 $direction = ($field eq "Latitude") ? "N" : "E";
1130 }
1131
1132 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1133 if(defined $beforeDec && defined $afterDec)
1134 {
1135 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1136 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1137
1138 for(my $i = 2; $i <= 4; $i++)
1139 {
1140 if(length($afterDec) >= $i)
1141 {
1142 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1143 }
1144 }
1145
1146 #Only add the metadata if it has not already been added
1147 my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no side-effects.)
1148 }
1149
1150 if($field =~ m/^(.+\.)?Longitude$/) {
1151 # if we are dealing with Longitude meta, we should 1. have Latitude meta too; 2. already have processed Latitude meta
1152 # in that case, add both Lat and Lng of this section as a Coordinate meta
1153 my $latitude = $self->get_metadata_element ($section, "Latitude");
1154 # TODO: would like all Longitude info together followed by all Coordinate info, but the following will add all coord info meta and end of this function will add Longitude meta
1155 $self->processCoordinate($section, $latitude, $value); # value is Longitude
1156 }
1157 }
1158
1159 elsif($field eq "GPS.mapOverlay") { # then the value is a JSON string
1160
1161 # TODO:
1162 # If we already have Coordinate meta for this section of the document (as can happen during buildcol.pl),
1163 # let's ASSUME this means we've already processed GPS.mapOverlay meta into Coordinate meta for this section (can have happened during import.pl)
1164 # to avoid adding duplicate Coordinates meta, which then end up duplicated in the index
1165 # Of course, the assumption is not always true! We could have an image with embedded Lat and Lng meta,
1166 # and the same image doc's section could have GPS.mapOverlay meta (from shapes) added via the doc editor.
1167 # This very function would then have converted Lat/Lng into Coordinate meta (just in the if stmt above) and added it to the section.
1168 # And then by the time we process this section's GPS.mapOverlay meta here, we would notice the section has Coordinate meta already,
1169 # and therefore skip converting the GPS.mapOverlay meta into Coordinate meta! What to dooooo?
1170 # So the return statement immediately below is a temporary solution, until we find a better one that will always work.
1171 if($cmd_line_mode eq "buildcol") {
1172 #my $metaMap = $self->get_metadata_hashmap($section); ## TODO: Check if necessary to avoid duplication of <Coordinate> meta in index\text\<coll>.jdb
1173
1174 #if(!$metaMap->{'Coordinate'}) {
1175 #print STDERR "@@@@@@@@@@@@@@ cmd line mode (build phase) is now: $doc::cmd_line_mode\n";
1176
1177
1178 print STDERR "GPS.mapOverlay before val: " . $value . "\n";
1179
1180 # TODO html decode?
1181 $value =~ s@&#091;@[@g;
1182 $value =~ s@&#093;@]@g;
1183 $value =~ s@&quot;@"@g;
1184 print STDERR "GPS.mapOverlay after val: " . $value . "\n";
1185
1186 my $json_array = decode_json $value;
1187 #my $json = JSON->new->allow_nonref;
1188 #&printAllShapes($json, $json_array);
1189
1190 foreach my $shape (@$json_array) {
1191
1192 my $type = $shape->{"type"};
1193 print STDERR "Type : " . $type . "\n";
1194
1195 if($type eq "circle") {
1196 #print STDERR "Found a circle:\n" . &printShape($json, $shape);
1197
1198 # work out bounding box
1199 # SCARY!
1200 # want the inverse of this useful page:
1201 # https://stackoverflow.com/questions/639695/how-to-convert-latitude-or-longitude-to-meters
1202 # https://www.geodatasource.com/developers/javascript
1203
1204
1205 # for now, just process the circle centre
1206 #my $centre = $shape->{"center"};
1207 #$self->processLatOrLng($section_ptr, "Latitude", $centre->{"lat"});
1208 #$self->processLatOrLng($section_ptr, "Longitude", $centre->{"lng"});
1209
1210
1211 # Dr Bainbridge wants us to use: https://gis.stackexchange.com/questions/5821/calculating-latitude-longitude-x-miles-from-point
1212 # But we're using the rule of thumb here, since for N,E,S,W it works out the same:
1213 # https://gis.stackexchange.com/questions/2951/algorithm-for-offsetting-a-latitude-longitude-by-some-amount-of-meters
1214 # which states
1215 # "If your displacements aren't too great (less than a few kilometers) and you're not right at the poles,
1216 # use the quick and dirty estimate that 111,111 meters (111.111 km) in the y direction is 1 degree
1217 # (of latitude) and 111,111 * cos(latitude) meters in the x direction is 1 degree (of longitude)."
1218 my $centre_lat = $shape->{"center"}->{"lat"};
1219 my $centre_lng = $shape->{"center"}->{"lng"};
1220 my $radius = $shape->{"radius"}; # in metres!
1221
1222 print STDERR "@@@ circle centre: ($centre_lat, $centre_lng), radius: $radius\n";
1223
1224 my $lat_north = $centre_lat + ($radius/111111);
1225 my $lat_south = $centre_lat - ($radius/111111);
1226
1227 print STDERR "### lat_north: $lat_north\n";
1228 print STDERR "### lat_south: $lat_south\n";
1229
1230 # our latitude and longitude values are in degrees. But cos and sin etc in perl and generally all prog languages
1231 # all expect their angle to be in radians. So need to convert from degree to radians before we can take the cose of it.
1232 my $centre_lat_radians = $self->degreesToRadians($centre_lat);
1233 my $cos_in_radians = cos($centre_lat_radians);
1234 print STDERR "cos $centre_lat_radians " . cos($centre_lat_radians) . "\n";
1235 my $lng_east = $centre_lng + ($radius/(111111 * $cos_in_radians));
1236 my $lng_west = $centre_lng - ($radius/(111111 * $cos_in_radians));
1237 print STDERR "### lng_east $lng_east\n";
1238 print STDERR "### lng_west $lng_west\n";
1239
1240
1241
1242 my $cos_lat = cos($centre_lat);
1243 print STDERR "cos $centre_lat is $cos_lat\n";
1244
1245 $self->processCoordinate($section, $lat_north, $lng_east);
1246 $self->processCoordinate($section, $lat_south, $lng_east);
1247 $self->processCoordinate($section, $lat_south, $lng_west);
1248 $self->processCoordinate($section, $lat_north, $lng_west);
1249
1250 }
1251 elsif ($type eq "marker") {
1252 print STDERR "@@ MARKER FOUND WITH LAT: " . $shape->{"position"}->{"lat"} . "\n";
1253 print STDERR "@@ MARKER FOUND WITH LNG: " . $shape->{"position"}->{"lng"} . "\n";
1254 $self->processCoordinate($section, $shape->{"position"}->{"lat"}, $shape->{"position"}->{"lng"});
1255 }
1256 elsif ($type eq "polyline" || $type eq "polygon") {
1257 my $path_array = $shape->{"path"};
1258 foreach my $position (@$path_array) {
1259 $self->processCoordinate($section, $position->{"lat"}, $position->{"lng"});
1260 }
1261 }
1262 elsif ($type eq "rectangle") {
1263
1264 my $bounds = $shape->{"bounds"};
1265 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"east"});
1266 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"east"});
1267 $self->processCoordinate($section, $bounds->{"south"}, $bounds->{"west"});
1268 $self->processCoordinate($section, $bounds->{"north"}, $bounds->{"west"});
1269 }
1270
1271 } # end for on each shape in GPS.mapOverlay
1272 #}
1273 }
1274 } # end GPS.mapOverlay meta
1275
1276 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1277}
1278
1279# https://en.wikipedia.org/wiki/Radian
1280sub degreesToRadians
1281{
1282 my $self = shift (@_);
1283 my ($degrees) = @_;
1284
1285 return $degrees * pi /180; # returns radians
1286}
1287
1288sub radiansToDegrees
1289{
1290 my $self = shift (@_);
1291 my ($radians) = @_;
1292
1293 return $radians * 180 / pi; # returns degrees
1294}
1295
1296sub printAllShapes {
1297 my ($json, $json_array) = @_;
1298
1299
1300 #my $pretty_print_shape = $json->pretty->encode( $json_array->[0] );
1301 foreach my $shape (@$json_array) {
1302 my $pretty_print_shape = $json->pretty->encode( $shape );
1303 print STDERR "Shape: $pretty_print_shape\n";
1304 #&printShape($shape);
1305 }
1306
1307}
1308
1309sub processCoordinate {
1310 my $self = shift (@_);
1311 my ($section, $latitude, $longitude) = @_;
1312
1313 my $section_ptr = $self->_lookup_section($section);
1314
1315 my $lat_direction = ($latitude =~ m/^-/) ? "S" : "N";
1316 my $lng_direction = ($longitude =~ m/^-/) ? "W" : "E";
1317
1318 # have to store (lat, lng) in pairs, when there are so many coords to store
1319 #push (@{$section_ptr->{'metadata'}}, ["Latitude", $latitude]);
1320 #push (@{$section_ptr->{'metadata'}}, ["Longitude", $longitude]);
1321
1322 push (@{$section_ptr->{'metadata'}}, ["Coordinate", "$latitude $longitude"]); # "$latitude$lat_direction $longitude$lng_direction"
1323
1324 my ($latBeforeDec, $latAfterDec);
1325 my ($lngBeforeDec, $lngAfterDec);
1326
1327 if($latitude !~ m/\./) {
1328 $latBeforeDec = $latitude;
1329 $latAfterDec = "";
1330 } else {
1331 ($latBeforeDec, $latAfterDec) = ($latitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1332 }
1333 if($longitude !~ m/\./) {
1334 $lngBeforeDec = $longitude;
1335 $lngAfterDec = "";
1336 } else {
1337 ($lngBeforeDec, $lngAfterDec) = ($longitude =~ m/^-?([0-9]+)\.([0-9]+)$/);
1338 }
1339
1340 #if(defined $beforeDec && defined $afterDec)
1341 #{
1342 my $name = "CoordShort";
1343 push (@{$section_ptr->{'metadata'}}, [$name, "$latBeforeDec$lat_direction $lngBeforeDec$lng_direction"]);
1344
1345 for(my $i = 2; $i <= 4; $i++)
1346 {
1347 my $latDecPlaces = (length($latAfterDec) >= $i) ? substr($latAfterDec, 0, $i) : "";
1348 my $lngDecPlaces = (length($lngAfterDec) >= $i) ? substr($lngAfterDec, 0, $i) : "";
1349
1350 push (@{$section_ptr->{'metadata'}}, [$name,
1351 $latBeforeDec . $lat_direction. $latDecPlaces . " " . $lngBeforeDec . $lng_direction. $lngDecPlaces]);
1352
1353 }
1354
1355 #Only add the metadata if it has not already been added
1356 #my $metaMap = $self->get_metadata_hashmap($section); ### TODO: metaMap not used. Unnecesssary step? (Called function has no side-effects.)
1357 #}
1358
1359
1360}
1361
1362
1363# methods for dealing with text
1364
1365# returns the text for a section
1366sub get_text {
1367 my $self = shift (@_);
1368 my ($section) = @_;
1369
1370 my $section_ptr = $self->_lookup_section($section);
1371 if (!defined $section_ptr) {
1372 print STDERR "doc::get_text couldn't find section " .
1373 "$section\n";
1374 return "";
1375 }
1376
1377 return $section_ptr->{'text'};
1378}
1379
1380# returns the (utf-8 encoded) length of the text for a section
1381sub get_text_length {
1382 my $self = shift (@_);
1383 my ($section) = @_;
1384
1385 my $section_ptr = $self->_lookup_section($section);
1386 if (!defined $section_ptr) {
1387 print STDERR "doc::get_text_length couldn't find section " .
1388 "$section\n";
1389 return 0;
1390 }
1391
1392 return length ($section_ptr->{'text'});
1393}
1394
1395# returns the total length for all the sections
1396sub get_total_text_length {
1397 my $self = shift (@_);
1398
1399 my $section = $self->get_top_section();
1400 my $length = 0;
1401 while (defined $section) {
1402 $length += $self->get_text_length($section);
1403 $section = $self->get_next_section($section);
1404 }
1405 return $length;
1406}
1407
1408sub delete_text {
1409 my $self = shift (@_);
1410 my ($section) = @_;
1411
1412 my $section_ptr = $self->_lookup_section($section);
1413 if (!defined $section_ptr) {
1414 print STDERR "doc::delete_text couldn't find section " .
1415 "$section\n";
1416 return;
1417 }
1418
1419 $section_ptr->{'text'} = "";
1420}
1421
1422# add_text assumes the text is in (extended) ascii form. For
1423# text which has been already converted to the UTF-8 format
1424# use add_utf8_text.
1425sub add_text {
1426 my $self = shift (@_);
1427 my ($section, $text) = @_;
1428
1429 # convert the text to UTF-8 encoded unicode characters
1430 # and add the text
1431 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1432}
1433
1434
1435# add_utf8_text assumes the text to be added has already
1436# been converted to the UTF-8 encoding. For ascii text use
1437# add_text
1438# Pass by value version (internally calls pass by ref version
1439# to avoid code duplication)
1440sub add_utf8_text {
1441 my $self = shift (@_);
1442 my ($section, $text) = @_;
1443
1444 $self->add_utf8_textref($section, \$text);
1445}
1446
1447# Pass by reference version, used by GreenstoneSQLPlugin for fulltext
1448sub add_utf8_textref {
1449 my $self = shift (@_);
1450 my ($section, $text_ref) = @_;
1451
1452 my $section_ptr = $self->_lookup_section($section);
1453 if (!defined $section_ptr) {
1454 print STDERR "doc::add_utf8_textref couldn't find section " .
1455 "$section\n";
1456 return;
1457 }
1458
1459 $section_ptr->{'text'} .= $$text_ref;
1460}
1461
1462# returns the Source meta, which is the utf8 filename generated.
1463# Added a separate method here for convenience
1464sub get_source {
1465 my $self = shift (@_);
1466 return $self->get_metadata_element ($self->get_top_section(), "Source");
1467}
1468
1469# returns the SourceFile meta, which is the url reference to the URL-encoded
1470# version of Source (the utf8 filename). Added a separate method here for convenience
1471sub get_sourcefile {
1472 my $self = shift (@_);
1473 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1474}
1475
1476# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1477# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1478sub get_assocfile_from_sourcefile {
1479 my $self = shift (@_);
1480
1481 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1482 my $top_section = $self->get_top_section();
1483 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
1484
1485 # get the actual filename as it exists on the filesystem which this url refers to
1486 $source_file = &unicode::url_to_filename($source_file);
1487 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1488 return $assocfilename;
1489}
1490
1491# methods for dealing with associated files
1492
1493# a file is associated with a document, NOT a section.
1494# if section is defined it is noted in the data structure
1495# only so that files associated from a particular section
1496# may be removed later (using delete_section_assoc_files)
1497sub associate_file {
1498 my $self = shift (@_);
1499 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1500 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1501
1502 # remove all associated files with the same name
1503 $self->delete_assoc_file ($assoc_filename);
1504
1505 # Too harsh a requirement
1506 # Definitely get HTML docs, for example, with some missing
1507 # support files
1508# if (!&util::fd_exists($real_filename)) {
1509# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1510# exit -1;
1511# }
1512
1513# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1514# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1515## my $utf8_filename = Encode::encode("utf8",$filename);
1516
1517 push (@{$self->{'associated_files'}},
1518 [$real_filename, $assoc_filename, $mime_type, $section]);
1519}
1520
1521# returns a list of associated files in the form
1522# [[real_filename, assoc_filename, mimetype], ...]
1523sub get_assoc_files {
1524 my $self = shift (@_);
1525
1526 return $self->{'associated_files'};
1527}
1528
1529# the following two methods used to keep track of original associated files
1530# for incremental building. eg a txt file used by an item file does not end
1531# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1532# file for incremental build
1533sub associate_source_file {
1534 my $self = shift (@_);
1535 my ($full_filename) = @_;
1536
1537 push (@{$self->{'source_assoc_files'}}, $full_filename);
1538
1539}
1540
1541sub get_source_assoc_files {
1542 my $self = shift (@_);
1543
1544 return $self->{'source_assoc_files'};
1545
1546
1547}
1548sub metadata_file {
1549 my $self = shift (@_);
1550 my ($real_filename, $filename) = @_;
1551
1552 push (@{$self->{'metadata_files'}},
1553 [$real_filename, $filename]);
1554}
1555
1556# used for writing out the archiveinf-doc info database, to list all the metadata files
1557sub get_meta_files {
1558 my $self = shift (@_);
1559
1560 return $self->{'metadata_files'};
1561}
1562
1563sub delete_section_assoc_files {
1564 my $self = shift (@_);
1565 my ($section) = @_;
1566
1567 my $i=0;
1568 while ($i < scalar (@{$self->{'associated_files'}})) {
1569 if (defined $self->{'associated_files'}->[$i]->[3] &&
1570 $self->{'associated_files'}->[$i]->[3] eq $section) {
1571 splice (@{$self->{'associated_files'}}, $i, 1);
1572 } else {
1573 $i++;
1574 }
1575 }
1576}
1577
1578sub delete_assoc_file {
1579 my $self = shift (@_);
1580 my ($assoc_filename) = @_;
1581
1582 my $i=0;
1583 while ($i < scalar (@{$self->{'associated_files'}})) {
1584 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1585 splice (@{$self->{'associated_files'}}, $i, 1);
1586 } else {
1587 $i++;
1588 }
1589 }
1590}
1591
1592sub reset_nextsection_ptr {
1593 my $self = shift (@_);
1594 my ($section) = @_;
1595
1596 my $section_ptr = $self->_lookup_section($section);
1597 $section_ptr->{'next_subsection'} = 1;
1598}
1599
16001;
Note: See TracBrowser for help on using the repository browser.