source: main/trunk/greenstone2/perllib/doc.pm@ 26221

Last change on this file since 26221 was 26221, checked in by kjdon, 12 years ago

new OIDtype, filename, will use the file name without any folders or file extension. Must be unique filenames in the collection. BasePlugin add_OID method returns if an id has already been set

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 35.5 KB
RevLine 
[537]1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
[8894]10# This program is free software; you can redistr te it and/or modify
[537]11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[1241]26# base class to hold documents
[4]27
28package doc;
[3834]29eval {require bytes};
[4]30
[832]31BEGIN {
[1241]32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
[832]34}
35
[15894]36use strict;
[1241]37use unicode;
38use util;
39use ghtml;
[8220]40use File::stat;
[1241]41##use hashdoc;
[13172]42use docprint;
[1241]43
[4]44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
[18528]47our $OIDcount = 0;
[2327]48
[20412]49# rename_method can be 'url', 'none', 'base64'
[4]50sub new {
51 my $class = shift (@_);
[18319]52 my ($source_filename, $doc_type, $rename_method) = @_;
[1374]53
[13770]54
[1241]55 my $self = bless {'associated_files'=>[],
56 'subsection_order'=>[],
57 'next_subsection'=>1,
58 'subsections'=>{},
59 'metadata'=>[],
[2327]60 'text'=>"",
61 'OIDtype'=>"hash"}, $class;
[4]62
[10217]63 # used to set lastmodified here, but this can screw up the HASH ids, so
64 # the docsave processor now calls set_lastmodified
[13770]65
[23362]66 $self->set_source_path($source_filename);
[10217]67
[7929]68 if (defined $source_filename) {
[15874]69 $source_filename = &util::filename_within_collection($source_filename);
[18508]70 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
[18319]71 $self->set_source_filename ($source_filename, $rename_method);
[7929]72 }
73
[1374]74 $self->set_doc_type ($doc_type) if defined $doc_type;
[485]75
[4]76 return $self;
77}
[23362]78
79
80sub set_source_path
81{
82 my $self = shift @_;
83 my ($source_filename) = @_;
84
85 if (defined $source_filename) {
86 # On Windows the source_filename can be in terse DOS format
87 # e.g. test~1.txt
88
89 $self->{'terse_source_path'} = $source_filename;
90
91 if (-e $source_filename) {
92 # See if we can do better for Windows with a filename
93 if ($ENV{'GSDLOS'} =~ /^windows$/i) {
94 require Win32;
95 $self->{'source_path'} = Win32::GetLongPathName($source_filename);
96 }
97 else {
98 # For Unix-based systems, there is no difference between the two
99 $self->{'source_path'} = $source_filename;
100 }
101 }
102 else {
103 print STDERR "Warning: In doc::set_source_path(), file\n";
104 print STDERR " $source_filename\n";
105 print STDERR " does not exist\n";
106
107 # (default) Set it to whatever we were given
108 $self->{'source_path'} = $source_filename;
109 }
110 }
111 else {
112 # Previous code for setting source_path allowed for
113 # it to be undefined, so continue this practice
114 $self->{'terse_source_path'} = undef;
115 $self->{'source_path'} = undef;
116 }
117}
118
119
120sub get_source_path
121{
122 my $self = shift @_;
123
124 return $self->{'terse_source_path'};
125}
126
[10217]127# set lastmodified for OAI purposes, added by GRB, moved by kjdon
[23939]128sub set_oailastmodified {
129 my $self = shift (@_);
130
131 my $source_path = $self->{'terse_source_path'};
132
133 if (defined $source_path && (-e $source_path)) {
134 my $current_time = time;
135
136 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
137 $wday, $yday, $isdst) = localtime($current_time);
138
139 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
140
141 $self->add_utf8_metadata($self->get_top_section(), "oailastmodified", $current_time);
142 $self->add_utf8_metadata($self->get_top_section(), "oailastmodifieddate", $date_modified);
143 }
144}
145
146# no longer used for OAI purposes, since lastmodified is not what we want as the
147# Datestamp of a document. This doc metadata may be useful for general purposes.
[10217]148sub set_lastmodified {
149 my $self = shift (@_);
[4]150
[23362]151 my $source_path = $self->{'terse_source_path'};
[13770]152
153 if (defined $source_path && (-e $source_path)) {
154
155 my $file_stat = stat($source_path);
[10217]156 my $mtime = $file_stat->mtime;
[21862]157 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
158 $wday, $yday, $isdst) = localtime($mtime);
159
160 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
161
[23923]162 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $mtime);
[21862]163 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
[10217]164 }
165}
166
[1241]167# clone the $self object
168sub duplicate {
169 my $self = shift (@_);
170
171 my $newobj = {};
172
[8716]173 foreach my $k (keys %$self) {
[1241]174 $newobj->{$k} = &clone ($self->{$k});
175 }
176
177 bless $newobj, ref($self);
178 return $newobj;
179}
180
181sub clone {
182 my ($from) = @_;
183 my $type = ref ($from);
184
185 if ($type eq "HASH") {
186 my $to = {};
[8716]187 foreach my $key (keys %$from) {
[1241]188 $to->{$key} = &clone ($from->{$key});
189 }
190 return $to;
191 } elsif ($type eq "ARRAY") {
192 my $to = [];
[8716]193 foreach my $v (@$from) {
[1241]194 push (@$to, &clone ($v));
195 }
196 return $to;
197 } else {
198 return $from;
199 }
200}
201
[2327]202sub set_OIDtype {
203 my $self = shift (@_);
[12268]204 my ($type, $metadata) = @_;
[1241]205
[26221]206 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|filename|dirname|assigned)$/) {
[2327]207 $self->{'OIDtype'} = $type;
208 } else {
209 $self->{'OIDtype'} = "hash";
210 }
[16792]211
[12268]212 if ($type =~ /^assigned$/) {
213 if (defined $metadata) {
214 $self->{'OIDmetadata'} = $metadata;
215 } else {
216 $self->{'OIDmetadata'} = "dc.Identifier";
217 }
218 }
[2327]219}
220
[20412]221# rename_method can be 'url', 'none', 'base64'
[1241]222sub set_source_filename {
223 my $self = shift (@_);
[18319]224 my ($source_filename, $rename_method) = @_;
[1241]225
[16578]226 # Since the gsdlsourcefilename element goes into the doc.xml it has
227 # to be utf8. However, it should also *represent* the source filename
228 # (in the import directory) which may not be utf8 at all.
[18319]229 # For instance, if this meta element (gsdlsourcefilename) will be used
230 # by other applications that parse doc.xml in order to locate
231 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
232 # encode the real filename as this is a binary-to-text encoding meaning
233 # that the resulting string is ASCII (utf8). Decoding will give the original.
[16578]234
[18319]235# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
236
[16670]237 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
[23278]238 $source_filename = $self->encode_filename($source_filename, $rename_method);
239# my ($srcfilename,$dirname,$suffix)
240# = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
[16792]241# print STDERR "-> $srcfilename -> ";
[23278]242# $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
243# $source_filename = &util::filename_cat($dirname, $srcfilename);
[16792]244# print STDERR "$source_filename\n";
[16670]245
[16578]246 $self->set_utf8_metadata_element ($self->get_top_section(),
[1241]247 "gsdlsourcefilename",
248 $source_filename);
249}
250
[23278]251sub encode_filename {
252 my $self = shift (@_);
253 my ($source_filename, $rename_method) = @_;
254
255 my ($srcfilename,$dirname,$suffix)
256 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
257# print STDERR "-> $srcfilename -> ";
258 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
259 $source_filename = &util::filename_cat($dirname, $srcfilename);
260
261 return $source_filename;
262}
263
[7569]264sub set_converted_filename {
265 my $self = shift (@_);
266 my ($converted_filename) = @_;
267
[16578]268 # we know the converted filename is utf8
269 $self->set_utf8_metadata_element ($self->get_top_section(),
[7569]270 "gsdlconvertedfilename",
271 $converted_filename);
272}
273
[1241]274# returns the source_filename as it was provided
[19829]275sub get_unmodified_source_filename {
276 my $self = shift (@_);
277
[23362]278 return $self->{'terse_source_path'};
[19829]279}
280
281# returns the source_filename with whatever rename_method was given
[1241]282sub get_source_filename {
283 my $self = shift (@_);
284
285 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
286}
287
[19829]288
289
[7569]290# returns converted filename if available else returns source filename
291sub get_filename_for_hashing {
292 my $self = shift (@_);
293
294 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
295
296 if (!defined $filename) {
[11097]297 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
[10980]298 # if NULPlug processed file, then don't give a filename
[11097]299 if (defined $plugin_name && $plugin_name eq "NULPlug") {
[10980]300 $filename = undef;
[16670]301 } else { # returns the URL encoded source filename!
[10980]302 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
303 }
[7569]304 }
[23485]305
306 if (!&util::filename_is_absolute($filename)) {
307 $filename = &util::filename_cat($ENV{'GSDLCOLLECTDIR'},$filename);
308 }
309
[7569]310 return $filename;
311}
312
[1241]313sub set_doc_type {
314 my $self = shift (@_);
315 my ($doc_type) = @_;
316
317 $self->set_metadata_element ($self->get_top_section(),
318 "gsdldoctype",
319 $doc_type);
320}
321
[10217]322# returns the gsdldoctype as it was provided
[1241]323# the default of "indexed_doc" is used if no document
324# type was provided
325sub get_doc_type {
326 my $self = shift (@_);
327
328 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
329 return $doc_type if (defined $doc_type);
330 return "indexed_doc";
331}
332
333
334# look up the reference to the a particular section
335sub _lookup_section {
336 my $self = shift (@_);
337 my ($section) = @_;
338
339 my ($num);
340 my $sectionref = $self;
341
342 while (defined $section && $section ne "") {
[12327]343
[1241]344 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
[12327]345
346 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
347
[1241]348 $section = "" unless defined $section;
349
[12327]350
[1241]351 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
352 $sectionref = $sectionref->{'subsections'}->{$num};
353 } else {
354 return undef;
355 }
356 }
357
358 return $sectionref;
359}
360
[2327]361# calculate OID by hashing the contents of the document
[1241]362sub _calc_OID {
363 my $self = shift (@_);
364 my ($filename) = @_;
365
[22855]366
[1241]367 my $osexe = &util::get_os_exe();
368
369 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
370 $ENV{'GSDLOS'},"hashfile$osexe");
[8504]371
[1241]372 my $result = "NULL";
[22855]373
[1679]374
[1241]375 if (-e "$hashfile_exe") {
[1679]376# $result = `\"$hashfile_exe\" \"$filename\"`;
[21862]377# $result = `hashfile$osexe \"$filename\" 2>&1`;
[1679]378 $result = `hashfile$osexe \"$filename\"`;
[21862]379
[1241]380 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
381 } else {
382 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
383 }
384 return "HASH$result";
385}
386
387# methods dealing with OID, not groups of them.
388
[2327]389# if $OID is not provided one is calculated
[1241]390sub set_OID {
391 my $self = shift (@_);
392 my ($OID) = @_;
[8504]393
[8797]394 my $use_hash_oid = 0;
[17057]395 # if an OID wasn't provided calculate one
[1241]396 if (!defined $OID) {
397 $OID = "NULL";
[17025]398 if ($self->{'OIDtype'} =~ /^hash/) {
[8797]399 $use_hash_oid = 1;
400 } elsif ($self->{'OIDtype'} eq "incremental") {
[2327]401 $OID = "D" . $OIDcount;
402 $OIDcount ++;
[26221]403 } elsif ($self->{'OIDtype'} eq "filename") {
404 my $filename = $self->get_source_filename();
405 $OID = &File::Basename::fileparse($filename, qr/\.[^.]*/);
406 $OID = &util::tidy_up_oid($OID);
[8716]407 } elsif ($self->{'OIDtype'} eq "dirname") {
408 $OID = 'J';
409 my $filename = $self->get_source_filename();
[8797]410 if (defined($filename)) { # && -e $filename) {
[8716]411 $OID = &File::Basename::dirname($filename);
412 if (defined $OID) {
413 $OID = 'J'.&File::Basename::basename($OID);
[19617]414 $OID = &util::tidy_up_oid($OID);
[8716]415 } else {
[8797]416 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
417 $use_hash_oid = 1;
[8716]418 }
419 } else {
[8797]420 print STDERR "Failed to find filename, generating hash id\n";
421 $use_hash_oid = 1;
[8716]422 }
[2327]423
[8797]424 } elsif ($self->{'OIDtype'} eq "assigned") {
[12268]425 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
[8797]426 if (defined $identifier && $identifier ne "") {
[17025]427 $OID = $identifier;
[19617]428 $OID = &util::tidy_up_oid($OID);
[8797]429 } else {
430 # need a hash id
[12268]431 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
[8797]432 $use_hash_oid = 1;
433 }
434
[2327]435 } else {
[8797]436 $use_hash_oid = 1;
437 }
438
439 if ($use_hash_oid) {
[23562]440 my $hash_on_file = 1;
[17025]441 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
442 $hash_on_file = 0;
443 }
444 if ($hash_on_file) {
445 # "hash" OID - feed file to hashfile.exe
446 my $filename = $self->get_filename_for_hashing();
[23562]447
[17025]448 # -z: don't want to hash on the file if it is zero size
449 if (defined($filename) && -e $filename && !-z $filename) {
450 $OID = $self->_calc_OID ($filename);
451 } else {
452 $hash_on_file = 0;
453 }
454 }
455 if (!$hash_on_file) {
456 my $filename = &util::get_tmp_filename();
[22855]457 if (!open (OUTFILE, ">:utf8", $filename)) {
[2327]458 print STDERR "doc::set_OID could not write to $filename\n";
459 } else {
[13172]460 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
461 print OUTFILE $doc_text;
[2327]462 close (OUTFILE);
463 }
464 $OID = $self->_calc_OID ($filename);
465 &util::rm ($filename);
[1374]466 }
[1241]467 }
468 }
469 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
470}
471
472# this uses hashdoc (embedded c thingy) which is faster but still
473# needs a little work to be suffiently stable
474sub ___set_OID {
475 my $self = shift (@_);
476 my ($OID) = @_;
477
478 # if an OID wasn't provided then calculate hash value based on document
479 if (!defined $OID)
480 {
[13172]481 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
[1241]482 my $hash_len = length($hash_text);
483
484 $OID = &hashdoc::buffer($hash_text,$hash_len);
485 }
486
487 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
488}
489
490# returns the OID for this document
491sub get_OID {
492 my $self = shift (@_);
493 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
494 return $OID if (defined $OID);
495 return "NULL";
496}
497
498sub delete_OID {
499 my $self = shift (@_);
500
501 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
502}
503
504
505# methods for manipulating section names
506
507# returns the name of the top-most section (the top
508# level of the document
509sub get_top_section {
510 my $self = shift (@_);
511
512 return "";
513}
514
515# returns a section
516sub get_parent_section {
517 my $self = shift (@_);
518 my ($section) = @_;
519
520 $section =~ s/(^|\.)\d+$//;
521
522 return $section;
523}
524
525# returns the first child section (or the end child
526# if there isn't any)
527sub get_begin_child {
528 my $self = shift (@_);
529 my ($section) = @_;
530
531 my $section_ptr = $self->_lookup_section($section);
532 return "" unless defined $section_ptr;
533
534 if (defined $section_ptr->{'subsection_order'}->[0]) {
535 return "$section.$section_ptr->{'subsection_order'}->[0]";
536 }
537
538 return $self->get_end_child ($section);
539}
540
541# returns the next child of a parent section
542sub get_next_child {
543 my $self = shift (@_);
544 my ($section) = @_;
545
546 my $parent_section = $self->get_parent_section($section);
547 my $parent_section_ptr = $self->_lookup_section($parent_section);
548 return undef unless defined $parent_section_ptr;
549
550 my ($section_num) = $section =~ /(\d+)$/;
551 return undef unless defined $section_num;
552
553 my $i = 0;
554 my $section_order = $parent_section_ptr->{'subsection_order'};
555 while ($i < scalar(@$section_order)) {
556 last if $section_order->[$i] eq $section_num;
557 $i++;
558 }
559
560 $i++; # the next child
561 if ($i < scalar(@$section_order)) {
562 return $section_order->[$i] if $parent_section eq "";
563 return "$parent_section.$section_order->[$i]";
564 }
565
566 # no more sections in this level
567 return undef;
568}
569
570# returns a reference to a list of children
571sub get_children {
572 my $self = shift (@_);
573 my ($section) = @_;
574
575 my $section_ptr = $self->_lookup_section($section);
576 return [] unless defined $section_ptr;
577
578 my @children = @{$section_ptr->{'subsection_order'}};
579
580 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
581 return \@children;
582}
583
584# returns the child section one past the last one (which
585# is coded as "0")
586sub get_end_child {
587 my $self = shift (@_);
588 my ($section) = @_;
589
590 return $section . ".0" unless $section eq "";
591 return "0";
592}
593
594# returns the next section in book order
595sub get_next_section {
596 my $self = shift (@_);
597 my ($section) = @_;
598
599 return undef unless defined $section;
600
601 my $section_ptr = $self->_lookup_section($section);
602 return undef unless defined $section_ptr;
603
604 # first try to find first child
605 if (defined $section_ptr->{'subsection_order'}->[0]) {
606 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
607 return "$section.$section_ptr->{'subsection_order'}->[0]";
608 }
609
610 do {
611 # try to find sibling
612 my $next_child = $self->get_next_child ($section);
613 return $next_child if (defined $next_child);
614
615 # move up one level
616 $section = $self->get_parent_section ($section);
617 } while $section =~ /\d/;
618
619 return undef;
620}
621
622sub is_leaf_section {
623 my $self = shift (@_);
624 my ($section) = @_;
625
626 my $section_ptr = $self->_lookup_section($section);
627 return 1 unless defined $section_ptr;
628
629 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
630}
631
632# methods for dealing with sections
633
634# returns the name of the inserted section
635sub insert_section {
636 my $self = shift (@_);
637 my ($before_section) = @_;
638
639 # get the child to insert before and its parent section
640 my $parent_section = "";
641 my $before_child = "0";
642 my @before_section = split (/\./, $before_section);
643 if (scalar(@before_section) > 0) {
644 $before_child = pop (@before_section);
645 $parent_section = join (".", @before_section);
646 }
647
648 my $parent_section_ptr = $self->_lookup_section($parent_section);
649 if (!defined $parent_section_ptr) {
650 print STDERR "doc::insert_section couldn't find parent section " .
651 "$parent_section\n";
652 return;
653 }
654
655 # get the next section number
656 my $section_num = $parent_section_ptr->{'next_subsection'}++;
657
658 my $i = 0;
659 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
660 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
661 $i++;
662 }
663
664 # insert the section number into the order list
665 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
666
667 # add this section to the parent section
668 my $section_ptr = {'subsection_order'=>[],
669 'next_subsection'=>1,
670 'subsections'=>{},
671 'metadata'=>[],
672 'text'=>""};
673 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
674
675 # work out the full section number
676 my $section = $parent_section;
677 $section .= "." unless $section eq "";
678 $section .= $section_num;
679
680 return $section;
681}
682
683# creates a pre-named section
684sub create_named_section {
685 my $self = shift (@_);
686 my ($mastersection) = @_;
687
688 my ($num);
689 my $section = $mastersection;
690 my $sectionref = $self;
691
692 while ($section ne "") {
693 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
694 $num =~ s/^0+(\d)/$1/; # remove leading 0s
695 $section = "" unless defined $section;
696
697 if (defined $num) {
698 if (!defined $sectionref->{'subsections'}->{$num}) {
699 push (@{$sectionref->{'subsection_order'}}, $num);
700 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
701 'next_subsection'=>1,
702 'subsections'=>{},
703 'metadata'=>[],
704 'text'=>""};
705 if ($num >= $sectionref->{'next_subsection'}) {
706 $sectionref->{'next_subsection'} = $num + 1;
707 }
708 }
709 $sectionref = $sectionref->{'subsections'}->{$num};
710
711 } else {
712 print STDERR "doc::create_named_section couldn't create section ";
713 print STDERR "$mastersection\n";
714 last;
715 }
716 }
717}
718
719# returns a reference to a list of subsections
720sub list_subsections {
721 my $self = shift (@_);
722 my ($section) = @_;
723
724 my $section_ptr = $self->_lookup_section ($section);
725 if (!defined $section_ptr) {
726 print STDERR "doc::list_subsections couldn't find section $section\n";
727 return [];
728 }
729
730 return [@{$section_ptr->{'subsection_order'}}];
731}
732
733sub delete_section {
734 my $self = shift (@_);
735 my ($section) = @_;
736
737# my $section_ptr = {'subsection_order'=>[],
738# 'next_subsection'=>1,
739# 'subsections'=>{},
740# 'metadata'=>[],
741# 'text'=>""};
742
743 # if this is the top section reset everything
744 if ($section eq "") {
745 $self->{'subsection_order'} = [];
746 $self->{'subsections'} = {};
747 $self->{'metadata'} = [];
748 $self->{'text'} = "";
749 return;
750 }
751
752 # find the parent of the section to delete
753 my $parent_section = "";
754 my $child = "0";
755 my @section = split (/\./, $section);
756 if (scalar(@section) > 0) {
757 $child = pop (@section);
758 $parent_section = join (".", @section);
759 }
760
761 my $parent_section_ptr = $self->_lookup_section($parent_section);
762 if (!defined $parent_section_ptr) {
763 print STDERR "doc::delete_section couldn't find parent section " .
764 "$parent_section\n";
765 return;
766 }
767
768 # remove this section from the subsection_order list
769 my $i = 0;
770 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
771 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
772 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
773 last;
774 }
775 $i++;
776 }
777
778 # remove this section from the subsection hash
779 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
780 undef $parent_section_ptr->{'subsections'}->{$child};
781 }
782}
783
784#--
[4]785# methods for dealing with metadata
786
787# set_metadata_element and get_metadata_element are for metadata
788# which should only have one value. add_meta_data and get_metadata
789# are for metadata which can have more than one value.
790
[1241]791# returns the first metadata value which matches field
[6111]792
793# This version of get metadata element works much like the one above,
794# except it allows for the namespace portion of a metadata element to
795# be ignored, thus if you are searching for dc.Title, the first piece
796# of matching metadata ending with the name Title (once any namespace
797# is removed) would be returned.
798# 28-11-2003 John Thompson
[1241]799sub get_metadata_element {
800 my $self = shift (@_);
[6111]801 my ($section, $field, $ignore_namespace) = @_;
[1241]802 my ($data);
803
[6111]804 $ignore_namespace = 0 unless defined $ignore_namespace;
805
[1241]806 my $section_ptr = $self->_lookup_section($section);
807 if (!defined $section_ptr) {
[8716]808 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
[1241]809 return;
810 }
811
[24404]812 # Remove any namespace if we are being told to ignore them
[6111]813 if($ignore_namespace) {
[24404]814 $field =~ s/^.*\.//; #$field =~ s/^\w*\.//;
[6111]815 }
816
[1241]817 foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]818
819 my $data_name = $data->[0];
[14966]820
[24404]821 # Remove any namespace if we are being told to ignore them
[6111]822 if($ignore_namespace) {
[24404]823 $data_name =~ s/^.*\.//; #$data_name =~ s/^\w*\.//;
[6111]824 }
[24404]825 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
826 $data_name =~ s/^ex\.([^.]+)$/$1/; #$data_name =~ s/^ex\.//;
827
[6111]828 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
[1241]829 }
830
831 return undef; # was not found
832}
833
834# returns a list of the form [value1, value2, ...]
835sub get_metadata {
836 my $self = shift (@_);
[6111]837 my ($section, $field, $ignore_namespace) = @_;
[1241]838 my ($data);
839
[6111]840 $ignore_namespace = 0 unless defined $ignore_namespace;
841
[1241]842 my $section_ptr = $self->_lookup_section($section);
843 if (!defined $section_ptr) {
[8716]844 print STDERR "doc::get_metadata couldn't find section ",
845 $section, "\n";
[1241]846 return;
847 }
848
[24404]849 # Remove any namespace if we are being told to ignore them
[6111]850 if($ignore_namespace) {
[24404]851 $field =~ s/^.*\.//;
[6111]852 }
853
[1241]854 my @metadata = ();
855 foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]856
857 my $data_name = $data->[0];
[24404]858
859 # Remove any namespace if we are being told to ignore them
[6111]860 if($ignore_namespace) {
[24404]861 $data_name =~ s/^.*\.//;
862 }
863 # we always remove ex. (but not any subsequent namespace) - ex. maybe there in doc_obj, but we will never ask for it.
864 $data_name =~ s/^ex\.([^.]+)$/$1/;
865
[6111]866 push (@metadata, $data->[1]) if ($data_name eq $field);
[1241]867 }
[9241]868
[1241]869 return \@metadata;
870}
871
[23827]872sub get_metadata_hashmap {
873 my $self = shift (@_);
874 my ($section, $opt_namespace) = @_;
875
876 my $section_ptr = $self->_lookup_section($section);
877 if (!defined $section_ptr) {
878 print STDERR "doc::get_metadata couldn't find section ",
879 $section, "\n";
880 return;
881 }
882
883 my $metadata_hashmap = {};
884 foreach my $data (@{$section_ptr->{'metadata'}}) {
885 my $metaname = $data->[0];
886
887 if ((!defined $opt_namespace) || ($metaname =~ m/^$opt_namespace\./)) {
888 if (!defined $metadata_hashmap->{$metaname}) {
889 $metadata_hashmap->{$metaname} = [];
890 }
891 my $metaval_list = $metadata_hashmap->{$metaname};
892 push(@$metaval_list, $data->[1]);
893 }
894 }
895
896 return $metadata_hashmap;
897}
898
[1241]899# returns a list of the form [[field,value],[field,value],...]
900sub get_all_metadata {
901 my $self = shift (@_);
902 my ($section) = @_;
903
904 my $section_ptr = $self->_lookup_section($section);
905 if (!defined $section_ptr) {
[8716]906 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
[1241]907 return;
908 }
909
910 return $section_ptr->{'metadata'};
911}
912
913# $value is optional
914sub delete_metadata {
915 my $self = shift (@_);
916 my ($section, $field, $value) = @_;
917
918 my $section_ptr = $self->_lookup_section($section);
919 if (!defined $section_ptr) {
[23562]920 print STDERR "doc::delete_metadata couldn't find section ", $section, "$field\n";
[1241]921 return;
922 }
923
924 my $i = 0;
925 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
926 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
927 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
928 splice (@{$section_ptr->{'metadata'}}, $i, 1);
929 } else {
930 $i++;
931 }
932 }
933}
934
935sub delete_all_metadata {
936 my $self = shift (@_);
937 my ($section) = @_;
938
939 my $section_ptr = $self->_lookup_section($section);
940 if (!defined $section_ptr) {
[8716]941 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
[1241]942 return;
943 }
944
945 $section_ptr->{'metadata'} = [];
946}
947
[4]948sub set_metadata_element {
949 my $self = shift (@_);
950 my ($section, $field, $value) = @_;
951
[97]952 $self->set_utf8_metadata_element ($section, $field,
[1870]953 &unicode::ascii2utf8(\$value));
[73]954}
955
956# set_utf8_metadata_element assumes the text has already been
957# converted to the UTF-8 encoding.
958sub set_utf8_metadata_element {
959 my $self = shift (@_);
960 my ($section, $field, $value) = @_;
[25557]961
[4]962 $self->delete_metadata ($section, $field);
[73]963 $self->add_utf8_metadata ($section, $field, $value);
[4]964}
965
966
[73]967# add_metadata assumes the text is in (extended) ascii form. For
[8220]968# text which has already been converted to the UTF-8 format use
[73]969# add_utf8_metadata.
[4]970sub add_metadata {
971 my $self = shift (@_);
972 my ($section, $field, $value) = @_;
[25557]973
[97]974 $self->add_utf8_metadata ($section, $field,
[1870]975 &unicode::ascii2utf8(\$value));
[73]976}
977
978sub add_utf8_metadata {
979 my $self = shift (@_);
980 my ($section, $field, $value) = @_;
[25557]981
982 # my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
983 # my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
984 # print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
[73]985
[4]986 my $section_ptr = $self->_lookup_section($section);
987 if (!defined $section_ptr) {
[8716]988 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
[4]989 return;
990 }
[1732]991 if (!defined $value) {
992 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
993 return;
994 }
995 if (!defined $field) {
996 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
997 return;
998 }
[4]999
[8894]1000 #print STDERR "###$field=$value\n";
[22950]1001
1002 # For now, supress this check. Given that text data read in is now
1003 # Unicode aware, then the following block of code can (ironically enough)
1004 # cause our unicode compliant string to be re-encoded (leading to
1005 # a double-encoded UTF-8 string, which we definitely don't want!).
1006
1007
[7798]1008 # double check that the value is utf-8
[25557]1009 # if (!&unicode::check_is_utf8($value)) {
1010 # print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
1011 # &unicode::ensure_utf8(\$value);
1012 # print STDERR " Tried converting to utf8: $value\n";
1013 # }
[7798]1014
[25557]1015 #If the metadata value is either a latitude or a longitude value then we want to save a shortened version for spacial searching purposes
1016 if($field eq "Latitude" || $field eq "Longitude")
1017 {
1018 my $direction;
1019 if($value =~ m/^-/)
1020 {
1021 $direction = ($field eq "Latitude") ? "S" : "W";
1022 }
1023 else
1024 {
1025 $direction = ($field eq "Latitude") ? "N" : "E";
1026 }
1027
1028 my ($beforeDec, $afterDec) = ($value =~ m/^-?([0-9]+)\.([0-9]+)$/);
1029 if(defined $beforeDec && defined $afterDec)
1030 {
1031 my $name = ($field eq "Latitude") ? "LatShort" : "LngShort";
1032 push (@{$section_ptr->{'metadata'}}, [$name, $beforeDec . $direction]);
1033
1034 for(my $i = 2; $i <= 4; $i++)
1035 {
1036 if(length($afterDec) >= $i)
1037 {
1038 push (@{$section_ptr->{'metadata'}}, [$name, substr($afterDec, 0, $i)]);
1039 }
1040 }
1041
1042 #Only add the metadata if it has not already been added
1043 my $metaMap = $self->get_metadata_hashmap($section);
1044 }
1045 }
1046
[4]1047 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1048}
1049
1050
1051# methods for dealing with text
1052
[1241]1053# returns the text for a section
1054sub get_text {
1055 my $self = shift (@_);
1056 my ($section) = @_;
1057
1058 my $section_ptr = $self->_lookup_section($section);
1059 if (!defined $section_ptr) {
1060 print STDERR "doc::get_text couldn't find section " .
1061 "$section\n";
1062 return "";
1063 }
1064
1065 return $section_ptr->{'text'};
1066}
1067
1068# returns the (utf-8 encoded) length of the text for a section
1069sub get_text_length {
1070 my $self = shift (@_);
1071 my ($section) = @_;
1072
1073 my $section_ptr = $self->_lookup_section($section);
1074 if (!defined $section_ptr) {
1075 print STDERR "doc::get_text_length couldn't find section " .
1076 "$section\n";
1077 return 0;
1078 }
1079
1080 return length ($section_ptr->{'text'});
1081}
1082
[23131]1083# returns the total length for all the sections
1084sub get_total_text_length {
1085 my $self = shift (@_);
1086
1087 my $section = $self->get_top_section();
1088 my $length = 0;
1089 while (defined $section) {
1090 $length += $self->get_text_length($section);
1091 $section = $self->get_next_section($section);
1092 }
1093 return $length;
1094}
1095
[1241]1096sub delete_text {
1097 my $self = shift (@_);
1098 my ($section) = @_;
1099
1100 my $section_ptr = $self->_lookup_section($section);
1101 if (!defined $section_ptr) {
1102 print STDERR "doc::delete_text couldn't find section " .
1103 "$section\n";
1104 return;
1105 }
1106
1107 $section_ptr->{'text'} = "";
1108}
1109
[73]1110# add_text assumes the text is in (extended) ascii form. For
1111# text which has been already converted to the UTF-8 format
1112# use add_utf8_text.
[4]1113sub add_text {
1114 my $self = shift (@_);
1115 my ($section, $text) = @_;
1116
[73]1117 # convert the text to UTF-8 encoded unicode characters
1118 # and add the text
[1870]1119 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
[73]1120}
1121
1122
1123# add_utf8_text assumes the text to be added has already
1124# been converted to the UTF-8 encoding. For ascii text use
1125# add_text
1126sub add_utf8_text {
1127 my $self = shift (@_);
1128 my ($section, $text) = @_;
1129
[4]1130 my $section_ptr = $self->_lookup_section($section);
1131 if (!defined $section_ptr) {
[73]1132 print STDERR "doc::add_utf8_text couldn't find section " .
[4]1133 "$section\n";
1134 return;
1135 }
1136
1137 $section_ptr->{'text'} .= $text;
1138}
1139
[16950]1140# returns the Source meta, which is the utf8 filename generated.
1141# Added a separate method here for convenience
1142sub get_source {
[16924]1143 my $self = shift (@_);
[16950]1144 return $self->get_metadata_element ($self->get_top_section(), "Source");
1145}
1146
1147# returns the SourceFile meta, which is the url reference to the URL-encoded
1148# version of Source (the utf8 filename). Added a separate method here for convenience
1149sub get_sourcefile {
1150 my $self = shift (@_);
1151 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
1152}
1153
1154# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1155# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1156sub get_assocfile_from_sourcefile {
1157 my $self = shift (@_);
[16924]1158
1159 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1160 my $top_section = $self->get_top_section();
1161 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
[4]1162
[16924]1163 # get the actual filename as it exists on the filesystem which this url refers to
[16928]1164 $source_file = &unicode::url_to_filename($source_file);
[16924]1165 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1166 return $assocfilename;
1167}
1168
[1241]1169# methods for dealing with associated files
1170
1171# a file is associated with a document, NOT a section.
1172# if section is defined it is noted in the data structure
1173# only so that files associated from a particular section
1174# may be removed later (using delete_section_assoc_files)
1175sub associate_file {
1176 my $self = shift (@_);
1177 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1178 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1179
1180 # remove all associated files with the same name
1181 $self->delete_assoc_file ($assoc_filename);
[23413]1182
1183 # Too harsh a requirement
1184 # Definitely get HTML docs, for example, with some missing
1185 # support files
1186# if (!&util::fd_exists($real_filename)) {
1187# print STDERR "****** doc::associate_file(): Failed to find the file $real_filename\n";
1188# exit -1;
1189# }
1190
[23387]1191# print STDERR "**** is the following a UTF8 rep of *real* filename?\n $real_filename\n";
1192# print STDERR "****##### so, ensure it is before storing?!?!?\n";
1193## my $utf8_filename = Encode::encode("utf8",$filename);
1194
[1241]1195 push (@{$self->{'associated_files'}},
1196 [$real_filename, $assoc_filename, $mime_type, $section]);
1197}
1198
1199# returns a list of associated files in the form
1200# [[real_filename, assoc_filename, mimetype], ...]
1201sub get_assoc_files {
1202 my $self = shift (@_);
1203
1204 return $self->{'associated_files'};
1205}
1206
[20775]1207# the following two methods used to keep track of original associated files
1208# for incremental building. eg a txt file used by an item file does not end
1209# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1210# file for incremental build
1211sub associate_source_file {
1212 my $self = shift (@_);
1213 my ($full_filename) = @_;
[19494]1214
[20775]1215 push (@{$self->{'source_assoc_files'}}, $full_filename);
1216
1217}
1218
1219sub get_source_assoc_files {
1220 my $self = shift (@_);
1221
1222 return $self->{'source_assoc_files'};
1223
1224
1225}
[19494]1226sub metadata_file {
1227 my $self = shift (@_);
1228 my ($real_filename, $filename) = @_;
1229
1230 push (@{$self->{'metadata_files'}},
1231 [$real_filename, $filename]);
1232}
1233
[21566]1234# used for writing out the archiveinf-doc info database, to list all the metadata files
[19494]1235sub get_meta_files {
1236 my $self = shift (@_);
1237
1238 return $self->{'metadata_files'};
1239}
1240
[1241]1241sub delete_section_assoc_files {
1242 my $self = shift (@_);
1243 my ($section) = @_;
1244
1245 my $i=0;
1246 while ($i < scalar (@{$self->{'associated_files'}})) {
1247 if (defined $self->{'associated_files'}->[$i]->[3] &&
1248 $self->{'associated_files'}->[$i]->[3] eq $section) {
1249 splice (@{$self->{'associated_files'}}, $i, 1);
1250 } else {
1251 $i++;
1252 }
1253 }
1254}
1255
1256sub delete_assoc_file {
1257 my $self = shift (@_);
1258 my ($assoc_filename) = @_;
1259
1260 my $i=0;
1261 while ($i < scalar (@{$self->{'associated_files'}})) {
1262 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1263 splice (@{$self->{'associated_files'}}, $i, 1);
1264 } else {
1265 $i++;
1266 }
1267 }
1268}
1269
1270sub reset_nextsection_ptr {
1271 my $self = shift (@_);
1272 my ($section) = @_;
1273
1274 my $section_ptr = $self->_lookup_section($section);
1275 $section_ptr->{'next_subsection'} = 1;
1276}
1277
[4]12781;
Note: See TracBrowser for help on using the repository browser.