source: main/trunk/greenstone2/perllib/doc.pm@ 23131

Last change on this file since 23131 was 23131, checked in by kjdon, 14 years ago

added a method get_total_text_length. returns the total lenght for the document, ie al the text lengths for each section

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 30.4 KB
RevLine 
[537]1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
[8894]10# This program is free software; you can redistr te it and/or modify
[537]11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[1241]26# base class to hold documents
[4]27
28package doc;
[3834]29eval {require bytes};
[4]30
[832]31BEGIN {
[1241]32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
[832]34}
35
[15894]36use strict;
[1241]37use unicode;
38use util;
39use ghtml;
[8220]40use File::stat;
[1241]41##use hashdoc;
[13172]42use docprint;
[1241]43
[4]44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
[18528]47our $OIDcount = 0;
[2327]48
[20412]49# rename_method can be 'url', 'none', 'base64'
[4]50sub new {
51 my $class = shift (@_);
[18319]52 my ($source_filename, $doc_type, $rename_method) = @_;
[1374]53
[13770]54
[1241]55 my $self = bless {'associated_files'=>[],
56 'subsection_order'=>[],
57 'next_subsection'=>1,
58 'subsections'=>{},
59 'metadata'=>[],
[2327]60 'text'=>"",
61 'OIDtype'=>"hash"}, $class;
[4]62
[10217]63 # used to set lastmodified here, but this can screw up the HASH ids, so
64 # the docsave processor now calls set_lastmodified
[13770]65
66 $self->{'source_path'} = $source_filename;
[10217]67
[7929]68 if (defined $source_filename) {
[15874]69 $source_filename = &util::filename_within_collection($source_filename);
[18508]70 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
[18319]71 $self->set_source_filename ($source_filename, $rename_method);
[7929]72 }
73
[1374]74 $self->set_doc_type ($doc_type) if defined $doc_type;
[485]75
[4]76 return $self;
77}
[10217]78# set lastmodified for OAI purposes, added by GRB, moved by kjdon
79sub set_lastmodified {
80 my $self = shift (@_);
[4]81
[13770]82 my $source_path = $self->{'source_path'};
83
84 if (defined $source_path && (-e $source_path)) {
85
86 my $file_stat = stat($source_path);
[10217]87 my $mtime = $file_stat->mtime;
[21862]88 my ($seconds, $minutes, $hours, $day_of_month, $month, $year,
89 $wday, $yday, $isdst) = localtime($mtime);
90
91 my $date_modified = sprintf("%d%02d%02d",1900+$year,$month+1,$day_of_month);
92
[10217]93 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
[21862]94 $self->add_utf8_metadata($self->get_top_section(), "lastmodifieddate", $date_modified);
[10217]95 }
96}
97
[1241]98# clone the $self object
99sub duplicate {
100 my $self = shift (@_);
101
102 my $newobj = {};
103
[8716]104 foreach my $k (keys %$self) {
[1241]105 $newobj->{$k} = &clone ($self->{$k});
106 }
107
108 bless $newobj, ref($self);
109 return $newobj;
110}
111
112sub clone {
113 my ($from) = @_;
114 my $type = ref ($from);
115
116 if ($type eq "HASH") {
117 my $to = {};
[8716]118 foreach my $key (keys %$from) {
[1241]119 $to->{$key} = &clone ($from->{$key});
120 }
121 return $to;
122 } elsif ($type eq "ARRAY") {
123 my $to = [];
[8716]124 foreach my $v (@$from) {
[1241]125 push (@$to, &clone ($v));
126 }
127 return $to;
128 } else {
129 return $from;
130 }
131}
132
[2327]133sub set_OIDtype {
134 my $self = shift (@_);
[12268]135 my ($type, $metadata) = @_;
[1241]136
[17025]137 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|dirname|assigned)$/) {
[2327]138 $self->{'OIDtype'} = $type;
139 } else {
140 $self->{'OIDtype'} = "hash";
141 }
[16792]142
[12268]143 if ($type =~ /^assigned$/) {
144 if (defined $metadata) {
145 $self->{'OIDmetadata'} = $metadata;
146 } else {
147 $self->{'OIDmetadata'} = "dc.Identifier";
148 }
149 }
[2327]150}
151
[20412]152# rename_method can be 'url', 'none', 'base64'
[1241]153sub set_source_filename {
154 my $self = shift (@_);
[18319]155 my ($source_filename, $rename_method) = @_;
[1241]156
[16578]157 # Since the gsdlsourcefilename element goes into the doc.xml it has
158 # to be utf8. However, it should also *represent* the source filename
159 # (in the import directory) which may not be utf8 at all.
[18319]160 # For instance, if this meta element (gsdlsourcefilename) will be used
161 # by other applications that parse doc.xml in order to locate
162 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
163 # encode the real filename as this is a binary-to-text encoding meaning
164 # that the resulting string is ASCII (utf8). Decoding will give the original.
[16578]165
[18319]166# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
167
[16670]168 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
169 my ($srcfilename,$dirname,$suffix)
170 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
[16792]171# print STDERR "-> $srcfilename -> ";
[18319]172 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
[16670]173 $source_filename = &util::filename_cat($dirname, $srcfilename);
[16792]174# print STDERR "$source_filename\n";
[16670]175
[16578]176 $self->set_utf8_metadata_element ($self->get_top_section(),
[1241]177 "gsdlsourcefilename",
178 $source_filename);
179}
180
[7569]181sub set_converted_filename {
182 my $self = shift (@_);
183 my ($converted_filename) = @_;
184
[16578]185 # we know the converted filename is utf8
186 $self->set_utf8_metadata_element ($self->get_top_section(),
[7569]187 "gsdlconvertedfilename",
188 $converted_filename);
189}
190
[1241]191# returns the source_filename as it was provided
[19829]192sub get_unmodified_source_filename {
193 my $self = shift (@_);
194
195 return $self->{'source_path'};
196}
197
198# returns the source_filename with whatever rename_method was given
[1241]199sub get_source_filename {
200 my $self = shift (@_);
201
202 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
203}
204
[19829]205
206
[7569]207# returns converted filename if available else returns source filename
208sub get_filename_for_hashing {
209 my $self = shift (@_);
210
211 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
212
213 if (!defined $filename) {
[11097]214 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
[10980]215 # if NULPlug processed file, then don't give a filename
[11097]216 if (defined $plugin_name && $plugin_name eq "NULPlug") {
[10980]217 $filename = undef;
[16670]218 } else { # returns the URL encoded source filename!
[10980]219 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
220 }
[7569]221 }
222 return $filename;
223}
224
[1241]225sub set_doc_type {
226 my $self = shift (@_);
227 my ($doc_type) = @_;
228
229 $self->set_metadata_element ($self->get_top_section(),
230 "gsdldoctype",
231 $doc_type);
232}
233
[10217]234# returns the gsdldoctype as it was provided
[1241]235# the default of "indexed_doc" is used if no document
236# type was provided
237sub get_doc_type {
238 my $self = shift (@_);
239
240 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
241 return $doc_type if (defined $doc_type);
242 return "indexed_doc";
243}
244
245
246# look up the reference to the a particular section
247sub _lookup_section {
248 my $self = shift (@_);
249 my ($section) = @_;
250
251 my ($num);
252 my $sectionref = $self;
253
254 while (defined $section && $section ne "") {
[12327]255
[1241]256 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
[12327]257
258 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
259
[1241]260 $section = "" unless defined $section;
261
[12327]262
[1241]263 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
264 $sectionref = $sectionref->{'subsections'}->{$num};
265 } else {
266 return undef;
267 }
268 }
269
270 return $sectionref;
271}
272
[2327]273# calculate OID by hashing the contents of the document
[1241]274sub _calc_OID {
275 my $self = shift (@_);
276 my ($filename) = @_;
277
[22855]278
[1241]279 my $osexe = &util::get_os_exe();
280
281 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
282 $ENV{'GSDLOS'},"hashfile$osexe");
[8504]283
[1241]284 my $result = "NULL";
[22855]285
[1679]286
[1241]287 if (-e "$hashfile_exe") {
[1679]288# $result = `\"$hashfile_exe\" \"$filename\"`;
[21862]289# $result = `hashfile$osexe \"$filename\" 2>&1`;
[1679]290 $result = `hashfile$osexe \"$filename\"`;
[21862]291
[1241]292 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
293 } else {
294 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
295 }
296 return "HASH$result";
297}
298
299# methods dealing with OID, not groups of them.
300
[2327]301# if $OID is not provided one is calculated
[1241]302sub set_OID {
303 my $self = shift (@_);
304 my ($OID) = @_;
[8504]305
[8797]306 my $use_hash_oid = 0;
[17057]307 # if an OID wasn't provided calculate one
[1241]308 if (!defined $OID) {
309 $OID = "NULL";
[17025]310 if ($self->{'OIDtype'} =~ /^hash/) {
[8797]311 $use_hash_oid = 1;
312 } elsif ($self->{'OIDtype'} eq "incremental") {
[2327]313 $OID = "D" . $OIDcount;
314 $OIDcount ++;
[8716]315
316 } elsif ($self->{'OIDtype'} eq "dirname") {
317 $OID = 'J';
318 my $filename = $self->get_source_filename();
[8797]319 if (defined($filename)) { # && -e $filename) {
[8716]320 $OID = &File::Basename::dirname($filename);
321 if (defined $OID) {
322 $OID = 'J'.&File::Basename::basename($OID);
[19617]323 $OID = &util::tidy_up_oid($OID);
[8716]324 } else {
[8797]325 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
326 $use_hash_oid = 1;
[8716]327 }
328 } else {
[8797]329 print STDERR "Failed to find filename, generating hash id\n";
330 $use_hash_oid = 1;
[8716]331 }
[2327]332
[8797]333 } elsif ($self->{'OIDtype'} eq "assigned") {
[12268]334 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
[8797]335 if (defined $identifier && $identifier ne "") {
[17025]336 $OID = $identifier;
[19617]337 $OID = &util::tidy_up_oid($OID);
[8797]338 } else {
339 # need a hash id
[12268]340 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
[8797]341 $use_hash_oid = 1;
342 }
343
[2327]344 } else {
[8797]345 $use_hash_oid = 1;
346 }
347
348 if ($use_hash_oid) {
[17025]349 my $hash_on_file = 1;
350 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
351 $hash_on_file = 0;
352 }
353 if ($hash_on_file) {
354 # "hash" OID - feed file to hashfile.exe
355 my $filename = $self->get_filename_for_hashing();
[22855]356
[17025]357 # -z: don't want to hash on the file if it is zero size
358 if (defined($filename) && -e $filename && !-z $filename) {
359 $OID = $self->_calc_OID ($filename);
360 } else {
361 $hash_on_file = 0;
362 }
363 }
364 if (!$hash_on_file) {
365 my $filename = &util::get_tmp_filename();
[22855]366 if (!open (OUTFILE, ">:utf8", $filename)) {
[2327]367 print STDERR "doc::set_OID could not write to $filename\n";
368 } else {
[13172]369 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
370 print OUTFILE $doc_text;
[2327]371 close (OUTFILE);
372 }
373 $OID = $self->_calc_OID ($filename);
374 &util::rm ($filename);
[1374]375 }
[1241]376 }
377 }
378 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
379}
380
381# this uses hashdoc (embedded c thingy) which is faster but still
382# needs a little work to be suffiently stable
383sub ___set_OID {
384 my $self = shift (@_);
385 my ($OID) = @_;
386
387 # if an OID wasn't provided then calculate hash value based on document
388 if (!defined $OID)
389 {
[13172]390 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
[1241]391 my $hash_len = length($hash_text);
392
393 $OID = &hashdoc::buffer($hash_text,$hash_len);
394 }
395
396 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
397}
398
399# returns the OID for this document
400sub get_OID {
401 my $self = shift (@_);
402 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
403 return $OID if (defined $OID);
404 return "NULL";
405}
406
407sub delete_OID {
408 my $self = shift (@_);
409
410 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
411}
412
413
414# methods for manipulating section names
415
416# returns the name of the top-most section (the top
417# level of the document
418sub get_top_section {
419 my $self = shift (@_);
420
421 return "";
422}
423
424# returns a section
425sub get_parent_section {
426 my $self = shift (@_);
427 my ($section) = @_;
428
429 $section =~ s/(^|\.)\d+$//;
430
431 return $section;
432}
433
434# returns the first child section (or the end child
435# if there isn't any)
436sub get_begin_child {
437 my $self = shift (@_);
438 my ($section) = @_;
439
440 my $section_ptr = $self->_lookup_section($section);
441 return "" unless defined $section_ptr;
442
443 if (defined $section_ptr->{'subsection_order'}->[0]) {
444 return "$section.$section_ptr->{'subsection_order'}->[0]";
445 }
446
447 return $self->get_end_child ($section);
448}
449
450# returns the next child of a parent section
451sub get_next_child {
452 my $self = shift (@_);
453 my ($section) = @_;
454
455 my $parent_section = $self->get_parent_section($section);
456 my $parent_section_ptr = $self->_lookup_section($parent_section);
457 return undef unless defined $parent_section_ptr;
458
459 my ($section_num) = $section =~ /(\d+)$/;
460 return undef unless defined $section_num;
461
462 my $i = 0;
463 my $section_order = $parent_section_ptr->{'subsection_order'};
464 while ($i < scalar(@$section_order)) {
465 last if $section_order->[$i] eq $section_num;
466 $i++;
467 }
468
469 $i++; # the next child
470 if ($i < scalar(@$section_order)) {
471 return $section_order->[$i] if $parent_section eq "";
472 return "$parent_section.$section_order->[$i]";
473 }
474
475 # no more sections in this level
476 return undef;
477}
478
479# returns a reference to a list of children
480sub get_children {
481 my $self = shift (@_);
482 my ($section) = @_;
483
484 my $section_ptr = $self->_lookup_section($section);
485 return [] unless defined $section_ptr;
486
487 my @children = @{$section_ptr->{'subsection_order'}};
488
489 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
490 return \@children;
491}
492
493# returns the child section one past the last one (which
494# is coded as "0")
495sub get_end_child {
496 my $self = shift (@_);
497 my ($section) = @_;
498
499 return $section . ".0" unless $section eq "";
500 return "0";
501}
502
503# returns the next section in book order
504sub get_next_section {
505 my $self = shift (@_);
506 my ($section) = @_;
507
508 return undef unless defined $section;
509
510 my $section_ptr = $self->_lookup_section($section);
511 return undef unless defined $section_ptr;
512
513 # first try to find first child
514 if (defined $section_ptr->{'subsection_order'}->[0]) {
515 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
516 return "$section.$section_ptr->{'subsection_order'}->[0]";
517 }
518
519 do {
520 # try to find sibling
521 my $next_child = $self->get_next_child ($section);
522 return $next_child if (defined $next_child);
523
524 # move up one level
525 $section = $self->get_parent_section ($section);
526 } while $section =~ /\d/;
527
528 return undef;
529}
530
531sub is_leaf_section {
532 my $self = shift (@_);
533 my ($section) = @_;
534
535 my $section_ptr = $self->_lookup_section($section);
536 return 1 unless defined $section_ptr;
537
538 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
539}
540
541# methods for dealing with sections
542
543# returns the name of the inserted section
544sub insert_section {
545 my $self = shift (@_);
546 my ($before_section) = @_;
547
548 # get the child to insert before and its parent section
549 my $parent_section = "";
550 my $before_child = "0";
551 my @before_section = split (/\./, $before_section);
552 if (scalar(@before_section) > 0) {
553 $before_child = pop (@before_section);
554 $parent_section = join (".", @before_section);
555 }
556
557 my $parent_section_ptr = $self->_lookup_section($parent_section);
558 if (!defined $parent_section_ptr) {
559 print STDERR "doc::insert_section couldn't find parent section " .
560 "$parent_section\n";
561 return;
562 }
563
564 # get the next section number
565 my $section_num = $parent_section_ptr->{'next_subsection'}++;
566
567 my $i = 0;
568 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
569 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
570 $i++;
571 }
572
573 # insert the section number into the order list
574 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
575
576 # add this section to the parent section
577 my $section_ptr = {'subsection_order'=>[],
578 'next_subsection'=>1,
579 'subsections'=>{},
580 'metadata'=>[],
581 'text'=>""};
582 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
583
584 # work out the full section number
585 my $section = $parent_section;
586 $section .= "." unless $section eq "";
587 $section .= $section_num;
588
589 return $section;
590}
591
592# creates a pre-named section
593sub create_named_section {
594 my $self = shift (@_);
595 my ($mastersection) = @_;
596
597 my ($num);
598 my $section = $mastersection;
599 my $sectionref = $self;
600
601 while ($section ne "") {
602 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
603 $num =~ s/^0+(\d)/$1/; # remove leading 0s
604 $section = "" unless defined $section;
605
606 if (defined $num) {
607 if (!defined $sectionref->{'subsections'}->{$num}) {
608 push (@{$sectionref->{'subsection_order'}}, $num);
609 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
610 'next_subsection'=>1,
611 'subsections'=>{},
612 'metadata'=>[],
613 'text'=>""};
614 if ($num >= $sectionref->{'next_subsection'}) {
615 $sectionref->{'next_subsection'} = $num + 1;
616 }
617 }
618 $sectionref = $sectionref->{'subsections'}->{$num};
619
620 } else {
621 print STDERR "doc::create_named_section couldn't create section ";
622 print STDERR "$mastersection\n";
623 last;
624 }
625 }
626}
627
628# returns a reference to a list of subsections
629sub list_subsections {
630 my $self = shift (@_);
631 my ($section) = @_;
632
633 my $section_ptr = $self->_lookup_section ($section);
634 if (!defined $section_ptr) {
635 print STDERR "doc::list_subsections couldn't find section $section\n";
636 return [];
637 }
638
639 return [@{$section_ptr->{'subsection_order'}}];
640}
641
642sub delete_section {
643 my $self = shift (@_);
644 my ($section) = @_;
645
646# my $section_ptr = {'subsection_order'=>[],
647# 'next_subsection'=>1,
648# 'subsections'=>{},
649# 'metadata'=>[],
650# 'text'=>""};
651
652 # if this is the top section reset everything
653 if ($section eq "") {
654 $self->{'subsection_order'} = [];
655 $self->{'subsections'} = {};
656 $self->{'metadata'} = [];
657 $self->{'text'} = "";
658 return;
659 }
660
661 # find the parent of the section to delete
662 my $parent_section = "";
663 my $child = "0";
664 my @section = split (/\./, $section);
665 if (scalar(@section) > 0) {
666 $child = pop (@section);
667 $parent_section = join (".", @section);
668 }
669
670 my $parent_section_ptr = $self->_lookup_section($parent_section);
671 if (!defined $parent_section_ptr) {
672 print STDERR "doc::delete_section couldn't find parent section " .
673 "$parent_section\n";
674 return;
675 }
676
677 # remove this section from the subsection_order list
678 my $i = 0;
679 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
680 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
681 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
682 last;
683 }
684 $i++;
685 }
686
687 # remove this section from the subsection hash
688 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
689 undef $parent_section_ptr->{'subsections'}->{$child};
690 }
691}
692
693#--
[4]694# methods for dealing with metadata
695
696# set_metadata_element and get_metadata_element are for metadata
697# which should only have one value. add_meta_data and get_metadata
698# are for metadata which can have more than one value.
699
[1241]700# returns the first metadata value which matches field
[6111]701
702# This version of get metadata element works much like the one above,
703# except it allows for the namespace portion of a metadata element to
704# be ignored, thus if you are searching for dc.Title, the first piece
705# of matching metadata ending with the name Title (once any namespace
706# is removed) would be returned.
707# 28-11-2003 John Thompson
[1241]708sub get_metadata_element {
709 my $self = shift (@_);
[6111]710 my ($section, $field, $ignore_namespace) = @_;
[1241]711 my ($data);
712
[6111]713 $ignore_namespace = 0 unless defined $ignore_namespace;
714
[1241]715 my $section_ptr = $self->_lookup_section($section);
716 if (!defined $section_ptr) {
[8716]717 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
[1241]718 return;
719 }
720
[20417]721 # Remove the namespace if we are being told to ignore them
[6111]722 if($ignore_namespace) {
723 $field =~ s/^\w*\.//;
724 }
725
[1241]726 foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]727
728 my $data_name = $data->[0];
[14966]729
[6111]730 # Remove the any namespace if we are being told to ignore them
731 if($ignore_namespace) {
732 $data_name =~ s/^\w*\.//;
733 }
[22294]734 $data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
[6111]735 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
[1241]736 }
737
738 return undef; # was not found
739}
740
741# returns a list of the form [value1, value2, ...]
742sub get_metadata {
743 my $self = shift (@_);
[6111]744 my ($section, $field, $ignore_namespace) = @_;
[1241]745 my ($data);
746
[6111]747 $ignore_namespace = 0 unless defined $ignore_namespace;
748
[1241]749 my $section_ptr = $self->_lookup_section($section);
750 if (!defined $section_ptr) {
[8716]751 print STDERR "doc::get_metadata couldn't find section ",
752 $section, "\n";
[1241]753 return;
754 }
755
[6111]756 # Remove the any namespace if we are being told to ignore them
757 if($ignore_namespace) {
758 $field =~ s/^\w*\.//;
759 }
760
[1241]761 my @metadata = ();
762 foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]763
764 my $data_name = $data->[0];
765 # Remove the any namespace if we are being told to ignore them
766 if($ignore_namespace) {
767 $data_name =~ s/^\w*\.//;
768 }
[22294]769 $data_name =~ s/^ex\.//; # we always remove ex. - it maybe there in doc_obj, but we will never ask for it.
[6111]770 push (@metadata, $data->[1]) if ($data_name eq $field);
[1241]771 }
[9241]772
[1241]773 return \@metadata;
774}
775
776# returns a list of the form [[field,value],[field,value],...]
777sub get_all_metadata {
778 my $self = shift (@_);
779 my ($section) = @_;
780
781 my $section_ptr = $self->_lookup_section($section);
782 if (!defined $section_ptr) {
[8716]783 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
[1241]784 return;
785 }
786
787 return $section_ptr->{'metadata'};
788}
789
790# $value is optional
791sub delete_metadata {
792 my $self = shift (@_);
793 my ($section, $field, $value) = @_;
794
795 my $section_ptr = $self->_lookup_section($section);
796 if (!defined $section_ptr) {
[8716]797 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
[1241]798 return;
799 }
800
801 my $i = 0;
802 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
803 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
804 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
805 splice (@{$section_ptr->{'metadata'}}, $i, 1);
806 } else {
807 $i++;
808 }
809 }
810}
811
812sub delete_all_metadata {
813 my $self = shift (@_);
814 my ($section) = @_;
815
816 my $section_ptr = $self->_lookup_section($section);
817 if (!defined $section_ptr) {
[8716]818 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
[1241]819 return;
820 }
821
822 $section_ptr->{'metadata'} = [];
823}
824
[4]825sub set_metadata_element {
826 my $self = shift (@_);
827 my ($section, $field, $value) = @_;
828
[97]829 $self->set_utf8_metadata_element ($section, $field,
[1870]830 &unicode::ascii2utf8(\$value));
[73]831}
832
833# set_utf8_metadata_element assumes the text has already been
834# converted to the UTF-8 encoding.
835sub set_utf8_metadata_element {
836 my $self = shift (@_);
837 my ($section, $field, $value) = @_;
838
[4]839 $self->delete_metadata ($section, $field);
[73]840 $self->add_utf8_metadata ($section, $field, $value);
[4]841}
842
843
[73]844# add_metadata assumes the text is in (extended) ascii form. For
[8220]845# text which has already been converted to the UTF-8 format use
[73]846# add_utf8_metadata.
[4]847sub add_metadata {
848 my $self = shift (@_);
849 my ($section, $field, $value) = @_;
850
[97]851 $self->add_utf8_metadata ($section, $field,
[1870]852 &unicode::ascii2utf8(\$value));
[73]853}
854
855sub add_utf8_metadata {
856 my $self = shift (@_);
857 my ($section, $field, $value) = @_;
858
[22950]859# my ($cpackage,$cfilename,$cline,$csubr,$chas_args,$cwantarray) = caller(1);
860# my ($lcfilename) = ($cfilename =~ m/([^\\\/]*)$/);
861# print STDERR "** Calling method: $lcfilename:$cline $cpackage->$csubr\n";
862
[4]863 my $section_ptr = $self->_lookup_section($section);
864 if (!defined $section_ptr) {
[8716]865 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
[4]866 return;
867 }
[1732]868 if (!defined $value) {
869 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
870 return;
871 }
872 if (!defined $field) {
873 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
874 return;
875 }
[4]876
[8894]877 #print STDERR "###$field=$value\n";
[22950]878
879 # For now, supress this check. Given that text data read in is now
880 # Unicode aware, then the following block of code can (ironically enough)
881 # cause our unicode compliant string to be re-encoded (leading to
882 # a double-encoded UTF-8 string, which we definitely don't want!).
883
884
[7798]885 # double check that the value is utf-8
[22950]886# if (!&unicode::check_is_utf8($value)) {
887# print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
888# &unicode::ensure_utf8(\$value);
889# print STDERR " Tried converting to utf8: $value\n";
890# }
[7798]891
[4]892 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
893}
894
895
896# methods for dealing with text
897
[1241]898# returns the text for a section
899sub get_text {
900 my $self = shift (@_);
901 my ($section) = @_;
902
903 my $section_ptr = $self->_lookup_section($section);
904 if (!defined $section_ptr) {
905 print STDERR "doc::get_text couldn't find section " .
906 "$section\n";
907 return "";
908 }
909
910 return $section_ptr->{'text'};
911}
912
913# returns the (utf-8 encoded) length of the text for a section
914sub get_text_length {
915 my $self = shift (@_);
916 my ($section) = @_;
917
918 my $section_ptr = $self->_lookup_section($section);
919 if (!defined $section_ptr) {
920 print STDERR "doc::get_text_length couldn't find section " .
921 "$section\n";
922 return 0;
923 }
924
925 return length ($section_ptr->{'text'});
926}
927
[23131]928# returns the total length for all the sections
929sub get_total_text_length {
930 my $self = shift (@_);
931
932 my $section = $self->get_top_section();
933 my $length = 0;
934 while (defined $section) {
935 $length += $self->get_text_length($section);
936 $section = $self->get_next_section($section);
937 }
938 return $length;
939}
940
[1241]941sub delete_text {
942 my $self = shift (@_);
943 my ($section) = @_;
944
945 my $section_ptr = $self->_lookup_section($section);
946 if (!defined $section_ptr) {
947 print STDERR "doc::delete_text couldn't find section " .
948 "$section\n";
949 return;
950 }
951
952 $section_ptr->{'text'} = "";
953}
954
[73]955# add_text assumes the text is in (extended) ascii form. For
956# text which has been already converted to the UTF-8 format
957# use add_utf8_text.
[4]958sub add_text {
959 my $self = shift (@_);
960 my ($section, $text) = @_;
961
[73]962 # convert the text to UTF-8 encoded unicode characters
963 # and add the text
[1870]964 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
[73]965}
966
967
968# add_utf8_text assumes the text to be added has already
969# been converted to the UTF-8 encoding. For ascii text use
970# add_text
971sub add_utf8_text {
972 my $self = shift (@_);
973 my ($section, $text) = @_;
974
[4]975 my $section_ptr = $self->_lookup_section($section);
976 if (!defined $section_ptr) {
[73]977 print STDERR "doc::add_utf8_text couldn't find section " .
[4]978 "$section\n";
979 return;
980 }
981
982 $section_ptr->{'text'} .= $text;
983}
984
[16950]985# returns the Source meta, which is the utf8 filename generated.
986# Added a separate method here for convenience
987sub get_source {
[16924]988 my $self = shift (@_);
[16950]989 return $self->get_metadata_element ($self->get_top_section(), "Source");
990}
991
992# returns the SourceFile meta, which is the url reference to the URL-encoded
993# version of Source (the utf8 filename). Added a separate method here for convenience
994sub get_sourcefile {
995 my $self = shift (@_);
996 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
997}
998
999# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
1000# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
1001sub get_assocfile_from_sourcefile {
1002 my $self = shift (@_);
[16924]1003
1004 # get the SourceFile meta, which is a *URL* to a file on the filesystem
1005 my $top_section = $self->get_top_section();
1006 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
[4]1007
[16924]1008 # get the actual filename as it exists on the filesystem which this url refers to
[16928]1009 $source_file = &unicode::url_to_filename($source_file);
[16924]1010 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
1011 return $assocfilename;
1012}
1013
[1241]1014# methods for dealing with associated files
1015
1016# a file is associated with a document, NOT a section.
1017# if section is defined it is noted in the data structure
1018# only so that files associated from a particular section
1019# may be removed later (using delete_section_assoc_files)
1020sub associate_file {
1021 my $self = shift (@_);
1022 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1023 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1024
1025 # remove all associated files with the same name
1026 $self->delete_assoc_file ($assoc_filename);
[8504]1027
[1241]1028 push (@{$self->{'associated_files'}},
1029 [$real_filename, $assoc_filename, $mime_type, $section]);
1030}
1031
1032# returns a list of associated files in the form
1033# [[real_filename, assoc_filename, mimetype], ...]
1034sub get_assoc_files {
1035 my $self = shift (@_);
1036
1037 return $self->{'associated_files'};
1038}
1039
[20775]1040# the following two methods used to keep track of original associated files
1041# for incremental building. eg a txt file used by an item file does not end
1042# up as an assoc file for the doc.xml, but it needs to be recorded as a source
1043# file for incremental build
1044sub associate_source_file {
1045 my $self = shift (@_);
1046 my ($full_filename) = @_;
[19494]1047
[20775]1048 push (@{$self->{'source_assoc_files'}}, $full_filename);
1049
1050}
1051
1052sub get_source_assoc_files {
1053 my $self = shift (@_);
1054
1055 return $self->{'source_assoc_files'};
1056
1057
1058}
[19494]1059sub metadata_file {
1060 my $self = shift (@_);
1061 my ($real_filename, $filename) = @_;
1062
1063 push (@{$self->{'metadata_files'}},
1064 [$real_filename, $filename]);
1065}
1066
[21566]1067# used for writing out the archiveinf-doc info database, to list all the metadata files
[19494]1068sub get_meta_files {
1069 my $self = shift (@_);
1070
1071 return $self->{'metadata_files'};
1072}
1073
[1241]1074sub delete_section_assoc_files {
1075 my $self = shift (@_);
1076 my ($section) = @_;
1077
1078 my $i=0;
1079 while ($i < scalar (@{$self->{'associated_files'}})) {
1080 if (defined $self->{'associated_files'}->[$i]->[3] &&
1081 $self->{'associated_files'}->[$i]->[3] eq $section) {
1082 splice (@{$self->{'associated_files'}}, $i, 1);
1083 } else {
1084 $i++;
1085 }
1086 }
1087}
1088
1089sub delete_assoc_file {
1090 my $self = shift (@_);
1091 my ($assoc_filename) = @_;
1092
1093 my $i=0;
1094 while ($i < scalar (@{$self->{'associated_files'}})) {
1095 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1096 splice (@{$self->{'associated_files'}}, $i, 1);
1097 } else {
1098 $i++;
1099 }
1100 }
1101}
1102
1103sub reset_nextsection_ptr {
1104 my $self = shift (@_);
1105 my ($section) = @_;
1106
1107 my $section_ptr = $self->_lookup_section($section);
1108 $section_ptr->{'next_subsection'} = 1;
1109}
1110
[4]11111;
Note: See TracBrowser for help on using the repository browser.