source: gsdl/trunk/perllib/doc.pm@ 19494

Last change on this file since 19494 was 19494, checked in by davidb, 15 years ago

Supporting routines that exploit the new 'metafiles' structures, introduction to track which metadata.xml file a piece of metadata came from

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 28.5 KB
RevLine 
[537]1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
[8894]10# This program is free software; you can redistr te it and/or modify
[537]11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
[1241]26# base class to hold documents
[4]27
28package doc;
[3834]29eval {require bytes};
[4]30
[832]31BEGIN {
[1241]32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
[832]34}
35
[15894]36use strict;
[1241]37use unicode;
38use util;
39use ghtml;
[8220]40use File::stat;
[1241]41##use hashdoc;
[13172]42use docprint;
[1241]43
[4]44# the document type may be indexed_doc, nonindexed_doc, or
45# classification
46
[18528]47our $OIDcount = 0;
[2327]48
[4]49sub new {
50 my $class = shift (@_);
[18319]51 my ($source_filename, $doc_type, $rename_method) = @_;
[1374]52
[13770]53
[1241]54 my $self = bless {'associated_files'=>[],
55 'subsection_order'=>[],
56 'next_subsection'=>1,
57 'subsections'=>{},
58 'metadata'=>[],
[2327]59 'text'=>"",
60 'OIDtype'=>"hash"}, $class;
[4]61
[10217]62 # used to set lastmodified here, but this can screw up the HASH ids, so
63 # the docsave processor now calls set_lastmodified
[13770]64
65 $self->{'source_path'} = $source_filename;
[10217]66
[7929]67 if (defined $source_filename) {
[15874]68 $source_filename = &util::filename_within_collection($source_filename);
[18508]69 print STDERR "****** doc.pm::new(): no file rename method provided\n" unless $rename_method;
[18319]70 $self->set_source_filename ($source_filename, $rename_method);
[7929]71 }
72
[1374]73 $self->set_doc_type ($doc_type) if defined $doc_type;
[485]74
[4]75 return $self;
76}
[10217]77# set lastmodified for OAI purposes, added by GRB, moved by kjdon
78sub set_lastmodified {
79 my $self = shift (@_);
[4]80
[13770]81 my $source_path = $self->{'source_path'};
82
83 if (defined $source_path && (-e $source_path)) {
84
85 my $file_stat = stat($source_path);
[10217]86 my $mtime = $file_stat->mtime;
87 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
88 }
89}
90
[1241]91# clone the $self object
92sub duplicate {
93 my $self = shift (@_);
94
95 my $newobj = {};
96
[8716]97 foreach my $k (keys %$self) {
[1241]98 $newobj->{$k} = &clone ($self->{$k});
99 }
100
101 bless $newobj, ref($self);
102 return $newobj;
103}
104
105sub clone {
106 my ($from) = @_;
107 my $type = ref ($from);
108
109 if ($type eq "HASH") {
110 my $to = {};
[8716]111 foreach my $key (keys %$from) {
[1241]112 $to->{$key} = &clone ($from->{$key});
113 }
114 return $to;
115 } elsif ($type eq "ARRAY") {
116 my $to = [];
[8716]117 foreach my $v (@$from) {
[1241]118 push (@$to, &clone ($v));
119 }
120 return $to;
121 } else {
122 return $from;
123 }
124}
125
[2327]126sub set_OIDtype {
127 my $self = shift (@_);
[12268]128 my ($type, $metadata) = @_;
[1241]129
[17025]130 if (defined $type && $type =~ /^(hash|hash_on_file|hash_on_ga_xml|incremental|dirname|assigned)$/) {
[2327]131 $self->{'OIDtype'} = $type;
132 } else {
133 $self->{'OIDtype'} = "hash";
134 }
[16792]135
[12268]136 if ($type =~ /^assigned$/) {
137 if (defined $metadata) {
138 $self->{'OIDmetadata'} = $metadata;
139 } else {
140 $self->{'OIDmetadata'} = "dc.Identifier";
141 }
142 }
[2327]143}
144
[1241]145sub set_source_filename {
146 my $self = shift (@_);
[18319]147 my ($source_filename, $rename_method) = @_;
[1241]148
[16578]149 # Since the gsdlsourcefilename element goes into the doc.xml it has
150 # to be utf8. However, it should also *represent* the source filename
151 # (in the import directory) which may not be utf8 at all.
[18319]152 # For instance, if this meta element (gsdlsourcefilename) will be used
153 # by other applications that parse doc.xml in order to locate
154 # gsdlsourcefilename. Therefore, the solution is to URLencode or base64
155 # encode the real filename as this is a binary-to-text encoding meaning
156 # that the resulting string is ASCII (utf8). Decoding will give the original.
[16578]157
[18319]158# print STDERR "******URL/base64 encoding the gsdl_source_filename $source_filename ";
159
[16670]160 # URLencode just the gsdl_source_filename, not the directory. Then prepend dir
161 my ($srcfilename,$dirname,$suffix)
162 = &File::Basename::fileparse($source_filename, "\\.[^\\.]+\$");
[16792]163# print STDERR "-> $srcfilename -> ";
[18319]164 $srcfilename = &util::rename_file($srcfilename.$suffix, $rename_method);
[16670]165 $source_filename = &util::filename_cat($dirname, $srcfilename);
[16792]166# print STDERR "$source_filename\n";
[16670]167
[16578]168 $self->set_utf8_metadata_element ($self->get_top_section(),
[1241]169 "gsdlsourcefilename",
170 $source_filename);
171}
172
[7569]173sub set_converted_filename {
174 my $self = shift (@_);
175 my ($converted_filename) = @_;
176
[16578]177 # we know the converted filename is utf8
178 $self->set_utf8_metadata_element ($self->get_top_section(),
[7569]179 "gsdlconvertedfilename",
180 $converted_filename);
181}
182
[1241]183# returns the source_filename as it was provided
184sub get_source_filename {
185 my $self = shift (@_);
186
187 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
188}
189
[7569]190# returns converted filename if available else returns source filename
191sub get_filename_for_hashing {
192 my $self = shift (@_);
193
194 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
195
196 if (!defined $filename) {
[11097]197 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
[10980]198 # if NULPlug processed file, then don't give a filename
[11097]199 if (defined $plugin_name && $plugin_name eq "NULPlug") {
[10980]200 $filename = undef;
[16670]201 } else { # returns the URL encoded source filename!
[10980]202 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
203 }
[7569]204 }
205 return $filename;
206}
207
[1241]208sub set_doc_type {
209 my $self = shift (@_);
210 my ($doc_type) = @_;
211
212 $self->set_metadata_element ($self->get_top_section(),
213 "gsdldoctype",
214 $doc_type);
215}
216
[10217]217# returns the gsdldoctype as it was provided
[1241]218# the default of "indexed_doc" is used if no document
219# type was provided
220sub get_doc_type {
221 my $self = shift (@_);
222
223 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
224 return $doc_type if (defined $doc_type);
225 return "indexed_doc";
226}
227
228
229# look up the reference to the a particular section
230sub _lookup_section {
231 my $self = shift (@_);
232 my ($section) = @_;
233
234 my ($num);
235 my $sectionref = $self;
236
237 while (defined $section && $section ne "") {
[12327]238
[1241]239 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
[12327]240
241 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
242
[1241]243 $section = "" unless defined $section;
244
[12327]245
[1241]246 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
247 $sectionref = $sectionref->{'subsections'}->{$num};
248 } else {
249 return undef;
250 }
251 }
252
253 return $sectionref;
254}
255
[2327]256# calculate OID by hashing the contents of the document
[1241]257sub _calc_OID {
258 my $self = shift (@_);
259 my ($filename) = @_;
260
261 my $osexe = &util::get_os_exe();
262
263 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
264 $ENV{'GSDLOS'},"hashfile$osexe");
[8504]265
[1241]266 my $result = "NULL";
[1679]267
[1241]268 if (-e "$hashfile_exe") {
[1679]269# $result = `\"$hashfile_exe\" \"$filename\"`;
270 $result = `hashfile$osexe \"$filename\"`;
[1241]271 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
272 } else {
273 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
274 }
275 return "HASH$result";
276}
277
278# methods dealing with OID, not groups of them.
279
[2327]280# if $OID is not provided one is calculated
[1241]281sub set_OID {
282 my $self = shift (@_);
283 my ($OID) = @_;
[8504]284
[8797]285 my $use_hash_oid = 0;
[17057]286 # if an OID wasn't provided calculate one
[1241]287 if (!defined $OID) {
288 $OID = "NULL";
[17025]289 if ($self->{'OIDtype'} =~ /^hash/) {
[8797]290 $use_hash_oid = 1;
291 } elsif ($self->{'OIDtype'} eq "incremental") {
[2327]292 $OID = "D" . $OIDcount;
293 $OIDcount ++;
[8716]294
295 } elsif ($self->{'OIDtype'} eq "dirname") {
296 $OID = 'J';
297 my $filename = $self->get_source_filename();
[8797]298 if (defined($filename)) { # && -e $filename) {
[8716]299 $OID = &File::Basename::dirname($filename);
300 if (defined $OID) {
301 $OID = 'J'.&File::Basename::basename($OID);
[8797]302 $OID =~ s/\.//; #remove any periods
[8716]303 } else {
[8797]304 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
305 $use_hash_oid = 1;
[8716]306 }
307 } else {
[8797]308 print STDERR "Failed to find filename, generating hash id\n";
309 $use_hash_oid = 1;
[8716]310 }
[2327]311
[8797]312 } elsif ($self->{'OIDtype'} eq "assigned") {
[12268]313 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
[8797]314 if (defined $identifier && $identifier ne "") {
[17025]315 $OID = $identifier;
[17057]316 if ($OID =~ /\./) {
317 print STDERR "Warning, assigned identifier $identifier contains periods (.), removing them\n";
318 $OID =~ s/\.//g; #remove any periods
319 }
[18561]320 if ($OID =~ /^[\d]*$/) {
[17057]321 print STDERR "Warning, assigned identifier $identifier contains only digits. Prepending 'D'.\n";
322 $OID = "D" . $OID;
[18561]323 }
[8797]324 } else {
325 # need a hash id
[12268]326 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
[8797]327 $use_hash_oid = 1;
328 }
329
[2327]330 } else {
[8797]331 $use_hash_oid = 1;
332 }
333
334 if ($use_hash_oid) {
[17025]335 my $hash_on_file = 1;
336 if ($self->{'OIDtype'} eq "hash_on_ga_xml") {
337 $hash_on_file = 0;
338 }
339 if ($hash_on_file) {
340 # "hash" OID - feed file to hashfile.exe
341 my $filename = $self->get_filename_for_hashing();
342 # -z: don't want to hash on the file if it is zero size
343 if (defined($filename) && -e $filename && !-z $filename) {
344 $OID = $self->_calc_OID ($filename);
345 } else {
346 $hash_on_file = 0;
347 }
348 }
349 if (!$hash_on_file) {
350 my $filename = &util::get_tmp_filename();
[2327]351 if (!open (OUTFILE, ">$filename")) {
352 print STDERR "doc::set_OID could not write to $filename\n";
353 } else {
[13172]354 my $doc_text = &docprint::get_section_xml($self, $self->get_top_section());
355 print OUTFILE $doc_text;
[2327]356 close (OUTFILE);
357 }
358 $OID = $self->_calc_OID ($filename);
359 &util::rm ($filename);
[1374]360 }
[1241]361 }
362 }
363 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
364}
365
366# this uses hashdoc (embedded c thingy) which is faster but still
367# needs a little work to be suffiently stable
368sub ___set_OID {
369 my $self = shift (@_);
370 my ($OID) = @_;
371
372 # if an OID wasn't provided then calculate hash value based on document
373 if (!defined $OID)
374 {
[13172]375 my $hash_text = &docprint::get_section_xml($self, $self->get_top_section());
[1241]376 my $hash_len = length($hash_text);
377
378 $OID = &hashdoc::buffer($hash_text,$hash_len);
379 }
380
381 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
382}
383
384# returns the OID for this document
385sub get_OID {
386 my $self = shift (@_);
387 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
388 return $OID if (defined $OID);
389 return "NULL";
390}
391
392sub delete_OID {
393 my $self = shift (@_);
394
395 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
396}
397
398
399# methods for manipulating section names
400
401# returns the name of the top-most section (the top
402# level of the document
403sub get_top_section {
404 my $self = shift (@_);
405
406 return "";
407}
408
409# returns a section
410sub get_parent_section {
411 my $self = shift (@_);
412 my ($section) = @_;
413
414 $section =~ s/(^|\.)\d+$//;
415
416 return $section;
417}
418
419# returns the first child section (or the end child
420# if there isn't any)
421sub get_begin_child {
422 my $self = shift (@_);
423 my ($section) = @_;
424
425 my $section_ptr = $self->_lookup_section($section);
426 return "" unless defined $section_ptr;
427
428 if (defined $section_ptr->{'subsection_order'}->[0]) {
429 return "$section.$section_ptr->{'subsection_order'}->[0]";
430 }
431
432 return $self->get_end_child ($section);
433}
434
435# returns the next child of a parent section
436sub get_next_child {
437 my $self = shift (@_);
438 my ($section) = @_;
439
440 my $parent_section = $self->get_parent_section($section);
441 my $parent_section_ptr = $self->_lookup_section($parent_section);
442 return undef unless defined $parent_section_ptr;
443
444 my ($section_num) = $section =~ /(\d+)$/;
445 return undef unless defined $section_num;
446
447 my $i = 0;
448 my $section_order = $parent_section_ptr->{'subsection_order'};
449 while ($i < scalar(@$section_order)) {
450 last if $section_order->[$i] eq $section_num;
451 $i++;
452 }
453
454 $i++; # the next child
455 if ($i < scalar(@$section_order)) {
456 return $section_order->[$i] if $parent_section eq "";
457 return "$parent_section.$section_order->[$i]";
458 }
459
460 # no more sections in this level
461 return undef;
462}
463
464# returns a reference to a list of children
465sub get_children {
466 my $self = shift (@_);
467 my ($section) = @_;
468
469 my $section_ptr = $self->_lookup_section($section);
470 return [] unless defined $section_ptr;
471
472 my @children = @{$section_ptr->{'subsection_order'}};
473
474 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
475 return \@children;
476}
477
478# returns the child section one past the last one (which
479# is coded as "0")
480sub get_end_child {
481 my $self = shift (@_);
482 my ($section) = @_;
483
484 return $section . ".0" unless $section eq "";
485 return "0";
486}
487
488# returns the next section in book order
489sub get_next_section {
490 my $self = shift (@_);
491 my ($section) = @_;
492
493 return undef unless defined $section;
494
495 my $section_ptr = $self->_lookup_section($section);
496 return undef unless defined $section_ptr;
497
498 # first try to find first child
499 if (defined $section_ptr->{'subsection_order'}->[0]) {
500 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
501 return "$section.$section_ptr->{'subsection_order'}->[0]";
502 }
503
504 do {
505 # try to find sibling
506 my $next_child = $self->get_next_child ($section);
507 return $next_child if (defined $next_child);
508
509 # move up one level
510 $section = $self->get_parent_section ($section);
511 } while $section =~ /\d/;
512
513 return undef;
514}
515
516sub is_leaf_section {
517 my $self = shift (@_);
518 my ($section) = @_;
519
520 my $section_ptr = $self->_lookup_section($section);
521 return 1 unless defined $section_ptr;
522
523 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
524}
525
526# methods for dealing with sections
527
528# returns the name of the inserted section
529sub insert_section {
530 my $self = shift (@_);
531 my ($before_section) = @_;
532
533 # get the child to insert before and its parent section
534 my $parent_section = "";
535 my $before_child = "0";
536 my @before_section = split (/\./, $before_section);
537 if (scalar(@before_section) > 0) {
538 $before_child = pop (@before_section);
539 $parent_section = join (".", @before_section);
540 }
541
542 my $parent_section_ptr = $self->_lookup_section($parent_section);
543 if (!defined $parent_section_ptr) {
544 print STDERR "doc::insert_section couldn't find parent section " .
545 "$parent_section\n";
546 return;
547 }
548
549 # get the next section number
550 my $section_num = $parent_section_ptr->{'next_subsection'}++;
551
552 my $i = 0;
553 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
554 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
555 $i++;
556 }
557
558 # insert the section number into the order list
559 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
560
561 # add this section to the parent section
562 my $section_ptr = {'subsection_order'=>[],
563 'next_subsection'=>1,
564 'subsections'=>{},
565 'metadata'=>[],
566 'text'=>""};
567 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
568
569 # work out the full section number
570 my $section = $parent_section;
571 $section .= "." unless $section eq "";
572 $section .= $section_num;
573
574 return $section;
575}
576
577# creates a pre-named section
578sub create_named_section {
579 my $self = shift (@_);
580 my ($mastersection) = @_;
581
582 my ($num);
583 my $section = $mastersection;
584 my $sectionref = $self;
585
586 while ($section ne "") {
587 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
588 $num =~ s/^0+(\d)/$1/; # remove leading 0s
589 $section = "" unless defined $section;
590
591 if (defined $num) {
592 if (!defined $sectionref->{'subsections'}->{$num}) {
593 push (@{$sectionref->{'subsection_order'}}, $num);
594 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
595 'next_subsection'=>1,
596 'subsections'=>{},
597 'metadata'=>[],
598 'text'=>""};
599 if ($num >= $sectionref->{'next_subsection'}) {
600 $sectionref->{'next_subsection'} = $num + 1;
601 }
602 }
603 $sectionref = $sectionref->{'subsections'}->{$num};
604
605 } else {
606 print STDERR "doc::create_named_section couldn't create section ";
607 print STDERR "$mastersection\n";
608 last;
609 }
610 }
611}
612
613# returns a reference to a list of subsections
614sub list_subsections {
615 my $self = shift (@_);
616 my ($section) = @_;
617
618 my $section_ptr = $self->_lookup_section ($section);
619 if (!defined $section_ptr) {
620 print STDERR "doc::list_subsections couldn't find section $section\n";
621 return [];
622 }
623
624 return [@{$section_ptr->{'subsection_order'}}];
625}
626
627sub delete_section {
628 my $self = shift (@_);
629 my ($section) = @_;
630
631# my $section_ptr = {'subsection_order'=>[],
632# 'next_subsection'=>1,
633# 'subsections'=>{},
634# 'metadata'=>[],
635# 'text'=>""};
636
637 # if this is the top section reset everything
638 if ($section eq "") {
639 $self->{'subsection_order'} = [];
640 $self->{'subsections'} = {};
641 $self->{'metadata'} = [];
642 $self->{'text'} = "";
643 return;
644 }
645
646 # find the parent of the section to delete
647 my $parent_section = "";
648 my $child = "0";
649 my @section = split (/\./, $section);
650 if (scalar(@section) > 0) {
651 $child = pop (@section);
652 $parent_section = join (".", @section);
653 }
654
655 my $parent_section_ptr = $self->_lookup_section($parent_section);
656 if (!defined $parent_section_ptr) {
657 print STDERR "doc::delete_section couldn't find parent section " .
658 "$parent_section\n";
659 return;
660 }
661
662 # remove this section from the subsection_order list
663 my $i = 0;
664 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
665 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
666 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
667 last;
668 }
669 $i++;
670 }
671
672 # remove this section from the subsection hash
673 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
674 undef $parent_section_ptr->{'subsections'}->{$child};
675 }
676}
677
678#--
[4]679# methods for dealing with metadata
680
681# set_metadata_element and get_metadata_element are for metadata
682# which should only have one value. add_meta_data and get_metadata
683# are for metadata which can have more than one value.
684
[1241]685# returns the first metadata value which matches field
[6111]686
687# This version of get metadata element works much like the one above,
688# except it allows for the namespace portion of a metadata element to
689# be ignored, thus if you are searching for dc.Title, the first piece
690# of matching metadata ending with the name Title (once any namespace
691# is removed) would be returned.
692# 28-11-2003 John Thompson
[1241]693sub get_metadata_element {
694 my $self = shift (@_);
[6111]695 my ($section, $field, $ignore_namespace) = @_;
[1241]696 my ($data);
697
[6111]698 $ignore_namespace = 0 unless defined $ignore_namespace;
699
[1241]700 my $section_ptr = $self->_lookup_section($section);
701 if (!defined $section_ptr) {
[8716]702 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
[1241]703 return;
704 }
705
[6111]706 # Remove the any namespace if we are being told to ignore them
707 if($ignore_namespace) {
708 $field =~ s/^\w*\.//;
709 }
710
[1241]711 foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]712
713 my $data_name = $data->[0];
[14966]714
[6111]715 # Remove the any namespace if we are being told to ignore them
716 if($ignore_namespace) {
717 $data_name =~ s/^\w*\.//;
718 }
719
720 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
[1241]721 }
722
723 return undef; # was not found
724}
725
726# returns a list of the form [value1, value2, ...]
727sub get_metadata {
728 my $self = shift (@_);
[6111]729 my ($section, $field, $ignore_namespace) = @_;
[1241]730 my ($data);
731
[6111]732 $ignore_namespace = 0 unless defined $ignore_namespace;
733
[1241]734 my $section_ptr = $self->_lookup_section($section);
735 if (!defined $section_ptr) {
[8716]736 print STDERR "doc::get_metadata couldn't find section ",
737 $section, "\n";
[1241]738 return;
739 }
740
[6111]741 # Remove the any namespace if we are being told to ignore them
742 if($ignore_namespace) {
743 $field =~ s/^\w*\.//;
744 }
745
[1241]746 my @metadata = ();
747 foreach $data (@{$section_ptr->{'metadata'}}) {
[6111]748
749 my $data_name = $data->[0];
750 # Remove the any namespace if we are being told to ignore them
751 if($ignore_namespace) {
752 $data_name =~ s/^\w*\.//;
753 }
754
755 push (@metadata, $data->[1]) if ($data_name eq $field);
[1241]756 }
[9241]757
[1241]758 return \@metadata;
759}
760
761# returns a list of the form [[field,value],[field,value],...]
762sub get_all_metadata {
763 my $self = shift (@_);
764 my ($section) = @_;
765
766 my $section_ptr = $self->_lookup_section($section);
767 if (!defined $section_ptr) {
[8716]768 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
[1241]769 return;
770 }
771
772 return $section_ptr->{'metadata'};
773}
774
775# $value is optional
776sub delete_metadata {
777 my $self = shift (@_);
778 my ($section, $field, $value) = @_;
779
780 my $section_ptr = $self->_lookup_section($section);
781 if (!defined $section_ptr) {
[8716]782 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
[1241]783 return;
784 }
785
786 my $i = 0;
787 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
788 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
789 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
790 splice (@{$section_ptr->{'metadata'}}, $i, 1);
791 } else {
792 $i++;
793 }
794 }
795}
796
797sub delete_all_metadata {
798 my $self = shift (@_);
799 my ($section) = @_;
800
801 my $section_ptr = $self->_lookup_section($section);
802 if (!defined $section_ptr) {
[8716]803 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
[1241]804 return;
805 }
806
807 $section_ptr->{'metadata'} = [];
808}
809
[4]810sub set_metadata_element {
811 my $self = shift (@_);
812 my ($section, $field, $value) = @_;
813
[97]814 $self->set_utf8_metadata_element ($section, $field,
[1870]815 &unicode::ascii2utf8(\$value));
[73]816}
817
818# set_utf8_metadata_element assumes the text has already been
819# converted to the UTF-8 encoding.
820sub set_utf8_metadata_element {
821 my $self = shift (@_);
822 my ($section, $field, $value) = @_;
823
[4]824 $self->delete_metadata ($section, $field);
[73]825 $self->add_utf8_metadata ($section, $field, $value);
[4]826}
827
828
[73]829# add_metadata assumes the text is in (extended) ascii form. For
[8220]830# text which has already been converted to the UTF-8 format use
[73]831# add_utf8_metadata.
[4]832sub add_metadata {
833 my $self = shift (@_);
834 my ($section, $field, $value) = @_;
835
[97]836 $self->add_utf8_metadata ($section, $field,
[1870]837 &unicode::ascii2utf8(\$value));
[73]838}
839
840sub add_utf8_metadata {
841 my $self = shift (@_);
842 my ($section, $field, $value) = @_;
843
[4]844 my $section_ptr = $self->_lookup_section($section);
845 if (!defined $section_ptr) {
[8716]846 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
[4]847 return;
848 }
[1732]849 if (!defined $value) {
850 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
851 return;
852 }
853 if (!defined $field) {
854 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
855 return;
856 }
[4]857
[8894]858 #print STDERR "###$field=$value\n";
[7798]859 # double check that the value is utf-8
[16578]860 if (!&unicode::check_is_utf8($value)) {
[16670]861 print STDERR "doc::add_utf8_metadata - warning: '$field''s value $value wasn't utf8.";
[16578]862 &unicode::ensure_utf8(\$value);
[16670]863 print STDERR " Tried converting to utf8: $value\n";
[7798]864 }
865
[4]866 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
867}
868
869
870# methods for dealing with text
871
[1241]872# returns the text for a section
873sub get_text {
874 my $self = shift (@_);
875 my ($section) = @_;
876
877 my $section_ptr = $self->_lookup_section($section);
878 if (!defined $section_ptr) {
879 print STDERR "doc::get_text couldn't find section " .
880 "$section\n";
881 return "";
882 }
883
884 return $section_ptr->{'text'};
885}
886
887# returns the (utf-8 encoded) length of the text for a section
888sub get_text_length {
889 my $self = shift (@_);
890 my ($section) = @_;
891
892 my $section_ptr = $self->_lookup_section($section);
893 if (!defined $section_ptr) {
894 print STDERR "doc::get_text_length couldn't find section " .
895 "$section\n";
896 return 0;
897 }
898
899 return length ($section_ptr->{'text'});
900}
901
902sub delete_text {
903 my $self = shift (@_);
904 my ($section) = @_;
905
906 my $section_ptr = $self->_lookup_section($section);
907 if (!defined $section_ptr) {
908 print STDERR "doc::delete_text couldn't find section " .
909 "$section\n";
910 return;
911 }
912
913 $section_ptr->{'text'} = "";
914}
915
[73]916# add_text assumes the text is in (extended) ascii form. For
917# text which has been already converted to the UTF-8 format
918# use add_utf8_text.
[4]919sub add_text {
920 my $self = shift (@_);
921 my ($section, $text) = @_;
922
[73]923 # convert the text to UTF-8 encoded unicode characters
924 # and add the text
[1870]925 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
[73]926}
927
928
929# add_utf8_text assumes the text to be added has already
930# been converted to the UTF-8 encoding. For ascii text use
931# add_text
932sub add_utf8_text {
933 my $self = shift (@_);
934 my ($section, $text) = @_;
935
[4]936 my $section_ptr = $self->_lookup_section($section);
937 if (!defined $section_ptr) {
[73]938 print STDERR "doc::add_utf8_text couldn't find section " .
[4]939 "$section\n";
940 return;
941 }
942
943 $section_ptr->{'text'} .= $text;
944}
945
[16950]946# returns the Source meta, which is the utf8 filename generated.
947# Added a separate method here for convenience
948sub get_source {
[16924]949 my $self = shift (@_);
[16950]950 return $self->get_metadata_element ($self->get_top_section(), "Source");
951}
952
953# returns the SourceFile meta, which is the url reference to the URL-encoded
954# version of Source (the utf8 filename). Added a separate method here for convenience
955sub get_sourcefile {
956 my $self = shift (@_);
957 return $self->get_metadata_element ($self->get_top_section(), "SourceFile");
958}
959
960# Get the actual name of the assocfile, a url-encoded string derived from SourceFile.
961# The SourceFile meta is the (escaped) url reference to the url-encoded assocfile.
962sub get_assocfile_from_sourcefile {
963 my $self = shift (@_);
[16924]964
965 # get the SourceFile meta, which is a *URL* to a file on the filesystem
966 my $top_section = $self->get_top_section();
967 my $source_file = $self->get_metadata_element($top_section, "SourceFile");
[4]968
[16924]969 # get the actual filename as it exists on the filesystem which this url refers to
[16928]970 $source_file = &unicode::url_to_filename($source_file);
[16924]971 my ($assocfilename) = $source_file =~ /([^\\\/]+)$/;
972 return $assocfilename;
973}
974
[1241]975# methods for dealing with associated files
976
977# a file is associated with a document, NOT a section.
978# if section is defined it is noted in the data structure
979# only so that files associated from a particular section
980# may be removed later (using delete_section_assoc_files)
981sub associate_file {
982 my $self = shift (@_);
983 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
984 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
985
986 # remove all associated files with the same name
987 $self->delete_assoc_file ($assoc_filename);
[8504]988
[1241]989 push (@{$self->{'associated_files'}},
990 [$real_filename, $assoc_filename, $mime_type, $section]);
991}
992
993# returns a list of associated files in the form
994# [[real_filename, assoc_filename, mimetype], ...]
995sub get_assoc_files {
996 my $self = shift (@_);
997
998 return $self->{'associated_files'};
999}
1000
[19494]1001
1002sub metadata_file {
1003 my $self = shift (@_);
1004 my ($real_filename, $filename) = @_;
1005
1006 push (@{$self->{'metadata_files'}},
1007 [$real_filename, $filename]);
1008}
1009
1010sub get_meta_files {
1011 my $self = shift (@_);
1012
1013 return $self->{'metadata_files'};
1014}
1015
[1241]1016sub delete_section_assoc_files {
1017 my $self = shift (@_);
1018 my ($section) = @_;
1019
1020 my $i=0;
1021 while ($i < scalar (@{$self->{'associated_files'}})) {
1022 if (defined $self->{'associated_files'}->[$i]->[3] &&
1023 $self->{'associated_files'}->[$i]->[3] eq $section) {
1024 splice (@{$self->{'associated_files'}}, $i, 1);
1025 } else {
1026 $i++;
1027 }
1028 }
1029}
1030
1031sub delete_assoc_file {
1032 my $self = shift (@_);
1033 my ($assoc_filename) = @_;
1034
1035 my $i=0;
1036 while ($i < scalar (@{$self->{'associated_files'}})) {
1037 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1038 splice (@{$self->{'associated_files'}}, $i, 1);
1039 } else {
1040 $i++;
1041 }
1042 }
1043}
1044
1045sub reset_nextsection_ptr {
1046 my $self = shift (@_);
1047 my ($section) = @_;
1048
1049 my $section_ptr = $self->_lookup_section($section);
1050 $section_ptr->{'next_subsection'} = 1;
1051}
1052
[4]10531;
Note: See TracBrowser for help on using the repository browser.