source: main/tags/2.71/gsdl/perllib/doc.pm@ 25382

Last change on this file since 25382 was 13050, checked in by kjdon, 18 years ago

moved all the mets outputting methods from doc.pm into METSPlugout. deleted output_dspace_section method from doc.pm cos it wasn't being used by anything. _escape_text is now a class method - so other classes can use it

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 32.2 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use unicode;
37use util;
38use ghtml;
39use File::stat;
40##use hashdoc;
41
42# the document type may be indexed_doc, nonindexed_doc, or
43# classification
44
45my $OIDcount = 0;
46
47sub new {
48 my $class = shift (@_);
49 my ($source_filename, $doc_type) = @_;
50
51 my $self = bless {'associated_files'=>[],
52 'subsection_order'=>[],
53 'next_subsection'=>1,
54 'subsections'=>{},
55 'metadata'=>[],
56 'text'=>"",
57 'OIDtype'=>"hash"}, $class;
58
59 # used to set lastmodified here, but this can screw up the HASH ids, so
60 # the docsave processor now calls set_lastmodified
61
62 if (defined $source_filename) {
63 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
64
65 if (defined $collect_dir) {
66 my $dirsep = &util::get_dirsep();
67 if ($collect_dir !~ m/$dirsep$/) {
68 $collect_dir .= $dirsep;
69 }
70
71 $collect_dir =~ s/\\/\\\\/g; # escape DOS style file separator
72
73 # if from within GSDLCOLLECTDIR, then remove directory prefix
74 # so source_filename is realative to it. This is done to aid
75 # portability, i.e. the collection can be moved to somewhere
76 # else on the file system and the archives directory will still
77 # work. This is needed, for example in the applet version of
78 # GLI where GSDLHOME/collect on the server will be different to
79 # the collect directory of the remove user. Of course,
80 # GSDLCOLLECTDIR subsequently needs to be put back on to turn
81 # it back into a full pathname.
82
83 if ($source_filename =~ /^$collect_dir(.*)$/) {
84 $source_filename = $1;
85 }
86 }
87
88 $self->set_source_filename ($source_filename);
89 }
90
91 $self->set_doc_type ($doc_type) if defined $doc_type;
92
93 return $self;
94}
95# set lastmodified for OAI purposes, added by GRB, moved by kjdon
96sub set_lastmodified {
97 my $self = shift (@_);
98
99 my $source_filename = $self->get_source_filename();
100 if ((defined $self->get_metadata_element ($self->get_top_section(), "gsdldoctype")) &&
101 (defined $source_filename) && (-e $source_filename)) {
102 my $file_stat = stat($source_filename);
103 my $mtime = $file_stat->mtime;
104 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
105 }
106}
107
108# clone the $self object
109sub duplicate {
110 my $self = shift (@_);
111
112 my $newobj = {};
113
114 foreach my $k (keys %$self) {
115 $newobj->{$k} = &clone ($self->{$k});
116 }
117
118 bless $newobj, ref($self);
119 return $newobj;
120}
121
122sub clone {
123 my ($from) = @_;
124 my $type = ref ($from);
125
126 if ($type eq "HASH") {
127 my $to = {};
128 foreach my $key (keys %$from) {
129 $to->{$key} = &clone ($from->{$key});
130 }
131 return $to;
132 } elsif ($type eq "ARRAY") {
133 my $to = [];
134 foreach my $v (@$from) {
135 push (@$to, &clone ($v));
136 }
137 return $to;
138 } else {
139 return $from;
140 }
141}
142
143sub set_OIDtype {
144 my $self = shift (@_);
145 my ($type, $metadata) = @_;
146
147 if (defined $type && $type =~ /^(hash|incremental|dirname|assigned)$/) {
148 $self->{'OIDtype'} = $type;
149 } else {
150 $self->{'OIDtype'} = "hash";
151 }
152 if ($type =~ /^assigned$/) {
153 if (defined $metadata) {
154 $self->{'OIDmetadata'} = $metadata;
155 } else {
156 $self->{'OIDmetadata'} = "dc.Identifier";
157 }
158 }
159}
160
161sub set_source_filename {
162 my $self = shift (@_);
163 my ($source_filename) = @_;
164
165 $self->set_metadata_element ($self->get_top_section(),
166 "gsdlsourcefilename",
167 $source_filename);
168}
169
170sub set_converted_filename {
171 my $self = shift (@_);
172 my ($converted_filename) = @_;
173
174 $self->set_metadata_element ($self->get_top_section(),
175 "gsdlconvertedfilename",
176 $converted_filename);
177}
178
179
180# returns the source_filename as it was provided
181sub get_source_filename {
182 my $self = shift (@_);
183
184 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
185}
186
187# returns converted filename if available else returns source filename
188sub get_filename_for_hashing {
189 my $self = shift (@_);
190
191 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
192
193 if (!defined $filename) {
194 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
195 # if NULPlug processed file, then don't give a filename
196 if (defined $plugin_name && $plugin_name eq "NULPlug") {
197 $filename = undef;
198 } else {
199 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
200 }
201 }
202 return $filename;
203}
204
205sub set_doc_type {
206 my $self = shift (@_);
207 my ($doc_type) = @_;
208
209 $self->set_metadata_element ($self->get_top_section(),
210 "gsdldoctype",
211 $doc_type);
212}
213
214# returns the gsdldoctype as it was provided
215# the default of "indexed_doc" is used if no document
216# type was provided
217sub get_doc_type {
218 my $self = shift (@_);
219
220 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
221 return $doc_type if (defined $doc_type);
222 return "indexed_doc";
223}
224
225sub _escape_text {
226 my $self = shift (@_);
227 my ($text) = @_;
228 # special characters in the gml encoding
229 $text =~ s/&/&/g; # this has to be first...
230 $text =~ s/</&lt;/g;
231 $text =~ s/>/&gt;/g;
232 $text =~ s/\"/&quot;/g;
233
234 return $text;
235}
236
237sub buffer_section_xml {
238 my $self = shift (@_);
239 my ($section) = @_;
240
241 my $section_ptr = $self->_lookup_section ($section);
242 return "" unless defined $section_ptr;
243
244 my $all_text = "<Section>\n";
245 $all_text .= " <Description>\n";
246
247 # output metadata
248 foreach my $data (@{$section_ptr->{'metadata'}}) {
249 my $escaped_value = $self->_escape_text($data->[1]);
250 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
251 }
252
253 $all_text .= " </Description>\n";
254
255 # output the text
256 $all_text .= " <Content>";
257 $all_text .= $self->_escape_text($section_ptr->{'text'});
258 $all_text .= "</Content>\n";
259
260 # output all the subsections
261 foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
262 $all_text .= $self->buffer_section_xml("$section.$subsection");
263 }
264
265 $all_text .= "</Section>\n";
266
267 # make sure no nasty control characters have snuck through
268 # (XML::Parser will barf on anything it doesn't consider to be
269 # valid UTF-8 text, including things like \c@, \cC etc.)
270 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
271
272 return $all_text;
273}
274
275
276
277
278sub output_section {
279 my $self = shift (@_);
280 my ($handle, $section) = @_;
281
282 print $handle $self->buffer_section_xml($section);
283}
284
285my $dc_set = { Title => 1,
286 Creator => 1,
287 Subject => 1,
288 Description => 1,
289 Publisher => 1,
290 Contributor => 1,
291 Date => 1,
292 Type => 1,
293 Format => 1,
294 Identifier => 1,
295 Source => 1,
296 Language => 1,
297 Relation => 1,
298 Coverage => 1,
299 Rights => 1};
300
301
302
303
304# Build up dublin_core metadata. Priority given to dc.* over ex.*
305# This method was apparently added by Jeffrey and committed by Shaoqun.
306# But we don't know why it was added, so not using it anymore.
307sub new_buffer_dc_section {
308
309 my $self = shift(@_);
310 my ($section, $version) = @_;
311
312 # build up string of dublin core metadata
313 $section="" unless defined $section;
314
315 my $section_ptr=$self->_lookup_section($section);
316 return "" unless defined $section_ptr;
317 foreach my $data (@{$section_ptr->{'metadata'}}){
318 my $escaped_value = $self->_escape_text($data->[1]);
319 my $dc_element = $data->[0];
320
321 my @array = split('\.',$dc_element);
322 my ($type,$name);
323
324 if(defined $array[1])
325 {
326 $type = $array[0];
327 $name = $array[1];
328 }
329 else
330 {
331 $type = "ex";
332 $name = $array[0];
333 }
334
335 $all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
336 }
337 return $all_text;
338}
339
340
341sub buffer_dc_section {
342 my $self = shift(@_);
343 my ($section, $version) = @_;
344
345 # build up string of dublin core metadata
346 $section="" unless defined $section;
347
348 my $section_ptr=$self->_lookup_section($section);
349 return "" unless defined $section_ptr;
350
351
352 my $explicit_dc = {};
353 my $explicit_ex = {};
354
355 my $all_text="";
356 foreach my $data (@{$section_ptr->{'metadata'}}){
357 foreach my $temp (@$data)
358 {
359 print "($temp) ";
360 }
361 print "\n";
362 my $escaped_value = $self->_escape_text($data->[1]);
363 if ($data->[0]=~ m/^dc\./) {
364 $data->[0] =~ tr/[A-Z]/[a-z]/;
365
366 $data->[0] =~ m/^dc\.(.*)/;
367 my $dc_element = $1;
368
369 if (!defined $explicit_dc->{$dc_element}) {
370 $explicit_dc->{$dc_element} = [];
371 }
372 push(@{$explicit_dc->{$dc_element}},$escaped_value);
373
374 #$all_text .= ' <dcvalue element="'. $data->[0].'" qualifier="#####">'. $escaped_value. "</dcvalue>\n";
375 if (defined $version && ($version eq "oai_dc")) {
376 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
377 }
378 else {
379 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
380 }
381
382 }
383 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) {
384 $data->[0] =~ m/^(ex\.)?(.*)/;
385 my $ex_element = $2;
386 my $lc_ex_element = lc($ex_element);
387
388 if (defined $dc_set->{$ex_element}) {
389 if (!defined $explicit_ex->{$lc_ex_element}) {
390 $explicit_ex->{$lc_ex_element} = [];
391 }
392 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
393 }
394 }
395 }
396
397 # go through dc_set and for any element *not* defined in explicit_dc
398 # that do exist in explicit_ex, add it in as metadata
399 foreach my $k ( keys %$dc_set ) {
400 my $lc_k = lc($k);
401
402 if (!defined $explicit_dc->{$lc_k}) {
403 if (defined $explicit_ex->{$lc_k}) {
404
405 foreach my $v (@{$explicit_ex->{$lc_k}}) {
406 my $dc_element = $lc_k;
407 my $escaped_value = $v;
408
409 if (defined $version && ($version eq "oai_dc")) {
410 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
411 }
412 else {
413 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
414 }
415
416 }
417 }
418 }
419 }
420
421 if ($all_text eq "") {
422 $all_text .= " There is no Dublin Core metatdata in this document\n";
423 }
424 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
425
426 return $all_text;
427}
428
429
430# Print out dublin_core metadata
431# changed back to using old method, don't know why Jeffrey changed this
432sub output_dc_section {
433 my $self = shift(@_);
434 my ($handle, $section, $version) = @_;
435
436 my $all_text = $self->buffer_dc_section($section,$version);
437 #my $all_text = $self->new_buffer_dc_section($section,$version);
438
439 print $handle $all_text;
440}
441
442
443# look up the reference to the a particular section
444sub _lookup_section {
445 my $self = shift (@_);
446 my ($section) = @_;
447
448 my ($num);
449 my $sectionref = $self;
450
451 while (defined $section && $section ne "") {
452
453 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
454
455 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
456
457 $section = "" unless defined $section;
458
459
460 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
461 $sectionref = $sectionref->{'subsections'}->{$num};
462 } else {
463 return undef;
464 }
465 }
466
467 return $sectionref;
468}
469
470# calculate OID by hashing the contents of the document
471sub _calc_OID {
472 my $self = shift (@_);
473 my ($filename) = @_;
474
475 my $osexe = &util::get_os_exe();
476
477 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
478 $ENV{'GSDLOS'},"hashfile$osexe");
479
480 my $result = "NULL";
481
482 if (-e "$hashfile_exe") {
483# $result = `\"$hashfile_exe\" \"$filename\"`;
484 $result = `hashfile$osexe \"$filename\"`;
485 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
486
487 } else {
488 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
489 }
490 return "HASH$result";
491}
492
493# methods dealing with OID, not groups of them.
494
495# if $OID is not provided one is calculated
496sub set_OID {
497 my $self = shift (@_);
498 my ($OID) = @_;
499
500 my $use_hash_oid = 0;
501 # if an OID wasn't provided claculate one
502 if (!defined $OID) {
503 $OID = "NULL";
504 if ($self->{'OIDtype'} eq "hash") {
505 $use_hash_oid = 1;
506 } elsif ($self->{'OIDtype'} eq "incremental") {
507 $OID = "D" . $OIDcount;
508 $OIDcount ++;
509
510 } elsif ($self->{'OIDtype'} eq "dirname") {
511 $OID = 'J';
512 my $filename = $self->get_source_filename();
513 if (defined($filename)) { # && -e $filename) {
514 $OID = &File::Basename::dirname($filename);
515 if (defined $OID) {
516 $OID = 'J'.&File::Basename::basename($OID);
517 $OID =~ s/\.//; #remove any periods
518 } else {
519 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
520 $use_hash_oid = 1;
521 }
522 } else {
523 print STDERR "Failed to find filename, generating hash id\n";
524 $use_hash_oid = 1;
525 }
526
527 } elsif ($self->{'OIDtype'} eq "assigned") {
528 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
529 if (defined $identifier && $identifier ne "") {
530 $OID = "D" . $identifier;
531 $OID =~ s/\.//; #remove any periods
532 } else {
533 # need a hash id
534 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
535 $use_hash_oid = 1;
536 }
537
538 } else {
539 $use_hash_oid = 1;
540 }
541
542 if ($use_hash_oid) {
543
544 # "hash" OID - feed file to hashfile.exe
545 #my $filename = $self->get_source_filename();
546 # we want to use the converted file for hashing if available
547 # cos its quicker
548 my $filename = $self->get_filename_for_hashing();
549 # -z: don't want to hash on the file if it is zero size
550 if (defined($filename) && -e $filename && !-z $filename) {
551 $OID = $self->_calc_OID ($filename);
552 } else {
553 $filename = &util::get_tmp_filename();
554 if (!open (OUTFILE, ">$filename")) {
555 print STDERR "doc::set_OID could not write to $filename\n";
556 } else {
557 $self->output_section('OUTFILE', $self->get_top_section(),
558 undef, 1);
559 close (OUTFILE);
560 }
561 $OID = $self->_calc_OID ($filename);
562 &util::rm ($filename);
563 }
564 }
565 }
566 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
567}
568
569# this uses hashdoc (embedded c thingy) which is faster but still
570# needs a little work to be suffiently stable
571sub ___set_OID {
572 my $self = shift (@_);
573 my ($OID) = @_;
574
575 # if an OID wasn't provided then calculate hash value based on document
576 if (!defined $OID)
577 {
578 my $hash_text = $self->buffer_section_gml($self->get_top_section(),
579 undef, 1);
580 my $hash_len = length($hash_text);
581
582 $OID = &hashdoc::buffer($hash_text,$hash_len);
583 }
584
585 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
586}
587
588# returns the OID for this document
589sub get_OID {
590 my $self = shift (@_);
591 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
592 return $OID if (defined $OID);
593 return "NULL";
594}
595
596sub delete_OID {
597 my $self = shift (@_);
598
599 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
600}
601
602
603# methods for manipulating section names
604
605# returns the name of the top-most section (the top
606# level of the document
607sub get_top_section {
608 my $self = shift (@_);
609
610 return "";
611}
612
613# returns a section
614sub get_parent_section {
615 my $self = shift (@_);
616 my ($section) = @_;
617
618 $section =~ s/(^|\.)\d+$//;
619
620 return $section;
621}
622
623# returns the first child section (or the end child
624# if there isn't any)
625sub get_begin_child {
626 my $self = shift (@_);
627 my ($section) = @_;
628
629 my $section_ptr = $self->_lookup_section($section);
630 return "" unless defined $section_ptr;
631
632 if (defined $section_ptr->{'subsection_order'}->[0]) {
633 return "$section.$section_ptr->{'subsection_order'}->[0]";
634 }
635
636 return $self->get_end_child ($section);
637}
638
639# returns the next child of a parent section
640sub get_next_child {
641 my $self = shift (@_);
642 my ($section) = @_;
643
644 my $parent_section = $self->get_parent_section($section);
645 my $parent_section_ptr = $self->_lookup_section($parent_section);
646 return undef unless defined $parent_section_ptr;
647
648 my ($section_num) = $section =~ /(\d+)$/;
649 return undef unless defined $section_num;
650
651 my $i = 0;
652 my $section_order = $parent_section_ptr->{'subsection_order'};
653 while ($i < scalar(@$section_order)) {
654 last if $section_order->[$i] eq $section_num;
655 $i++;
656 }
657
658 $i++; # the next child
659 if ($i < scalar(@$section_order)) {
660 return $section_order->[$i] if $parent_section eq "";
661 return "$parent_section.$section_order->[$i]";
662 }
663
664 # no more sections in this level
665 return undef;
666}
667
668# returns a reference to a list of children
669sub get_children {
670 my $self = shift (@_);
671 my ($section) = @_;
672
673 my $section_ptr = $self->_lookup_section($section);
674 return [] unless defined $section_ptr;
675
676 my @children = @{$section_ptr->{'subsection_order'}};
677
678 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
679 return \@children;
680}
681
682# returns the child section one past the last one (which
683# is coded as "0")
684sub get_end_child {
685 my $self = shift (@_);
686 my ($section) = @_;
687
688 return $section . ".0" unless $section eq "";
689 return "0";
690}
691
692# returns the next section in book order
693sub get_next_section {
694 my $self = shift (@_);
695 my ($section) = @_;
696
697 return undef unless defined $section;
698
699 my $section_ptr = $self->_lookup_section($section);
700 return undef unless defined $section_ptr;
701
702 # first try to find first child
703 if (defined $section_ptr->{'subsection_order'}->[0]) {
704 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
705 return "$section.$section_ptr->{'subsection_order'}->[0]";
706 }
707
708 do {
709 # try to find sibling
710 my $next_child = $self->get_next_child ($section);
711 return $next_child if (defined $next_child);
712
713 # move up one level
714 $section = $self->get_parent_section ($section);
715 } while $section =~ /\d/;
716
717 return undef;
718}
719
720sub is_leaf_section {
721 my $self = shift (@_);
722 my ($section) = @_;
723
724 my $section_ptr = $self->_lookup_section($section);
725 return 1 unless defined $section_ptr;
726
727 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
728}
729
730# methods for dealing with sections
731
732# returns the name of the inserted section
733sub insert_section {
734 my $self = shift (@_);
735 my ($before_section) = @_;
736
737 # get the child to insert before and its parent section
738 my $parent_section = "";
739 my $before_child = "0";
740 my @before_section = split (/\./, $before_section);
741 if (scalar(@before_section) > 0) {
742 $before_child = pop (@before_section);
743 $parent_section = join (".", @before_section);
744 }
745
746 my $parent_section_ptr = $self->_lookup_section($parent_section);
747 if (!defined $parent_section_ptr) {
748 print STDERR "doc::insert_section couldn't find parent section " .
749 "$parent_section\n";
750 return;
751 }
752
753 # get the next section number
754 my $section_num = $parent_section_ptr->{'next_subsection'}++;
755
756 my $i = 0;
757 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
758 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
759 $i++;
760 }
761
762 # insert the section number into the order list
763 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
764
765 # add this section to the parent section
766 my $section_ptr = {'subsection_order'=>[],
767 'next_subsection'=>1,
768 'subsections'=>{},
769 'metadata'=>[],
770 'text'=>""};
771 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
772
773 # work out the full section number
774 my $section = $parent_section;
775 $section .= "." unless $section eq "";
776 $section .= $section_num;
777
778 return $section;
779}
780
781# creates a pre-named section
782sub create_named_section {
783 my $self = shift (@_);
784 my ($mastersection) = @_;
785
786 my ($num);
787 my $section = $mastersection;
788 my $sectionref = $self;
789
790 while ($section ne "") {
791 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
792 $num =~ s/^0+(\d)/$1/; # remove leading 0s
793 $section = "" unless defined $section;
794
795 if (defined $num) {
796 if (!defined $sectionref->{'subsections'}->{$num}) {
797 push (@{$sectionref->{'subsection_order'}}, $num);
798 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
799 'next_subsection'=>1,
800 'subsections'=>{},
801 'metadata'=>[],
802 'text'=>""};
803 if ($num >= $sectionref->{'next_subsection'}) {
804 $sectionref->{'next_subsection'} = $num + 1;
805 }
806 }
807 $sectionref = $sectionref->{'subsections'}->{$num};
808
809 } else {
810 print STDERR "doc::create_named_section couldn't create section ";
811 print STDERR "$mastersection\n";
812 last;
813 }
814 }
815}
816
817# returns a reference to a list of subsections
818sub list_subsections {
819 my $self = shift (@_);
820 my ($section) = @_;
821
822 my $section_ptr = $self->_lookup_section ($section);
823 if (!defined $section_ptr) {
824 print STDERR "doc::list_subsections couldn't find section $section\n";
825 return [];
826 }
827
828 return [@{$section_ptr->{'subsection_order'}}];
829}
830
831sub delete_section {
832 my $self = shift (@_);
833 my ($section) = @_;
834
835# my $section_ptr = {'subsection_order'=>[],
836# 'next_subsection'=>1,
837# 'subsections'=>{},
838# 'metadata'=>[],
839# 'text'=>""};
840
841 # if this is the top section reset everything
842 if ($section eq "") {
843 $self->{'subsection_order'} = [];
844 $self->{'subsections'} = {};
845 $self->{'metadata'} = [];
846 $self->{'text'} = "";
847 return;
848 }
849
850 # find the parent of the section to delete
851 my $parent_section = "";
852 my $child = "0";
853 my @section = split (/\./, $section);
854 if (scalar(@section) > 0) {
855 $child = pop (@section);
856 $parent_section = join (".", @section);
857 }
858
859 my $parent_section_ptr = $self->_lookup_section($parent_section);
860 if (!defined $parent_section_ptr) {
861 print STDERR "doc::delete_section couldn't find parent section " .
862 "$parent_section\n";
863 return;
864 }
865
866 # remove this section from the subsection_order list
867 my $i = 0;
868 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
869 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
870 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
871 last;
872 }
873 $i++;
874 }
875
876 # remove this section from the subsection hash
877 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
878 undef $parent_section_ptr->{'subsections'}->{$child};
879 }
880}
881
882#--
883# methods for dealing with metadata
884
885# set_metadata_element and get_metadata_element are for metadata
886# which should only have one value. add_meta_data and get_metadata
887# are for metadata which can have more than one value.
888
889# returns the first metadata value which matches field
890
891# This version of get metadata element works much like the one above,
892# except it allows for the namespace portion of a metadata element to
893# be ignored, thus if you are searching for dc.Title, the first piece
894# of matching metadata ending with the name Title (once any namespace
895# is removed) would be returned.
896# 28-11-2003 John Thompson
897sub get_metadata_element {
898 my $self = shift (@_);
899 my ($section, $field, $ignore_namespace) = @_;
900 my ($data);
901
902 $ignore_namespace = 0 unless defined $ignore_namespace;
903
904 my $section_ptr = $self->_lookup_section($section);
905 if (!defined $section_ptr) {
906 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
907 return;
908 }
909
910 # Remove the any namespace if we are being told to ignore them
911 if($ignore_namespace) {
912 $field =~ s/^\w*\.//;
913 }
914
915 foreach $data (@{$section_ptr->{'metadata'}}) {
916
917 my $data_name = $data->[0];
918 # Remove the any namespace if we are being told to ignore them
919 if($ignore_namespace) {
920 $data_name =~ s/^\w*\.//;
921 }
922
923 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
924 }
925
926 return undef; # was not found
927}
928
929# returns a list of the form [value1, value2, ...]
930sub get_metadata {
931 my $self = shift (@_);
932 my ($section, $field, $ignore_namespace) = @_;
933 my ($data);
934
935 $ignore_namespace = 0 unless defined $ignore_namespace;
936
937 my $section_ptr = $self->_lookup_section($section);
938 if (!defined $section_ptr) {
939 print STDERR "doc::get_metadata couldn't find section ",
940 $section, "\n";
941 return;
942 }
943
944 # Remove the any namespace if we are being told to ignore them
945 if($ignore_namespace) {
946 $field =~ s/^\w*\.//;
947 }
948
949 my @metadata = ();
950 foreach $data (@{$section_ptr->{'metadata'}}) {
951
952 my $data_name = $data->[0];
953 # Remove the any namespace if we are being told to ignore them
954 if($ignore_namespace) {
955 $data_name =~ s/^\w*\.//;
956 }
957
958 push (@metadata, $data->[1]) if ($data_name eq $field);
959 }
960
961 return \@metadata;
962}
963
964# returns a list of the form [[field,value],[field,value],...]
965sub get_all_metadata {
966 my $self = shift (@_);
967 my ($section) = @_;
968
969 my $section_ptr = $self->_lookup_section($section);
970 if (!defined $section_ptr) {
971 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
972 return;
973 }
974
975 return $section_ptr->{'metadata'};
976}
977
978# returns a xml element of the form <MetadataList><Metadata name="metadata-name">metadata_value</Metadata>...</MetadataList>
979sub get_top_metadata_list{
980 my $self = shift (@_);
981
982 my @topmetadata =$self->get_all_metadata($self->get_top_section());
983 my $metadatalist ='<MetadataList>';
984
985 foreach my $i (@topmetadata){
986 foreach my $j (@$i){
987 my %metaMap = @$j;
988 foreach my $key (keys %metaMap){
989 $metadatalist .='<Metadata name='."\"$key\"".'>'.$self->_escape_text($metaMap{$key}).'</Metadata>'."\n";
990 }
991 }
992 }
993
994 $metadatalist .='</MetadataList>';
995 return $metadatalist;
996}
997
998
999# $value is optional
1000sub delete_metadata {
1001 my $self = shift (@_);
1002 my ($section, $field, $value) = @_;
1003
1004 my $section_ptr = $self->_lookup_section($section);
1005 if (!defined $section_ptr) {
1006 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
1007 return;
1008 }
1009
1010 my $i = 0;
1011 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1012 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1013 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1014 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1015 } else {
1016 $i++;
1017 }
1018 }
1019}
1020
1021sub delete_all_metadata {
1022 my $self = shift (@_);
1023 my ($section) = @_;
1024
1025 my $section_ptr = $self->_lookup_section($section);
1026 if (!defined $section_ptr) {
1027 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1028 return;
1029 }
1030
1031 $section_ptr->{'metadata'} = [];
1032}
1033
1034sub set_metadata_element {
1035 my $self = shift (@_);
1036 my ($section, $field, $value) = @_;
1037
1038 $self->set_utf8_metadata_element ($section, $field,
1039 &unicode::ascii2utf8(\$value));
1040}
1041
1042# set_utf8_metadata_element assumes the text has already been
1043# converted to the UTF-8 encoding.
1044sub set_utf8_metadata_element {
1045 my $self = shift (@_);
1046 my ($section, $field, $value) = @_;
1047
1048 $self->delete_metadata ($section, $field);
1049 $self->add_utf8_metadata ($section, $field, $value);
1050}
1051
1052
1053# add_metadata assumes the text is in (extended) ascii form. For
1054# text which has already been converted to the UTF-8 format use
1055# add_utf8_metadata.
1056sub add_metadata {
1057 my $self = shift (@_);
1058 my ($section, $field, $value) = @_;
1059
1060 $self->add_utf8_metadata ($section, $field,
1061 &unicode::ascii2utf8(\$value));
1062}
1063
1064sub add_utf8_metadata {
1065 my $self = shift (@_);
1066 my ($section, $field, $value) = @_;
1067
1068 my $section_ptr = $self->_lookup_section($section);
1069 if (!defined $section_ptr) {
1070 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1071 return;
1072 }
1073 if (!defined $value) {
1074 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1075 return;
1076 }
1077 if (!defined $field) {
1078 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1079 return;
1080 }
1081
1082 #print STDERR "###$field=$value\n";
1083 # double check that the value is utf-8
1084 if (unicode::ensure_utf8(\$value)) {
1085 print STDERR "doc::add_utf8_metadata: warning: '$field' wasn't utf8\n";
1086 }
1087
1088 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1089}
1090
1091
1092# methods for dealing with text
1093
1094# returns the text for a section
1095sub get_text {
1096 my $self = shift (@_);
1097 my ($section) = @_;
1098
1099 my $section_ptr = $self->_lookup_section($section);
1100 if (!defined $section_ptr) {
1101 print STDERR "doc::get_text couldn't find section " .
1102 "$section\n";
1103 return "";
1104 }
1105
1106 return $section_ptr->{'text'};
1107}
1108
1109# returns the (utf-8 encoded) length of the text for a section
1110sub get_text_length {
1111 my $self = shift (@_);
1112 my ($section) = @_;
1113
1114 my $section_ptr = $self->_lookup_section($section);
1115 if (!defined $section_ptr) {
1116 print STDERR "doc::get_text_length couldn't find section " .
1117 "$section\n";
1118 return 0;
1119 }
1120
1121 return length ($section_ptr->{'text'});
1122}
1123
1124sub delete_text {
1125 my $self = shift (@_);
1126 my ($section) = @_;
1127
1128 my $section_ptr = $self->_lookup_section($section);
1129 if (!defined $section_ptr) {
1130 print STDERR "doc::delete_text couldn't find section " .
1131 "$section\n";
1132 return;
1133 }
1134
1135 $section_ptr->{'text'} = "";
1136}
1137
1138# add_text assumes the text is in (extended) ascii form. For
1139# text which has been already converted to the UTF-8 format
1140# use add_utf8_text.
1141sub add_text {
1142 my $self = shift (@_);
1143 my ($section, $text) = @_;
1144
1145 # convert the text to UTF-8 encoded unicode characters
1146 # and add the text
1147 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1148}
1149
1150
1151# add_utf8_text assumes the text to be added has already
1152# been converted to the UTF-8 encoding. For ascii text use
1153# add_text
1154sub add_utf8_text {
1155 my $self = shift (@_);
1156 my ($section, $text) = @_;
1157
1158 my $section_ptr = $self->_lookup_section($section);
1159 if (!defined $section_ptr) {
1160 print STDERR "doc::add_utf8_text couldn't find section " .
1161 "$section\n";
1162 return;
1163 }
1164
1165 $section_ptr->{'text'} .= $text;
1166}
1167
1168
1169# methods for dealing with associated files
1170
1171# a file is associated with a document, NOT a section.
1172# if section is defined it is noted in the data structure
1173# only so that files associated from a particular section
1174# may be removed later (using delete_section_assoc_files)
1175sub associate_file {
1176 my $self = shift (@_);
1177 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1178 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1179
1180 # remove all associated files with the same name
1181 $self->delete_assoc_file ($assoc_filename);
1182
1183 push (@{$self->{'associated_files'}},
1184 [$real_filename, $assoc_filename, $mime_type, $section]);
1185}
1186
1187# returns a list of associated files in the form
1188# [[real_filename, assoc_filename, mimetype], ...]
1189sub get_assoc_files {
1190 my $self = shift (@_);
1191
1192 return $self->{'associated_files'};
1193}
1194
1195sub delete_section_assoc_files {
1196 my $self = shift (@_);
1197 my ($section) = @_;
1198
1199 my $i=0;
1200 while ($i < scalar (@{$self->{'associated_files'}})) {
1201 if (defined $self->{'associated_files'}->[$i]->[3] &&
1202 $self->{'associated_files'}->[$i]->[3] eq $section) {
1203 splice (@{$self->{'associated_files'}}, $i, 1);
1204 } else {
1205 $i++;
1206 }
1207 }
1208}
1209
1210sub delete_assoc_file {
1211 my $self = shift (@_);
1212 my ($assoc_filename) = @_;
1213
1214 my $i=0;
1215 while ($i < scalar (@{$self->{'associated_files'}})) {
1216 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1217 splice (@{$self->{'associated_files'}}, $i, 1);
1218 } else {
1219 $i++;
1220 }
1221 }
1222}
1223
1224sub reset_nextsection_ptr {
1225 my $self = shift (@_);
1226 my ($section) = @_;
1227
1228 my $section_ptr = $self->_lookup_section($section);
1229 $section_ptr->{'next_subsection'} = 1;
1230}
1231
12321;
Note: See TracBrowser for help on using the repository browser.