source: trunk/gsdl/perllib/doc.pm@ 7909

Last change on this file since 7909 was 7902, checked in by chi, 20 years ago

Saving of documents (in archive format) extended to generate METS format
as alternative to GreenstoneArchive (GA) format. Controlled through
'import.pl -saveas METS ...'

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 32.1 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use unicode;
37use util;
38use ghtml;
39##use hashdoc;
40
41# the document type may be indexed_doc, nonindexed_doc, or
42# classification
43
44my $OIDcount = 0;
45
46sub new {
47 my $class = shift (@_);
48 my ($source_filename, $doc_type) = @_;
49
50 my $self = bless {'associated_files'=>[],
51 'subsection_order'=>[],
52 'next_subsection'=>1,
53 'subsections'=>{},
54 'metadata'=>[],
55 'text'=>"",
56 'OIDtype'=>"hash"}, $class;
57
58 $self->set_source_filename ($source_filename) if defined $source_filename;
59 $self->set_doc_type ($doc_type) if defined $doc_type;
60
61 return $self;
62}
63
64# clone the $self object
65sub duplicate {
66 my $self = shift (@_);
67
68 my $newobj = {};
69
70 foreach $k (keys %$self) {
71 $newobj->{$k} = &clone ($self->{$k});
72 }
73
74 bless $newobj, ref($self);
75 return $newobj;
76}
77
78sub clone {
79 my ($from) = @_;
80 my $type = ref ($from);
81
82 if ($type eq "HASH") {
83 my $to = {};
84 foreach $key (keys %$from) {
85 $to->{$key} = &clone ($from->{$key});
86 }
87 return $to;
88 } elsif ($type eq "ARRAY") {
89 my $to = [];
90 foreach $v (@$from) {
91 push (@$to, &clone ($v));
92 }
93 return $to;
94 } else {
95 return $from;
96 }
97}
98
99sub set_OIDtype {
100 my $self = shift (@_);
101 my ($type) = @_;
102
103 if ($type eq "incremental") {
104 $self->{'OIDtype'} = $type;
105 } else {
106 $self->{'OIDtype'} = "hash";
107 }
108}
109
110sub set_source_filename {
111 my $self = shift (@_);
112 my ($source_filename) = @_;
113
114 $self->set_metadata_element ($self->get_top_section(),
115 "gsdlsourcefilename",
116 $source_filename);
117}
118
119sub set_converted_filename {
120 my $self = shift (@_);
121 my ($converted_filename) = @_;
122
123 $self->set_metadata_element ($self->get_top_section(),
124 "gsdlconvertedfilename",
125 $converted_filename);
126}
127
128
129# returns the source_filename as it was provided
130sub get_source_filename {
131 my $self = shift (@_);
132
133 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
134}
135
136# returns converted filename if available else returns source filename
137sub get_filename_for_hashing {
138 my $self = shift (@_);
139
140 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
141
142 if (!defined $filename) {
143 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
144 }
145 return $filename;
146}
147
148sub set_doc_type {
149 my $self = shift (@_);
150 my ($doc_type) = @_;
151
152 $self->set_metadata_element ($self->get_top_section(),
153 "gsdldoctype",
154 $doc_type);
155}
156
157# returns the source_filename as it was provided
158# the default of "indexed_doc" is used if no document
159# type was provided
160sub get_doc_type {
161 my $self = shift (@_);
162
163 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
164 return $doc_type if (defined $doc_type);
165 return "indexed_doc";
166}
167
168sub _escape_text {
169 my ($text) = @_;
170
171 # special characters in the gml encoding
172 $text =~ s/&/&/g; # this has to be first...
173 $text =~ s/</&lt;/g;
174 $text =~ s/>/&gt;/g;
175 $text =~ s/\"/&quot;/g;
176
177 return $text;
178}
179
180sub buffer_section_xml {
181 my $self = shift (@_);
182 my ($section) = @_;
183
184 my $section_ptr = $self->_lookup_section ($section);
185 return "" unless defined $section_ptr;
186
187 my $all_text = "<Section>\n";
188 $all_text .= " <Description>\n";
189
190 # output metadata
191 foreach my $data (@{$section_ptr->{'metadata'}}) {
192 my $escaped_value = &_escape_text($data->[1]);
193 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
194 }
195
196 $all_text .= " </Description>\n";
197
198 # output the text
199 $all_text .= " <Content>";
200 $all_text .= &_escape_text($section_ptr->{'text'});
201 $all_text .= "</Content>\n";
202
203 # output all the subsections
204 foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
205 $all_text .= $self->buffer_section_xml("$section.$subsection");
206 }
207
208 $all_text .= "</Section>\n";
209
210 # make sure no nasty control characters have snuck through
211 # (XML::Parser will barf on anything it doesn't consider to be
212 # valid UTF-8 text, including things like \c@, \cC etc.)
213 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
214
215 return $all_text;
216}
217
218sub buffer_txt_section_xml {
219 my $self = shift(@_);
220 my ($section) = @_;
221
222 my $section_ptr = $self->_lookup_section ($section);
223
224 return "" unless defined $section_ptr;
225
226 my $all_text = "<Section>\n";
227
228 ##output the text
229 #$all_text .= " <Content>";
230 $all_text .= &_escape_text($section_ptr->{'text'});
231 #$all_text .= " </Content>\n";
232
233
234 #output all the subsections
235 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
236 $all_text .= $self->buffer_txt_section_xml("$section.$subsection");
237 }
238
239 $all_text .= "</Section>\n";
240
241
242 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
243 return $all_text;
244}
245
246sub buffer_mets_fileSection_section_xml() {
247 my $self = shift(@_);
248 my ($section, $doc_Dir) = @_;
249
250 $section="" unless defined $section;
251
252
253 my $section_ptr=$self->_lookup_section($section);
254 return "" unless defined $section_ptr;
255
256
257 #**output fileSection by sections
258 my $section_num ="1". $section;
259
260
261 my $filePath = $doc_Dir . '/doctxt.xml';
262
263 #**output the fileSection details
264 my $all_text = ' <mets:fileGrp ID="FILEGROUP_PRELUDE' . $section_num . '">'. "\n";
265 $all_text .= ' <mets:file MIMETYPE="text/xml" ID="FILE'.$section_num. '">'. "\n";
266 $all_text .= ' <mets:FLocate LOCTYPE="URL" xlink:href="file:'.$filePath.'#xpointer(/Section[';
267
268 my $xpath = "1".$section;
269
270 $xpath =~ s/\./]\/Section[/g;
271
272 $all_text .= $xpath;
273
274 $all_text .= ']/text())" ID="FILE'. $section_num. '" />' . "\n";
275 $all_text .= " </mets:file>\n";
276 $all_text .= " </mets:fileGrp>\n";
277
278
279 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
280 $all_text .= $self->buffer_mets_fileSection_section_xml("$section.$subsection",$doc_Dir);
281 }
282
283 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
284
285 return $all_text;
286}
287
288sub buffer_mets_fileWhole_section_xml(){
289 my $self = shift(@_);
290 my ($section) = @_;
291
292 my $section_ptr = $self-> _lookup_section($section);
293 return "" unless defined $section_ptr;
294
295 my $all_text="" unless defined $all_txt;
296
297 my ($dirPath)="" unless defined $dirPath;
298 my $fileID=0;
299
300 #** output the fileSection for the whole section
301 #*** get the sourcefile and associative file
302
303 foreach my $data (@{$section_ptr->{'metadata'}}){
304 my $escaped_value = &_escape_text($data->[1]);
305 if ($data->[0] eq "gsdlsourcefilename") {
306 ($dirPath) = $escaped_value =~ m/^(.*)[\/\\][^\/\\]*$/;
307
308 $all_text .= ' <mets:fileGrp ID="default">'."\n";
309 ++$fileID;
310 $all_text .= ' <mets:file MIMETYPE="text/xml" ID="default.'.$fileID.'">'. "\n";
311 $all_text .= ' <mets:FLocate LOCTYPE="URL" xlink:href="file:'.$data->[1].'" ID="default.'.$fileID.'" />'."\n";
312
313 $all_text .= " </mets:file>\n";
314 }
315
316 if ($data->[0] eq "gsdlassocfile"){
317 $escaped_value =~ m/^(.*?):(.*):$/;
318
319 my $assfilePath = $dirPath . '/'. $1;
320 ++$fileID;
321 $all_text .= ' <mets:file MIMETYPE="'.$2.'" ID="default.'.$fileID. '">'. "\n";
322 $all_text .= ' <mets:FLocate LOCTYPE="URL" xlink:href="file:'.$assfilePath.'" ID="default.'. $fileID.'" />'."\n";
323
324 $all_text .= " </mets:file>\n";
325 }
326 }
327 $all_text .= " </mets:fileGrp>\n";
328
329
330 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
331
332 return $all_text;
333}
334
335sub buffer_mets_StruMapSection_section_xml(){
336 my $self = shift(@_);
337 my ($section, $order_numref) = @_;
338
339 $section="" unless defined $section;
340
341
342 my $section_ptr=$self->_lookup_section($section);
343 return "" unless defined $section_ptr;
344
345 #**output fileSection by sections
346 my $section_num ="1". $section;
347
348 #**output the StruMap details
349
350 my $all_text = ' <mets:div ID="DS'. $section_num .'" TYPE="Section" ORDER="'.$$order_numref++.'" ORDERLABEL="'. $section_num .'" LABEL="';
351 $all_text .= $section_num . '" DMDID="DM'.$section_num.'">'. "\n";
352
353 $all_text .= ' <mets:fptr FILEID="FILEGROUP_PRELUDE'.$section_num.'" />'. "\n";
354
355
356 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
357 $all_text .= $self->buffer_mets_StruMapSection_section_xml("$section.$subsection", $order_numref);
358 }
359
360 $all_text .= " </mets:div>\n";
361
362 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
363
364 return $all_text;
365}
366
367
368sub buffer_mets_StruMapWhole_section_xml(){
369 my $self = shift(@_);
370 my ($section) = @_;
371
372 my $section_ptr = $self-> _lookup_section($section);
373 return "" unless defined $section_ptr;
374
375 my $all_text="" unless defined $all_txt;
376 my $fileID=0;
377
378 $all_text .= '<mets:structMap ID="All" TYPE="Whole Document" LABEL="All">'."\n";
379 $all_text .= ' <mets:div ID="All" TYPE="Document" ORDER="All" ORDERLABEL="All" LABEL="Whole Documemt" DMDID="DM1">' . "\n";
380
381
382 #** output the StruMapSection for the whole section
383 #*** get the sourcefile and associative file
384
385 foreach my $data (@{$section_ptr->{'metadata'}}){
386 my $escaped_value = &_escape_text($data->[1]);
387
388 if ($data->[0] eq "gsdlsourcefilename") {
389 ++$fileID;
390 $all_text .= ' <mets:fptr FILEID="default.'.$fileID.'" />'."\n";
391 }
392
393 if ($data->[0] eq "gsdlassocfile"){
394 ++$fileID;
395 $all_text .= ' <mets:fptr FILEID="default.'.$fileID. '" />'. "\n";
396 }
397 }
398 $all_text .= " </mets:div>\n";
399
400 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
401
402 return $all_text;
403}
404
405
406sub buffer_mets_dmdSection_section_xml(){
407 my $self = shift(@_);
408 my ($section) = @_;
409
410 $section="" unless defined $section;
411
412 my $section_ptr=$self->_lookup_section($section);
413 return "" unless defined $section_ptr;
414
415 #***convert section number
416 my $section_num ="1". $section;
417
418 #**output the dmdSection details
419 my $all_text = '<mets:dmdSec ID="DM'.$section_num.'" GROUPID="'.$section_num.'">'. "\n";
420 $all_text .= ' <mets:mdWrap MDType="gsdl" ID="'.$section_num.'">'."\n";
421 $all_text .= " <mets:xmlData>\n";
422 foreach my $data (@{$section_ptr->{'metadata'}}){
423 my $escaped_value = &_escape_text($data->[1]);
424 $all_text .= ' <gsdl:Metadata name="'. $data->[0].'">'. $escaped_value. "</gsdl:Metadata>\n";
425 }
426 $all_text .= " </mets:xmlData>\n";
427 $all_text .= " </mets:mdWrap>\n";
428 $all_text .= "</mets:dmdSec>\n";
429
430 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
431 $all_text .= $self->buffer_mets_dmdSection_section_xml("$section.$subsection");
432 }
433
434 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
435
436 return $all_text;
437}
438
439sub output_section {
440 my $self = shift (@_);
441 my ($handle, $section) = @_;
442
443 print $handle $self->buffer_section_xml($section);
444}
445
446
447#*** print out doctxt.xml file
448sub output_txt_section {
449 my $self = shift (@_);
450 my ($handle, $section) = @_;
451
452 print $handle $self->buffer_txt_section_xml($section);
453}
454
455#*** print out docmets.xml file
456sub output_mets_section {
457 my $self = shift(@_);
458 my ($handle, $section, $doc_Dir) = @_;
459
460 #***print out the dmdSection
461 print $handle $self->buffer_mets_dmdSection_section_xml($section);
462
463 #***print out the fileSection by sections
464 print $handle "<mets:fileSec>\n";
465 print $handle $self->buffer_mets_fileSection_section_xml($section,$doc_Dir);
466 #***print out the whole fileSection
467 print $handle $self->buffer_mets_fileWhole_section_xml($section);
468 print $handle "</mets:fileSec>\n";
469
470 #***print out the StruMapSection by sections
471 print $handle '<mets:structMap ID="Section" TYPE="Section" LABEL="Section">' . "\n";
472 my $order_num=0;
473 print $handle $self->buffer_mets_StruMapSection_section_xml($section, \$order_num);
474 print $handle "</mets:structMap>\n";
475 print $handle $self->buffer_mets_StruMapWhole_section_xml($section);
476 print $handle "</mets:structMap>\n";
477}
478
479
480# look up the reference to the a particular section
481sub _lookup_section {
482 my $self = shift (@_);
483 my ($section) = @_;
484
485 my ($num);
486 my $sectionref = $self;
487
488 while (defined $section && $section ne "") {
489 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
490 $num =~ s/^0+(\d)/$1/; # remove leading 0s
491 $section = "" unless defined $section;
492
493 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
494 $sectionref = $sectionref->{'subsections'}->{$num};
495 } else {
496 return undef;
497 }
498 }
499
500 return $sectionref;
501}
502
503# calculate OID by hashing the contents of the document
504sub _calc_OID {
505 my $self = shift (@_);
506 my ($filename) = @_;
507
508 my $osexe = &util::get_os_exe();
509
510 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
511 $ENV{'GSDLOS'},"hashfile$osexe");
512 my $result = "NULL";
513
514 if (-e "$hashfile_exe") {
515# $result = `\"$hashfile_exe\" \"$filename\"`;
516 $result = `hashfile$osexe \"$filename\"`;
517 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
518
519 } else {
520 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
521 }
522
523 return "HASH$result";
524}
525
526# methods dealing with OID, not groups of them.
527
528# if $OID is not provided one is calculated
529sub set_OID {
530 my $self = shift (@_);
531 my ($OID) = @_;
532
533 # if an OID wasn't provided claculate one
534 if (!defined $OID) {
535 $OID = "NULL";
536
537 if ($self->{'OIDtype'} eq "incremental") {
538 $OID = "D" . $OIDcount;
539 $OIDcount ++;
540
541 } else {
542 # "hash" OID - feed file to hashfile.exe
543 #my $filename = $self->get_source_filename();
544 # we want to use the converted file for hashing if available
545 # cos its quicker
546 my $filename = $self->get_filename_for_hashing();
547 if (defined($filename) && -e $filename) {
548 $OID = $self->_calc_OID ($filename);
549 } else {
550 $filename = &util::get_tmp_filename();
551 if (!open (OUTFILE, ">$filename")) {
552 print STDERR "doc::set_OID could not write to $filename\n";
553 } else {
554 $self->output_section('OUTFILE', $self->get_top_section(),
555 undef, 1);
556 close (OUTFILE);
557 }
558
559 $OID = $self->_calc_OID ($filename);
560 &util::rm ($filename);
561 }
562 }
563 }
564 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
565}
566
567# this uses hashdoc (embedded c thingy) which is faster but still
568# needs a little work to be suffiently stable
569sub ___set_OID {
570 my $self = shift (@_);
571 my ($OID) = @_;
572
573 # if an OID wasn't provided then calculate hash value based on document
574 if (!defined $OID)
575 {
576 my $hash_text = $self->buffer_section_gml($self->get_top_section(),
577 undef, 1);
578 my $hash_len = length($hash_text);
579
580 $OID = &hashdoc::buffer($hash_text,$hash_len);
581 }
582
583 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
584}
585
586# returns the OID for this document
587sub get_OID {
588 my $self = shift (@_);
589 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
590 return $OID if (defined $OID);
591 return "NULL";
592}
593
594sub delete_OID {
595 my $self = shift (@_);
596
597 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
598}
599
600
601# methods for manipulating section names
602
603# returns the name of the top-most section (the top
604# level of the document
605sub get_top_section {
606 my $self = shift (@_);
607
608 return "";
609}
610
611# returns a section
612sub get_parent_section {
613 my $self = shift (@_);
614 my ($section) = @_;
615
616 $section =~ s/(^|\.)\d+$//;
617
618 return $section;
619}
620
621# returns the first child section (or the end child
622# if there isn't any)
623sub get_begin_child {
624 my $self = shift (@_);
625 my ($section) = @_;
626
627 my $section_ptr = $self->_lookup_section($section);
628 return "" unless defined $section_ptr;
629
630 if (defined $section_ptr->{'subsection_order'}->[0]) {
631 return "$section.$section_ptr->{'subsection_order'}->[0]";
632 }
633
634 return $self->get_end_child ($section);
635}
636
637# returns the next child of a parent section
638sub get_next_child {
639 my $self = shift (@_);
640 my ($section) = @_;
641
642 my $parent_section = $self->get_parent_section($section);
643 my $parent_section_ptr = $self->_lookup_section($parent_section);
644 return undef unless defined $parent_section_ptr;
645
646 my ($section_num) = $section =~ /(\d+)$/;
647 return undef unless defined $section_num;
648
649 my $i = 0;
650 my $section_order = $parent_section_ptr->{'subsection_order'};
651 while ($i < scalar(@$section_order)) {
652 last if $section_order->[$i] eq $section_num;
653 $i++;
654 }
655
656 $i++; # the next child
657 if ($i < scalar(@$section_order)) {
658 return $section_order->[$i] if $parent_section eq "";
659 return "$parent_section.$section_order->[$i]";
660 }
661
662 # no more sections in this level
663 return undef;
664}
665
666# returns a reference to a list of children
667sub get_children {
668 my $self = shift (@_);
669 my ($section) = @_;
670
671 my $section_ptr = $self->_lookup_section($section);
672 return [] unless defined $section_ptr;
673
674 my @children = @{$section_ptr->{'subsection_order'}};
675
676 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
677 return \@children;
678}
679
680# returns the child section one past the last one (which
681# is coded as "0")
682sub get_end_child {
683 my $self = shift (@_);
684 my ($section) = @_;
685
686 return $section . ".0" unless $section eq "";
687 return "0";
688}
689
690# returns the next section in book order
691sub get_next_section {
692 my $self = shift (@_);
693 my ($section) = @_;
694
695 return undef unless defined $section;
696
697 my $section_ptr = $self->_lookup_section($section);
698 return undef unless defined $section_ptr;
699
700 # first try to find first child
701 if (defined $section_ptr->{'subsection_order'}->[0]) {
702 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
703 return "$section.$section_ptr->{'subsection_order'}->[0]";
704 }
705
706 do {
707 # try to find sibling
708 my $next_child = $self->get_next_child ($section);
709 return $next_child if (defined $next_child);
710
711 # move up one level
712 $section = $self->get_parent_section ($section);
713 } while $section =~ /\d/;
714
715 return undef;
716}
717
718sub is_leaf_section {
719 my $self = shift (@_);
720 my ($section) = @_;
721
722 my $section_ptr = $self->_lookup_section($section);
723 return 1 unless defined $section_ptr;
724
725 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
726}
727
728# methods for dealing with sections
729
730# returns the name of the inserted section
731sub insert_section {
732 my $self = shift (@_);
733 my ($before_section) = @_;
734
735 # get the child to insert before and its parent section
736 my $parent_section = "";
737 my $before_child = "0";
738 my @before_section = split (/\./, $before_section);
739 if (scalar(@before_section) > 0) {
740 $before_child = pop (@before_section);
741 $parent_section = join (".", @before_section);
742 }
743
744 my $parent_section_ptr = $self->_lookup_section($parent_section);
745 if (!defined $parent_section_ptr) {
746 print STDERR "doc::insert_section couldn't find parent section " .
747 "$parent_section\n";
748 return;
749 }
750
751 # get the next section number
752 my $section_num = $parent_section_ptr->{'next_subsection'}++;
753
754 my $i = 0;
755 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
756 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
757 $i++;
758 }
759
760 # insert the section number into the order list
761 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
762
763 # add this section to the parent section
764 my $section_ptr = {'subsection_order'=>[],
765 'next_subsection'=>1,
766 'subsections'=>{},
767 'metadata'=>[],
768 'text'=>""};
769 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
770
771 # work out the full section number
772 my $section = $parent_section;
773 $section .= "." unless $section eq "";
774 $section .= $section_num;
775
776 return $section;
777}
778
779# creates a pre-named section
780sub create_named_section {
781 my $self = shift (@_);
782 my ($mastersection) = @_;
783
784 my ($num);
785 my $section = $mastersection;
786 my $sectionref = $self;
787
788 while ($section ne "") {
789 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
790 $num =~ s/^0+(\d)/$1/; # remove leading 0s
791 $section = "" unless defined $section;
792
793 if (defined $num) {
794 if (!defined $sectionref->{'subsections'}->{$num}) {
795 push (@{$sectionref->{'subsection_order'}}, $num);
796 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
797 'next_subsection'=>1,
798 'subsections'=>{},
799 'metadata'=>[],
800 'text'=>""};
801 if ($num >= $sectionref->{'next_subsection'}) {
802 $sectionref->{'next_subsection'} = $num + 1;
803 }
804 }
805 $sectionref = $sectionref->{'subsections'}->{$num};
806
807 } else {
808 print STDERR "doc::create_named_section couldn't create section ";
809 print STDERR "$mastersection\n";
810 last;
811 }
812 }
813}
814
815# returns a reference to a list of subsections
816sub list_subsections {
817 my $self = shift (@_);
818 my ($section) = @_;
819
820 my $section_ptr = $self->_lookup_section ($section);
821 if (!defined $section_ptr) {
822 print STDERR "doc::list_subsections couldn't find section $section\n";
823 return [];
824 }
825
826 return [@{$section_ptr->{'subsection_order'}}];
827}
828
829sub delete_section {
830 my $self = shift (@_);
831 my ($section) = @_;
832
833# my $section_ptr = {'subsection_order'=>[],
834# 'next_subsection'=>1,
835# 'subsections'=>{},
836# 'metadata'=>[],
837# 'text'=>""};
838
839 # if this is the top section reset everything
840 if ($section eq "") {
841 $self->{'subsection_order'} = [];
842 $self->{'subsections'} = {};
843 $self->{'metadata'} = [];
844 $self->{'text'} = "";
845 return;
846 }
847
848 # find the parent of the section to delete
849 my $parent_section = "";
850 my $child = "0";
851 my @section = split (/\./, $section);
852 if (scalar(@section) > 0) {
853 $child = pop (@section);
854 $parent_section = join (".", @section);
855 }
856
857 my $parent_section_ptr = $self->_lookup_section($parent_section);
858 if (!defined $parent_section_ptr) {
859 print STDERR "doc::delete_section couldn't find parent section " .
860 "$parent_section\n";
861 return;
862 }
863
864 # remove this section from the subsection_order list
865 my $i = 0;
866 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
867 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
868 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
869 last;
870 }
871 $i++;
872 }
873
874 # remove this section from the subsection hash
875 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
876 undef $parent_section_ptr->{'subsections'}->{$child};
877 }
878}
879
880#--
881# methods for dealing with metadata
882
883# set_metadata_element and get_metadata_element are for metadata
884# which should only have one value. add_meta_data and get_metadata
885# are for metadata which can have more than one value.
886
887# returns the first metadata value which matches field
888
889# This version of get metadata element works much like the one above,
890# except it allows for the namespace portion of a metadata element to
891# be ignored, thus if you are searching for dc.Title, the first piece
892# of matching metadata ending with the name Title (once any namespace
893# is removed) would be returned.
894# 28-11-2003 John Thompson
895sub get_metadata_element {
896 my $self = shift (@_);
897 my ($section, $field, $ignore_namespace) = @_;
898 my ($data);
899
900 $ignore_namespace = 0 unless defined $ignore_namespace;
901
902 my $section_ptr = $self->_lookup_section($section);
903 if (!defined $section_ptr) {
904 print STDERR "doc::get_metadata_element couldn't find section " .
905 "$section\n";
906 return;
907 }
908
909 # Remove the any namespace if we are being told to ignore them
910 if($ignore_namespace) {
911 $field =~ s/^\w*\.//;
912 }
913
914 foreach $data (@{$section_ptr->{'metadata'}}) {
915
916 my $data_name = $data->[0];
917 # Remove the any namespace if we are being told to ignore them
918 if($ignore_namespace) {
919 $data_name =~ s/^\w*\.//;
920 }
921
922 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
923 }
924
925 return undef; # was not found
926}
927
928# returns a list of the form [value1, value2, ...]
929sub get_metadata {
930 my $self = shift (@_);
931 my ($section, $field, $ignore_namespace) = @_;
932 my ($data);
933
934 $ignore_namespace = 0 unless defined $ignore_namespace;
935
936 my $section_ptr = $self->_lookup_section($section);
937 if (!defined $section_ptr) {
938 print STDERR "doc::get_metadata couldn't find section " .
939 "$section\n";
940 return;
941 }
942
943 # Remove the any namespace if we are being told to ignore them
944 if($ignore_namespace) {
945 $field =~ s/^\w*\.//;
946 }
947
948 my @metadata = ();
949 foreach $data (@{$section_ptr->{'metadata'}}) {
950
951 my $data_name = $data->[0];
952 # Remove the any namespace if we are being told to ignore them
953 if($ignore_namespace) {
954 $data_name =~ s/^\w*\.//;
955 }
956
957 push (@metadata, $data->[1]) if ($data_name eq $field);
958 }
959
960 return \@metadata;
961}
962
963# returns a list of the form [[field,value],[field,value],...]
964sub get_all_metadata {
965 my $self = shift (@_);
966 my ($section) = @_;
967
968 my $section_ptr = $self->_lookup_section($section);
969 if (!defined $section_ptr) {
970 print STDERR "doc::get_all_metadata couldn't find section " .
971 "$section\n";
972 return;
973 }
974
975 return $section_ptr->{'metadata'};
976}
977
978# $value is optional
979sub delete_metadata {
980 my $self = shift (@_);
981 my ($section, $field, $value) = @_;
982
983 my $section_ptr = $self->_lookup_section($section);
984 if (!defined $section_ptr) {
985 print STDERR "doc::delete_metadata couldn't find section " .
986 "$section\n";
987 return;
988 }
989
990 my $i = 0;
991 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
992 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
993 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
994 splice (@{$section_ptr->{'metadata'}}, $i, 1);
995 } else {
996 $i++;
997 }
998 }
999}
1000
1001sub delete_all_metadata {
1002 my $self = shift (@_);
1003 my ($section) = @_;
1004
1005 my $section_ptr = $self->_lookup_section($section);
1006 if (!defined $section_ptr) {
1007 print STDERR "doc::delete_all_metadata couldn't find section " .
1008 "$section\n";
1009 return;
1010 }
1011
1012 $section_ptr->{'metadata'} = [];
1013}
1014
1015sub set_metadata_element {
1016 my $self = shift (@_);
1017 my ($section, $field, $value) = @_;
1018
1019 $self->set_utf8_metadata_element ($section, $field,
1020 &unicode::ascii2utf8(\$value));
1021}
1022
1023# set_utf8_metadata_element assumes the text has already been
1024# converted to the UTF-8 encoding.
1025sub set_utf8_metadata_element {
1026 my $self = shift (@_);
1027 my ($section, $field, $value) = @_;
1028
1029 $self->delete_metadata ($section, $field);
1030 $self->add_utf8_metadata ($section, $field, $value);
1031}
1032
1033
1034# add_metadata assumes the text is in (extended) ascii form. For
1035# text which hash been already converted to the UTF-8 format use
1036# add_utf8_metadata.
1037sub add_metadata {
1038 my $self = shift (@_);
1039 my ($section, $field, $value) = @_;
1040
1041 $self->add_utf8_metadata ($section, $field,
1042 &unicode::ascii2utf8(\$value));
1043}
1044
1045sub add_utf8_metadata {
1046 my $self = shift (@_);
1047 my ($section, $field, $value) = @_;
1048
1049 my $section_ptr = $self->_lookup_section($section);
1050 if (!defined $section_ptr) {
1051 print STDERR "doc::add_utf8_metadata couldn't find section " .
1052 "$section\n";
1053 return;
1054 }
1055 if (!defined $value) {
1056 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1057 return;
1058 }
1059 if (!defined $field) {
1060 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1061 return;
1062 }
1063
1064 # double check that the value is utf-8
1065 if (unicode::ensure_utf8(\$value)) {
1066 print STDERR "doc::add_utf8_metadata: warning: '$field' wasn't utf8\n";
1067 }
1068
1069 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1070}
1071
1072
1073# methods for dealing with text
1074
1075# returns the text for a section
1076sub get_text {
1077 my $self = shift (@_);
1078 my ($section) = @_;
1079
1080 my $section_ptr = $self->_lookup_section($section);
1081 if (!defined $section_ptr) {
1082 print STDERR "doc::get_text couldn't find section " .
1083 "$section\n";
1084 return "";
1085 }
1086
1087 return $section_ptr->{'text'};
1088}
1089
1090# returns the (utf-8 encoded) length of the text for a section
1091sub get_text_length {
1092 my $self = shift (@_);
1093 my ($section) = @_;
1094
1095 my $section_ptr = $self->_lookup_section($section);
1096 if (!defined $section_ptr) {
1097 print STDERR "doc::get_text_length couldn't find section " .
1098 "$section\n";
1099 return 0;
1100 }
1101
1102 return length ($section_ptr->{'text'});
1103}
1104
1105sub delete_text {
1106 my $self = shift (@_);
1107 my ($section) = @_;
1108
1109 my $section_ptr = $self->_lookup_section($section);
1110 if (!defined $section_ptr) {
1111 print STDERR "doc::delete_text couldn't find section " .
1112 "$section\n";
1113 return;
1114 }
1115
1116 $section_ptr->{'text'} = "";
1117}
1118
1119# add_text assumes the text is in (extended) ascii form. For
1120# text which has been already converted to the UTF-8 format
1121# use add_utf8_text.
1122sub add_text {
1123 my $self = shift (@_);
1124 my ($section, $text) = @_;
1125
1126 # convert the text to UTF-8 encoded unicode characters
1127 # and add the text
1128 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1129}
1130
1131
1132# add_utf8_text assumes the text to be added has already
1133# been converted to the UTF-8 encoding. For ascii text use
1134# add_text
1135sub add_utf8_text {
1136 my $self = shift (@_);
1137 my ($section, $text) = @_;
1138
1139 my $section_ptr = $self->_lookup_section($section);
1140 if (!defined $section_ptr) {
1141 print STDERR "doc::add_utf8_text couldn't find section " .
1142 "$section\n";
1143 return;
1144 }
1145
1146 $section_ptr->{'text'} .= $text;
1147}
1148
1149
1150# methods for dealing with associated files
1151
1152# a file is associated with a document, NOT a section.
1153# if section is defined it is noted in the data structure
1154# only so that files associated from a particular section
1155# may be removed later (using delete_section_assoc_files)
1156sub associate_file {
1157 my $self = shift (@_);
1158 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1159 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1160
1161 # remove all associated files with the same name
1162 $self->delete_assoc_file ($assoc_filename);
1163
1164 push (@{$self->{'associated_files'}},
1165 [$real_filename, $assoc_filename, $mime_type, $section]);
1166}
1167
1168# returns a list of associated files in the form
1169# [[real_filename, assoc_filename, mimetype], ...]
1170sub get_assoc_files {
1171 my $self = shift (@_);
1172
1173 return $self->{'associated_files'};
1174}
1175
1176sub delete_section_assoc_files {
1177 my $self = shift (@_);
1178 my ($section) = @_;
1179
1180 my $i=0;
1181 while ($i < scalar (@{$self->{'associated_files'}})) {
1182 if (defined $self->{'associated_files'}->[$i]->[3] &&
1183 $self->{'associated_files'}->[$i]->[3] eq $section) {
1184 splice (@{$self->{'associated_files'}}, $i, 1);
1185 } else {
1186 $i++;
1187 }
1188 }
1189}
1190
1191sub delete_assoc_file {
1192 my $self = shift (@_);
1193 my ($assoc_filename) = @_;
1194
1195 my $i=0;
1196 while ($i < scalar (@{$self->{'associated_files'}})) {
1197 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1198 splice (@{$self->{'associated_files'}}, $i, 1);
1199 } else {
1200 $i++;
1201 }
1202 }
1203}
1204
1205sub reset_nextsection_ptr {
1206 my $self = shift (@_);
1207 my ($section) = @_;
1208
1209 my $section_ptr = $self->_lookup_section($section);
1210 $section_ptr->{'next_subsection'} = 1;
1211}
1212
12131;
Note: See TracBrowser for help on using the repository browser.