source: trunk/gsdl/perllib/doc.pm@ 8510

Last change on this file since 8510 was 8504, checked in by chi, 20 years ago

Modification of METS format in order to be compatible with GS3. Also, add some new methods for the use of DSpace.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 34.9 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use unicode;
37use util;
38use ghtml;
39use File::stat;
40##use hashdoc;
41
42# the document type may be indexed_doc, nonindexed_doc, or
43# classification
44
45my $OIDcount = 0;
46
47sub new {
48 my $class = shift (@_);
49 my ($source_filename, $doc_type) = @_;
50
51 my $self = bless {'associated_files'=>[],
52 'subsection_order'=>[],
53 'next_subsection'=>1,
54 'subsections'=>{},
55 'metadata'=>[],
56 'text'=>"",
57 'OIDtype'=>"hash"}, $class;
58
59 #GRB: Added code to set lastmodified for OAI purposes
60 if ((defined $doc_type) && (defined $source_filename) && (-e $source_filename)) {
61 my $file_stat = stat($source_filename);
62 my $mtime = $file_stat->mtime;
63 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
64 }
65 #GRB: end inserted code
66
67 if (defined $source_filename) {
68 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
69
70 if (defined $collect_dir) {
71 my $dirsep = &util::get_dirsep();
72 if ($collect_dir !~ m/$dirsep$/) {
73 $collect_dir .= $dirsep;
74 }
75
76 $collect_dir =~ s/\\/\\\\/g; # escape DOS style file separator
77
78 if ($source_filename =~ /^$collect_dir(.*)$/) {
79 $source_filename = $1;
80 }
81 }
82
83 $self->set_source_filename ($source_filename);
84 }
85
86 $self->set_doc_type ($doc_type) if defined $doc_type;
87
88 return $self;
89}
90
91# clone the $self object
92sub duplicate {
93 my $self = shift (@_);
94
95 my $newobj = {};
96
97 foreach $k (keys %$self) {
98 $newobj->{$k} = &clone ($self->{$k});
99 }
100
101 bless $newobj, ref($self);
102 return $newobj;
103}
104
105sub clone {
106 my ($from) = @_;
107 my $type = ref ($from);
108
109 if ($type eq "HASH") {
110 my $to = {};
111 foreach $key (keys %$from) {
112 $to->{$key} = &clone ($from->{$key});
113 }
114 return $to;
115 } elsif ($type eq "ARRAY") {
116 my $to = [];
117 foreach $v (@$from) {
118 push (@$to, &clone ($v));
119 }
120 return $to;
121 } else {
122 return $from;
123 }
124}
125
126sub set_OIDtype {
127 my $self = shift (@_);
128 my ($type) = @_;
129
130 if ($type eq "incremental") {
131 $self->{'OIDtype'} = $type;
132 } else {
133 $self->{'OIDtype'} = "hash";
134 }
135}
136
137sub set_source_filename {
138 my $self = shift (@_);
139 my ($source_filename) = @_;
140
141 $self->set_metadata_element ($self->get_top_section(),
142 "gsdlsourcefilename",
143 $source_filename);
144}
145
146sub set_converted_filename {
147 my $self = shift (@_);
148 my ($converted_filename) = @_;
149
150 $self->set_metadata_element ($self->get_top_section(),
151 "gsdlconvertedfilename",
152 $converted_filename);
153}
154
155
156# returns the source_filename as it was provided
157sub get_source_filename {
158 my $self = shift (@_);
159
160 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
161}
162
163# returns converted filename if available else returns source filename
164sub get_filename_for_hashing {
165 my $self = shift (@_);
166
167 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
168
169 if (!defined $filename) {
170 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
171 }
172 return $filename;
173}
174
175sub set_doc_type {
176 my $self = shift (@_);
177 my ($doc_type) = @_;
178
179 $self->set_metadata_element ($self->get_top_section(),
180 "gsdldoctype",
181 $doc_type);
182}
183
184# returns the source_filename as it was provided
185# the default of "indexed_doc" is used if no document
186# type was provided
187sub get_doc_type {
188 my $self = shift (@_);
189
190 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
191 return $doc_type if (defined $doc_type);
192 return "indexed_doc";
193}
194
195sub _escape_text {
196 my ($text) = @_;
197
198 # special characters in the gml encoding
199 $text =~ s/&/&/g; # this has to be first...
200 $text =~ s/</&lt;/g;
201 $text =~ s/>/&gt;/g;
202 $text =~ s/\"/&quot;/g;
203
204 return $text;
205}
206
207sub buffer_section_xml {
208 my $self = shift (@_);
209 my ($section) = @_;
210
211 my $section_ptr = $self->_lookup_section ($section);
212 return "" unless defined $section_ptr;
213
214 my $all_text = "<Section>\n";
215 $all_text .= " <Description>\n";
216
217 # output metadata
218 foreach my $data (@{$section_ptr->{'metadata'}}) {
219 my $escaped_value = &_escape_text($data->[1]);
220 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
221 }
222
223 $all_text .= " </Description>\n";
224
225 # output the text
226 $all_text .= " <Content>";
227 $all_text .= &_escape_text($section_ptr->{'text'});
228 $all_text .= "</Content>\n";
229
230 # output all the subsections
231 foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
232 $all_text .= $self->buffer_section_xml("$section.$subsection");
233 }
234
235 $all_text .= "</Section>\n";
236
237 # make sure no nasty control characters have snuck through
238 # (XML::Parser will barf on anything it doesn't consider to be
239 # valid UTF-8 text, including things like \c@, \cC etc.)
240 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
241
242 return $all_text;
243}
244
245sub buffer_txt_section_xml {
246 my $self = shift(@_);
247 my ($section) = @_;
248
249 my $section_ptr = $self->_lookup_section ($section);
250
251 return "" unless defined $section_ptr;
252
253 my $all_text = "<Section>\n";
254
255 ##output the text
256 #$all_text .= " <Content>";
257 $all_text .= &_escape_text($section_ptr->{'text'});
258 #$all_text .= " </Content>\n";
259
260
261 #output all the subsections
262 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
263 $all_text .= $self->buffer_txt_section_xml("$section.$subsection");
264 }
265
266 $all_text .= "</Section>\n";
267
268
269 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
270 return $all_text;
271}
272
273sub buffer_mets_fileSection_section_xml() {
274 my $self = shift(@_);
275 my ($section, $doc_Dir) = @_;
276
277 $section="" unless defined $section;
278
279
280 my $section_ptr=$self->_lookup_section($section);
281 return "" unless defined $section_ptr;
282
283
284 #**output fileSection by sections
285 my $section_num ="1". $section;
286
287
288 #my $filePath = $doc_Dir . '/doctxt.xml';
289
290 my $filePath = 'doctxt.xml';
291
292 #**output the fileSection details
293 my $all_text = ' <mets:fileGrp ID="FILEGROUP_PRELUDE' . $section_num . '">'. "\n";
294 $all_text .= ' <mets:file MIMETYPE="text/xml" ID="FILE'.$section_num. '">'. "\n";
295 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$filePath.'#xpointer(/Section[';
296
297 my $xpath = "1".$section;
298
299 $xpath =~ s/\./]\/Section[/g;
300
301 $all_text .= $xpath;
302
303 $all_text .= ']/text())" ID="FILE'. $section_num. '" />' . "\n";
304 $all_text .= " </mets:file>\n";
305 $all_text .= " </mets:fileGrp>\n";
306
307
308 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
309 $all_text .= $self->buffer_mets_fileSection_section_xml("$section.$subsection",$doc_Dir);
310 }
311
312 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
313
314 return $all_text;
315}
316
317sub buffer_mets_fileWhole_section_xml(){
318 my $self = shift(@_);
319 my ($section) = @_;
320
321 my $section_ptr = $self-> _lookup_section($section);
322 return "" unless defined $section_ptr;
323
324 my $all_text="" unless defined $all_txt;
325
326 my ($dirPath)="" unless defined $dirPath;
327 my $fileID=0;
328
329 #** output the fileSection for the whole section
330 #*** get the sourcefile and associative file
331
332 foreach my $data (@{$section_ptr->{'metadata'}}){
333 my $escaped_value = &_escape_text($data->[1]);
334 if ($data->[0] eq "gsdlsourcefilename") {
335 ($dirPath) = $escaped_value =~ m/^(.*)[\/\\][^\/\\]*$/;
336
337 $all_text .= ' <mets:fileGrp ID="default">'."\n";
338 ++$fileID;
339 $all_text .= ' <mets:file MIMETYPE="text/xml" ID="default.'.$fileID.'">'. "\n";
340
341 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$data->[1].'" ID="default.'.$fileID.'" />'."\n";
342
343 $all_text .= " </mets:file>\n";
344 }
345
346 if ($data->[0] eq "gsdlassocfile"){
347 $escaped_value =~ m/^(.*?):(.*):$/;
348
349 my $assfilePath = $dirPath . '/'. $1;
350 ++$fileID;
351
352 $all_text .= ' <mets:file MIMETYPE="'.$2.'" ID="default.'.$fileID. '">'. "\n";
353 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$assfilePath.'" ID="default.'. $fileID.'" />'."\n";
354
355 $all_text .= " </mets:file>\n";
356 }
357 }
358 $all_text .= " </mets:fileGrp>\n";
359
360
361 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
362
363 return $all_text;
364}
365
366sub buffer_mets_StruMapSection_section_xml(){
367 my $self = shift(@_);
368 my ($section, $order_numref) = @_;
369
370 $section="" unless defined $section;
371
372
373 my $section_ptr=$self->_lookup_section($section);
374 return "" unless defined $section_ptr;
375
376
377 #***output fileSection by Sections
378 my $section_num ="1". $section;
379 my $dmd_num = $section_num;
380
381 #**output the dmdSection details
382 if ($section_num eq "1") {
383 $dmd_num = "0";
384 }
385
386 #**output the StruMap details
387
388 my $all_text = ' <mets:div ID="DS'. $section_num .'" TYPE="Section" ORDER="'.$$order_numref++.'" ORDERLABEL="'. $section_num .'" LABEL="';
389
390 $all_text .= $section_num . '" DMDID="DM'.$dmd_num.'">'. "\n";
391
392 $all_text .= ' <mets:fptr FILEID="FILEGROUP_PRELUDE'.$section_num.'" />'. "\n";
393
394
395 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
396 $all_text .= $self->buffer_mets_StruMapSection_section_xml("$section.$subsection", $order_numref);
397 }
398
399 $all_text .= " </mets:div>\n";
400
401 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
402
403 return $all_text;
404}
405
406
407sub buffer_mets_StruMapWhole_section_xml(){
408 my $self = shift(@_);
409 my ($section) = @_;
410
411 my $section_ptr = $self-> _lookup_section($section);
412 return "" unless defined $section_ptr;
413
414 my $all_text="" unless defined $all_txt;
415 my $fileID=0;
416
417 $all_text .= '<mets:structMap ID="All" TYPE="Whole Document" LABEL="All">'."\n";
418 $all_text .= ' <mets:div ID="All" TYPE="Document" ORDER="All" ORDERLABEL="All" LABEL="Whole Documemt" DMDID="DM1">' . "\n";
419
420
421 #** output the StruMapSection for the whole section
422 #*** get the sourcefile and associative file
423
424 foreach my $data (@{$section_ptr->{'metadata'}}){
425 my $escaped_value = &_escape_text($data->[1]);
426
427 if ($data->[0] eq "gsdlsourcefilename") {
428 ++$fileID;
429 $all_text .= ' <mets:fptr FILEID="default.'.$fileID.'" />'."\n";
430 }
431
432 if ($data->[0] eq "gsdlassocfile"){
433 ++$fileID;
434 $all_text .= ' <mets:fptr FILEID="default.'.$fileID. '" />'. "\n";
435 }
436 }
437 $all_text .= " </mets:div>\n";
438
439 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
440
441 return $all_text;
442}
443
444
445sub buffer_mets_dmdSection_section_xml(){
446 my $self = shift(@_);
447 my ($section) = @_;
448
449 $section="" unless defined $section;
450
451 my $section_ptr=$self->_lookup_section($section);
452 return "" unless defined $section_ptr;
453
454 #***convert section number
455 my $section_num ="1". $section;
456 my $dmd_num = $section_num;
457
458 #**output the dmdSection details
459 if ($section_num eq "1") {
460 $dmd_num = "0";
461 }
462 my $all_text = '<mets:dmdSec ID="DM'.$dmd_num.'" GROUPID="'.$section_num.'">'. "\n";
463 $all_text .= ' <mets:mdWrap MDType="gsdl3" ID="'.$section_num.'">'."\n";
464 $all_text .= " <mets:xmlData>\n";
465 foreach my $data (@{$section_ptr->{'metadata'}}){
466 my $escaped_value = &_escape_text($data->[1]);
467 $all_text .= ' <gsdl3:Metadata name="'. $data->[0].'">'. $escaped_value. "</gsdl3:Metadata>\n";
468 if ($data->[0] eq "dc.Title") {
469 $all_text .= ' <gsdl3:Metadata name="Title">'. $escaped_value."</gsdl3:Metadata>\n";
470 }
471 }
472
473 $all_text .= " </mets:xmlData>\n";
474 $all_text .= " </mets:mdWrap>\n";
475 $all_text .= "</mets:dmdSec>\n";
476
477 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
478 $all_text .= $self->buffer_mets_dmdSection_section_xml("$section.$subsection");
479 }
480
481 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
482
483 return $all_text;
484}
485
486sub output_section {
487 my $self = shift (@_);
488 my ($handle, $section) = @_;
489
490 print $handle $self->buffer_section_xml($section);
491}
492
493#*** print out DSpace dublin_core metadata section
494sub output_dspace_section {
495 my $self = shift (@_);
496 my ($handle, $section) = @_;
497
498 my $section_ptr = $self->_lookup_section ($section);
499 return "" unless defined $section_ptr;
500
501 my $all_text = "<Section>\n";
502 $all_text .= " <Description>\n";
503
504 # output metadata
505 foreach my $data (@{$section_ptr->{'metadata'}}) {
506 my $escaped_value = &_escape_text($data->[1]);
507 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
508 }
509
510 $all_text .= " </Description>\n";
511 $all_text .= "</Section>\n";
512
513 # make sure no nasty control characters have snuck through
514 # (XML::Parser will barf on anything it doesn't consider to be
515 # valid UTF-8 text, including things like \c@, \cC etc.)
516 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
517
518 return $all_text;
519}
520
521#*** print out doctxt.xml file
522sub output_txt_section {
523 my $self = shift (@_);
524 my ($handle, $section) = @_;
525
526 print $handle $self->buffer_txt_section_xml($section);
527}
528
529#*** print out docmets.xml file
530sub output_mets_section {
531 my $self = shift(@_);
532 my ($handle, $section, $doc_Dir) = @_;
533
534 #***print out the dmdSection
535 print $handle $self->buffer_mets_dmdSection_section_xml($section);
536
537 #***print out the fileSection by sections
538 print $handle "<mets:fileSec>\n";
539 print $handle $self->buffer_mets_fileSection_section_xml($section,$doc_Dir);
540
541 #***print out the whole fileSection
542 print $handle $self->buffer_mets_fileWhole_section_xml($section);
543 print $handle "</mets:fileSec>\n";
544
545 #***print out the StruMapSection by sections
546 print $handle '<mets:structMap ID="Section" TYPE="Section" LABEL="Section">' . "\n";
547 my $order_num=0;
548 print $handle $self->buffer_mets_StruMapSection_section_xml($section, \$order_num);
549 print $handle "</mets:structMap>\n";
550 print $handle $self->buffer_mets_StruMapWhole_section_xml($section);
551 print $handle "</mets:structMap>\n";
552}
553
554#*** print out dublin_core.xml file
555sub output_dc_section {
556 my $self = shift(@_);
557 my ($handle, $section, $doc_Dir) = @_;
558
559 #***print out the dublin_core
560 $section="" unless defined $section;
561
562 my $section_ptr=$self->_lookup_section($section);
563 return "" unless defined $section_ptr;
564 my $all_text="";
565 foreach my $data (@{$section_ptr->{'metadata'}}){
566 my $escaped_value = &_escape_text($data->[1]);
567 if ($data->[0]=~ /^dc/) {
568 #$all_text .= ' <dcvalue element="'. $data->[0].'" qualifier="#####">'. $escaped_value. "</dcvalue>\n";
569 $all_text .= ' <dcvalue element="'. $data->[0].'">'. $escaped_value. "</dcvalue>\n";
570 }
571 }
572 if ($all_text eq "") {
573 $all_text .= " There is no Dublin Core metatdata in this document\n";
574 }
575 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
576
577 print $handle $all_text;
578}
579
580# look up the reference to the a particular section
581sub _lookup_section {
582 my $self = shift (@_);
583 my ($section) = @_;
584
585 my ($num);
586 my $sectionref = $self;
587
588 while (defined $section && $section ne "") {
589 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
590 $num =~ s/^0+(\d)/$1/; # remove leading 0s
591 $section = "" unless defined $section;
592
593 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
594 $sectionref = $sectionref->{'subsections'}->{$num};
595 } else {
596 return undef;
597 }
598 }
599
600 return $sectionref;
601}
602
603# calculate OID by hashing the contents of the document
604sub _calc_OID {
605 my $self = shift (@_);
606 my ($filename) = @_;
607
608 my $osexe = &util::get_os_exe();
609
610 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
611 $ENV{'GSDLOS'},"hashfile$osexe");
612
613 my $result = "NULL";
614
615 if (-e "$hashfile_exe") {
616# $result = `\"$hashfile_exe\" \"$filename\"`;
617 $result = `hashfile$osexe \"$filename\"`;
618 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
619
620 } else {
621 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
622 }
623 return "HASH$result";
624}
625
626# methods dealing with OID, not groups of them.
627
628# if $OID is not provided one is calculated
629sub set_OID {
630 my $self = shift (@_);
631 my ($OID) = @_;
632
633 # if an OID wasn't provided claculate one
634 if (!defined $OID) {
635 $OID = "NULL";
636
637 if ($self->{'OIDtype'} eq "incremental") {
638 $OID = "D" . $OIDcount;
639 $OIDcount ++;
640
641 } else {
642 # "hash" OID - feed file to hashfile.exe
643 #my $filename = $self->get_source_filename();
644 # we want to use the converted file for hashing if available
645 # cos its quicker
646 my $filename = $self->get_filename_for_hashing();
647
648 if (defined($filename) && -e $filename) {
649 $OID = $self->_calc_OID ($filename);
650 } else {
651 $filename = &util::get_tmp_filename();
652 if (!open (OUTFILE, ">$filename")) {
653 print STDERR "doc::set_OID could not write to $filename\n";
654 } else {
655 $self->output_section('OUTFILE', $self->get_top_section(),
656 undef, 1);
657 close (OUTFILE);
658 }
659 $OID = $self->_calc_OID ($filename);
660 &util::rm ($filename);
661 }
662 }
663 }
664 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
665}
666
667# this uses hashdoc (embedded c thingy) which is faster but still
668# needs a little work to be suffiently stable
669sub ___set_OID {
670 my $self = shift (@_);
671 my ($OID) = @_;
672
673 # if an OID wasn't provided then calculate hash value based on document
674 if (!defined $OID)
675 {
676 my $hash_text = $self->buffer_section_gml($self->get_top_section(),
677 undef, 1);
678 my $hash_len = length($hash_text);
679
680 $OID = &hashdoc::buffer($hash_text,$hash_len);
681 }
682
683 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
684}
685
686# returns the OID for this document
687sub get_OID {
688 my $self = shift (@_);
689 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
690 return $OID if (defined $OID);
691 return "NULL";
692}
693
694sub delete_OID {
695 my $self = shift (@_);
696
697 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
698}
699
700
701# methods for manipulating section names
702
703# returns the name of the top-most section (the top
704# level of the document
705sub get_top_section {
706 my $self = shift (@_);
707
708 return "";
709}
710
711# returns a section
712sub get_parent_section {
713 my $self = shift (@_);
714 my ($section) = @_;
715
716 $section =~ s/(^|\.)\d+$//;
717
718 return $section;
719}
720
721# returns the first child section (or the end child
722# if there isn't any)
723sub get_begin_child {
724 my $self = shift (@_);
725 my ($section) = @_;
726
727 my $section_ptr = $self->_lookup_section($section);
728 return "" unless defined $section_ptr;
729
730 if (defined $section_ptr->{'subsection_order'}->[0]) {
731 return "$section.$section_ptr->{'subsection_order'}->[0]";
732 }
733
734 return $self->get_end_child ($section);
735}
736
737# returns the next child of a parent section
738sub get_next_child {
739 my $self = shift (@_);
740 my ($section) = @_;
741
742 my $parent_section = $self->get_parent_section($section);
743 my $parent_section_ptr = $self->_lookup_section($parent_section);
744 return undef unless defined $parent_section_ptr;
745
746 my ($section_num) = $section =~ /(\d+)$/;
747 return undef unless defined $section_num;
748
749 my $i = 0;
750 my $section_order = $parent_section_ptr->{'subsection_order'};
751 while ($i < scalar(@$section_order)) {
752 last if $section_order->[$i] eq $section_num;
753 $i++;
754 }
755
756 $i++; # the next child
757 if ($i < scalar(@$section_order)) {
758 return $section_order->[$i] if $parent_section eq "";
759 return "$parent_section.$section_order->[$i]";
760 }
761
762 # no more sections in this level
763 return undef;
764}
765
766# returns a reference to a list of children
767sub get_children {
768 my $self = shift (@_);
769 my ($section) = @_;
770
771 my $section_ptr = $self->_lookup_section($section);
772 return [] unless defined $section_ptr;
773
774 my @children = @{$section_ptr->{'subsection_order'}};
775
776 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
777 return \@children;
778}
779
780# returns the child section one past the last one (which
781# is coded as "0")
782sub get_end_child {
783 my $self = shift (@_);
784 my ($section) = @_;
785
786 return $section . ".0" unless $section eq "";
787 return "0";
788}
789
790# returns the next section in book order
791sub get_next_section {
792 my $self = shift (@_);
793 my ($section) = @_;
794
795 return undef unless defined $section;
796
797 my $section_ptr = $self->_lookup_section($section);
798 return undef unless defined $section_ptr;
799
800 # first try to find first child
801 if (defined $section_ptr->{'subsection_order'}->[0]) {
802 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
803 return "$section.$section_ptr->{'subsection_order'}->[0]";
804 }
805
806 do {
807 # try to find sibling
808 my $next_child = $self->get_next_child ($section);
809 return $next_child if (defined $next_child);
810
811 # move up one level
812 $section = $self->get_parent_section ($section);
813 } while $section =~ /\d/;
814
815 return undef;
816}
817
818sub is_leaf_section {
819 my $self = shift (@_);
820 my ($section) = @_;
821
822 my $section_ptr = $self->_lookup_section($section);
823 return 1 unless defined $section_ptr;
824
825 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
826}
827
828# methods for dealing with sections
829
830# returns the name of the inserted section
831sub insert_section {
832 my $self = shift (@_);
833 my ($before_section) = @_;
834
835 # get the child to insert before and its parent section
836 my $parent_section = "";
837 my $before_child = "0";
838 my @before_section = split (/\./, $before_section);
839 if (scalar(@before_section) > 0) {
840 $before_child = pop (@before_section);
841 $parent_section = join (".", @before_section);
842 }
843
844 my $parent_section_ptr = $self->_lookup_section($parent_section);
845 if (!defined $parent_section_ptr) {
846 print STDERR "doc::insert_section couldn't find parent section " .
847 "$parent_section\n";
848 return;
849 }
850
851 # get the next section number
852 my $section_num = $parent_section_ptr->{'next_subsection'}++;
853
854 my $i = 0;
855 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
856 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
857 $i++;
858 }
859
860 # insert the section number into the order list
861 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
862
863 # add this section to the parent section
864 my $section_ptr = {'subsection_order'=>[],
865 'next_subsection'=>1,
866 'subsections'=>{},
867 'metadata'=>[],
868 'text'=>""};
869 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
870
871 # work out the full section number
872 my $section = $parent_section;
873 $section .= "." unless $section eq "";
874 $section .= $section_num;
875
876 return $section;
877}
878
879# creates a pre-named section
880sub create_named_section {
881 my $self = shift (@_);
882 my ($mastersection) = @_;
883
884 my ($num);
885 my $section = $mastersection;
886 my $sectionref = $self;
887
888 while ($section ne "") {
889 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
890 $num =~ s/^0+(\d)/$1/; # remove leading 0s
891 $section = "" unless defined $section;
892
893 if (defined $num) {
894 if (!defined $sectionref->{'subsections'}->{$num}) {
895 push (@{$sectionref->{'subsection_order'}}, $num);
896 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
897 'next_subsection'=>1,
898 'subsections'=>{},
899 'metadata'=>[],
900 'text'=>""};
901 if ($num >= $sectionref->{'next_subsection'}) {
902 $sectionref->{'next_subsection'} = $num + 1;
903 }
904 }
905 $sectionref = $sectionref->{'subsections'}->{$num};
906
907 } else {
908 print STDERR "doc::create_named_section couldn't create section ";
909 print STDERR "$mastersection\n";
910 last;
911 }
912 }
913}
914
915# returns a reference to a list of subsections
916sub list_subsections {
917 my $self = shift (@_);
918 my ($section) = @_;
919
920 my $section_ptr = $self->_lookup_section ($section);
921 if (!defined $section_ptr) {
922 print STDERR "doc::list_subsections couldn't find section $section\n";
923 return [];
924 }
925
926 return [@{$section_ptr->{'subsection_order'}}];
927}
928
929sub delete_section {
930 my $self = shift (@_);
931 my ($section) = @_;
932
933# my $section_ptr = {'subsection_order'=>[],
934# 'next_subsection'=>1,
935# 'subsections'=>{},
936# 'metadata'=>[],
937# 'text'=>""};
938
939 # if this is the top section reset everything
940 if ($section eq "") {
941 $self->{'subsection_order'} = [];
942 $self->{'subsections'} = {};
943 $self->{'metadata'} = [];
944 $self->{'text'} = "";
945 return;
946 }
947
948 # find the parent of the section to delete
949 my $parent_section = "";
950 my $child = "0";
951 my @section = split (/\./, $section);
952 if (scalar(@section) > 0) {
953 $child = pop (@section);
954 $parent_section = join (".", @section);
955 }
956
957 my $parent_section_ptr = $self->_lookup_section($parent_section);
958 if (!defined $parent_section_ptr) {
959 print STDERR "doc::delete_section couldn't find parent section " .
960 "$parent_section\n";
961 return;
962 }
963
964 # remove this section from the subsection_order list
965 my $i = 0;
966 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
967 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
968 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
969 last;
970 }
971 $i++;
972 }
973
974 # remove this section from the subsection hash
975 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
976 undef $parent_section_ptr->{'subsections'}->{$child};
977 }
978}
979
980#--
981# methods for dealing with metadata
982
983# set_metadata_element and get_metadata_element are for metadata
984# which should only have one value. add_meta_data and get_metadata
985# are for metadata which can have more than one value.
986
987# returns the first metadata value which matches field
988
989# This version of get metadata element works much like the one above,
990# except it allows for the namespace portion of a metadata element to
991# be ignored, thus if you are searching for dc.Title, the first piece
992# of matching metadata ending with the name Title (once any namespace
993# is removed) would be returned.
994# 28-11-2003 John Thompson
995sub get_metadata_element {
996 my $self = shift (@_);
997 my ($section, $field, $ignore_namespace) = @_;
998 my ($data);
999
1000 $ignore_namespace = 0 unless defined $ignore_namespace;
1001
1002 my $section_ptr = $self->_lookup_section($section);
1003 if (!defined $section_ptr) {
1004 print STDERR "doc::get_metadata_element couldn't find section " .
1005 "$section\n";
1006 return;
1007 }
1008
1009 # Remove the any namespace if we are being told to ignore them
1010 if($ignore_namespace) {
1011 $field =~ s/^\w*\.//;
1012 }
1013
1014 foreach $data (@{$section_ptr->{'metadata'}}) {
1015
1016 my $data_name = $data->[0];
1017 # Remove the any namespace if we are being told to ignore them
1018 if($ignore_namespace) {
1019 $data_name =~ s/^\w*\.//;
1020 }
1021
1022 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
1023 }
1024
1025 return undef; # was not found
1026}
1027
1028# returns a list of the form [value1, value2, ...]
1029sub get_metadata {
1030 my $self = shift (@_);
1031 my ($section, $field, $ignore_namespace) = @_;
1032 my ($data);
1033
1034 $ignore_namespace = 0 unless defined $ignore_namespace;
1035
1036 my $section_ptr = $self->_lookup_section($section);
1037 if (!defined $section_ptr) {
1038 print STDERR "doc::get_metadata couldn't find section " .
1039 "$section\n";
1040 return;
1041 }
1042
1043 # Remove the any namespace if we are being told to ignore them
1044 if($ignore_namespace) {
1045 $field =~ s/^\w*\.//;
1046 }
1047
1048 my @metadata = ();
1049 foreach $data (@{$section_ptr->{'metadata'}}) {
1050
1051 my $data_name = $data->[0];
1052 # Remove the any namespace if we are being told to ignore them
1053 if($ignore_namespace) {
1054 $data_name =~ s/^\w*\.//;
1055 }
1056
1057 push (@metadata, $data->[1]) if ($data_name eq $field);
1058 }
1059
1060 return \@metadata;
1061}
1062
1063# returns a list of the form [[field,value],[field,value],...]
1064sub get_all_metadata {
1065 my $self = shift (@_);
1066 my ($section) = @_;
1067
1068 my $section_ptr = $self->_lookup_section($section);
1069 if (!defined $section_ptr) {
1070 print STDERR "doc::get_all_metadata couldn't find section " .
1071 "$section\n";
1072 return;
1073 }
1074
1075 return $section_ptr->{'metadata'};
1076}
1077
1078# $value is optional
1079sub delete_metadata {
1080 my $self = shift (@_);
1081 my ($section, $field, $value) = @_;
1082
1083 my $section_ptr = $self->_lookup_section($section);
1084 if (!defined $section_ptr) {
1085 print STDERR "doc::delete_metadata couldn't find section " .
1086 "$section\n";
1087 return;
1088 }
1089
1090 my $i = 0;
1091 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1092 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1093 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1094 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1095 } else {
1096 $i++;
1097 }
1098 }
1099}
1100
1101sub delete_all_metadata {
1102 my $self = shift (@_);
1103 my ($section) = @_;
1104
1105 my $section_ptr = $self->_lookup_section($section);
1106 if (!defined $section_ptr) {
1107 print STDERR "doc::delete_all_metadata couldn't find section " .
1108 "$section\n";
1109 return;
1110 }
1111
1112 $section_ptr->{'metadata'} = [];
1113}
1114
1115sub set_metadata_element {
1116 my $self = shift (@_);
1117 my ($section, $field, $value) = @_;
1118
1119 $self->set_utf8_metadata_element ($section, $field,
1120 &unicode::ascii2utf8(\$value));
1121}
1122
1123# set_utf8_metadata_element assumes the text has already been
1124# converted to the UTF-8 encoding.
1125sub set_utf8_metadata_element {
1126 my $self = shift (@_);
1127 my ($section, $field, $value) = @_;
1128
1129 $self->delete_metadata ($section, $field);
1130 $self->add_utf8_metadata ($section, $field, $value);
1131}
1132
1133
1134# add_metadata assumes the text is in (extended) ascii form. For
1135# text which has already been converted to the UTF-8 format use
1136# add_utf8_metadata.
1137sub add_metadata {
1138 my $self = shift (@_);
1139 my ($section, $field, $value) = @_;
1140
1141 $self->add_utf8_metadata ($section, $field,
1142 &unicode::ascii2utf8(\$value));
1143}
1144
1145sub add_utf8_metadata {
1146 my $self = shift (@_);
1147 my ($section, $field, $value) = @_;
1148
1149 my $section_ptr = $self->_lookup_section($section);
1150 if (!defined $section_ptr) {
1151 print STDERR "doc::add_utf8_metadata couldn't find section " .
1152 "$section\n";
1153 return;
1154 }
1155 if (!defined $value) {
1156 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1157 return;
1158 }
1159 if (!defined $field) {
1160 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1161 return;
1162 }
1163
1164 # double check that the value is utf-8
1165 if (unicode::ensure_utf8(\$value)) {
1166 print STDERR "doc::add_utf8_metadata: warning: '$field' wasn't utf8\n";
1167 }
1168
1169 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1170}
1171
1172
1173# methods for dealing with text
1174
1175# returns the text for a section
1176sub get_text {
1177 my $self = shift (@_);
1178 my ($section) = @_;
1179
1180 my $section_ptr = $self->_lookup_section($section);
1181 if (!defined $section_ptr) {
1182 print STDERR "doc::get_text couldn't find section " .
1183 "$section\n";
1184 return "";
1185 }
1186
1187 return $section_ptr->{'text'};
1188}
1189
1190# returns the (utf-8 encoded) length of the text for a section
1191sub get_text_length {
1192 my $self = shift (@_);
1193 my ($section) = @_;
1194
1195 my $section_ptr = $self->_lookup_section($section);
1196 if (!defined $section_ptr) {
1197 print STDERR "doc::get_text_length couldn't find section " .
1198 "$section\n";
1199 return 0;
1200 }
1201
1202 return length ($section_ptr->{'text'});
1203}
1204
1205sub delete_text {
1206 my $self = shift (@_);
1207 my ($section) = @_;
1208
1209 my $section_ptr = $self->_lookup_section($section);
1210 if (!defined $section_ptr) {
1211 print STDERR "doc::delete_text couldn't find section " .
1212 "$section\n";
1213 return;
1214 }
1215
1216 $section_ptr->{'text'} = "";
1217}
1218
1219# add_text assumes the text is in (extended) ascii form. For
1220# text which has been already converted to the UTF-8 format
1221# use add_utf8_text.
1222sub add_text {
1223 my $self = shift (@_);
1224 my ($section, $text) = @_;
1225
1226 # convert the text to UTF-8 encoded unicode characters
1227 # and add the text
1228 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1229}
1230
1231
1232# add_utf8_text assumes the text to be added has already
1233# been converted to the UTF-8 encoding. For ascii text use
1234# add_text
1235sub add_utf8_text {
1236 my $self = shift (@_);
1237 my ($section, $text) = @_;
1238
1239 my $section_ptr = $self->_lookup_section($section);
1240 if (!defined $section_ptr) {
1241 print STDERR "doc::add_utf8_text couldn't find section " .
1242 "$section\n";
1243 return;
1244 }
1245
1246 $section_ptr->{'text'} .= $text;
1247}
1248
1249
1250# methods for dealing with associated files
1251
1252# a file is associated with a document, NOT a section.
1253# if section is defined it is noted in the data structure
1254# only so that files associated from a particular section
1255# may be removed later (using delete_section_assoc_files)
1256sub associate_file {
1257 my $self = shift (@_);
1258 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1259 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1260
1261 # remove all associated files with the same name
1262 $self->delete_assoc_file ($assoc_filename);
1263
1264 push (@{$self->{'associated_files'}},
1265 [$real_filename, $assoc_filename, $mime_type, $section]);
1266}
1267
1268# returns a list of associated files in the form
1269# [[real_filename, assoc_filename, mimetype], ...]
1270sub get_assoc_files {
1271 my $self = shift (@_);
1272
1273 return $self->{'associated_files'};
1274}
1275
1276sub delete_section_assoc_files {
1277 my $self = shift (@_);
1278 my ($section) = @_;
1279
1280 my $i=0;
1281 while ($i < scalar (@{$self->{'associated_files'}})) {
1282 if (defined $self->{'associated_files'}->[$i]->[3] &&
1283 $self->{'associated_files'}->[$i]->[3] eq $section) {
1284 splice (@{$self->{'associated_files'}}, $i, 1);
1285 } else {
1286 $i++;
1287 }
1288 }
1289}
1290
1291sub delete_assoc_file {
1292 my $self = shift (@_);
1293 my ($assoc_filename) = @_;
1294
1295 my $i=0;
1296 while ($i < scalar (@{$self->{'associated_files'}})) {
1297 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1298 splice (@{$self->{'associated_files'}}, $i, 1);
1299 } else {
1300 $i++;
1301 }
1302 }
1303}
1304
1305sub reset_nextsection_ptr {
1306 my $self = shift (@_);
1307 my ($section) = @_;
1308
1309 my $section_ptr = $self->_lookup_section($section);
1310 $section_ptr->{'next_subsection'} = 1;
1311}
1312
13131;
Note: See TracBrowser for help on using the repository browser.