source: trunk/gsdl/perllib/doc.pm@ 8794

Last change on this file since 8794 was 8716, checked in by kjdon, 20 years ago

added some changes made by Emanuel Dejanu (Simple Words)

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 35.4 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use unicode;
37use util;
38use ghtml;
39use File::stat;
40##use hashdoc;
41
42# the document type may be indexed_doc, nonindexed_doc, or
43# classification
44
45my $OIDcount = 0;
46
47sub new {
48 my $class = shift (@_);
49 my ($source_filename, $doc_type) = @_;
50
51 my $self = bless {'associated_files'=>[],
52 'subsection_order'=>[],
53 'next_subsection'=>1,
54 'subsections'=>{},
55 'metadata'=>[],
56 'text'=>"",
57 'OIDtype'=>"hash"}, $class;
58
59 #GRB: Added code to set lastmodified for OAI purposes
60 if ((defined $doc_type) && (defined $source_filename) && (-e $source_filename)) {
61 my $file_stat = stat($source_filename);
62 my $mtime = $file_stat->mtime;
63 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
64 }
65 #GRB: end inserted code
66
67 if (defined $source_filename) {
68 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
69
70 if (defined $collect_dir) {
71 my $dirsep = &util::get_dirsep();
72 if ($collect_dir !~ m/$dirsep$/) {
73 $collect_dir .= $dirsep;
74 }
75
76 $collect_dir =~ s/\\/\\\\/g; # escape DOS style file separator
77
78 if ($source_filename =~ /^$collect_dir(.*)$/) {
79 $source_filename = $1;
80 }
81 }
82
83 $self->set_source_filename ($source_filename);
84 }
85
86 $self->set_doc_type ($doc_type) if defined $doc_type;
87
88 return $self;
89}
90
91# clone the $self object
92sub duplicate {
93 my $self = shift (@_);
94
95 my $newobj = {};
96
97 foreach my $k (keys %$self) {
98 $newobj->{$k} = &clone ($self->{$k});
99 }
100
101 bless $newobj, ref($self);
102 return $newobj;
103}
104
105sub clone {
106 my ($from) = @_;
107 my $type = ref ($from);
108
109 if ($type eq "HASH") {
110 my $to = {};
111 foreach my $key (keys %$from) {
112 $to->{$key} = &clone ($from->{$key});
113 }
114 return $to;
115 } elsif ($type eq "ARRAY") {
116 my $to = [];
117 foreach my $v (@$from) {
118 push (@$to, &clone ($v));
119 }
120 return $to;
121 } else {
122 return $from;
123 }
124}
125
126sub set_OIDtype {
127 my $self = shift (@_);
128 my ($type) = @_;
129
130 if ($type =~ /^(hash|incremental|dirname)$/) {
131 $self->{'OIDtype'} = $type;
132 } else {
133 $self->{'OIDtype'} = "hash";
134 }
135}
136
137sub set_source_filename {
138 my $self = shift (@_);
139 my ($source_filename) = @_;
140
141 $self->set_metadata_element ($self->get_top_section(),
142 "gsdlsourcefilename",
143 $source_filename);
144}
145
146sub set_converted_filename {
147 my $self = shift (@_);
148 my ($converted_filename) = @_;
149
150 $self->set_metadata_element ($self->get_top_section(),
151 "gsdlconvertedfilename",
152 $converted_filename);
153}
154
155
156# returns the source_filename as it was provided
157sub get_source_filename {
158 my $self = shift (@_);
159
160 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
161}
162
163# returns converted filename if available else returns source filename
164sub get_filename_for_hashing {
165 my $self = shift (@_);
166
167 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
168
169 if (!defined $filename) {
170 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
171 }
172 return $filename;
173}
174
175sub set_doc_type {
176 my $self = shift (@_);
177 my ($doc_type) = @_;
178
179 $self->set_metadata_element ($self->get_top_section(),
180 "gsdldoctype",
181 $doc_type);
182}
183
184# returns the source_filename as it was provided
185# the default of "indexed_doc" is used if no document
186# type was provided
187sub get_doc_type {
188 my $self = shift (@_);
189
190 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
191 return $doc_type if (defined $doc_type);
192 return "indexed_doc";
193}
194
195sub _escape_text {
196 my ($text) = @_;
197
198 # special characters in the gml encoding
199 $text =~ s/&/&/g; # this has to be first...
200 $text =~ s/</&lt;/g;
201 $text =~ s/>/&gt;/g;
202 $text =~ s/\"/&quot;/g;
203
204 return $text;
205}
206
207sub buffer_section_xml {
208 my $self = shift (@_);
209 my ($section) = @_;
210
211 my $section_ptr = $self->_lookup_section ($section);
212 return "" unless defined $section_ptr;
213
214 my $all_text = "<Section>\n";
215 $all_text .= " <Description>\n";
216
217 # output metadata
218 foreach my $data (@{$section_ptr->{'metadata'}}) {
219 my $escaped_value = &_escape_text($data->[1]);
220 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
221 }
222
223 $all_text .= " </Description>\n";
224
225 # output the text
226 $all_text .= " <Content>";
227 $all_text .= &_escape_text($section_ptr->{'text'});
228 $all_text .= "</Content>\n";
229
230 # output all the subsections
231 foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
232 $all_text .= $self->buffer_section_xml("$section.$subsection");
233 }
234
235 $all_text .= "</Section>\n";
236
237 # make sure no nasty control characters have snuck through
238 # (XML::Parser will barf on anything it doesn't consider to be
239 # valid UTF-8 text, including things like \c@, \cC etc.)
240 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
241
242 return $all_text;
243}
244
245sub buffer_txt_section_xml {
246 my $self = shift(@_);
247 my ($section) = @_;
248
249 my $section_ptr = $self->_lookup_section ($section);
250
251 return "" unless defined $section_ptr;
252
253 my $all_text = "<Section>\n";
254
255 ##output the text
256 #$all_text .= " <Content>";
257 $all_text .= &_escape_text($section_ptr->{'text'});
258 #$all_text .= " </Content>\n";
259
260
261 #output all the subsections
262 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
263 $all_text .= $self->buffer_txt_section_xml("$section.$subsection");
264 }
265
266 $all_text .= "</Section>\n";
267
268
269 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
270 return $all_text;
271}
272
273sub buffer_mets_fileSection_section_xml() {
274 my $self = shift(@_);
275 my ($section, $doc_Dir) = @_;
276
277 $section="" unless defined $section;
278
279
280 my $section_ptr=$self->_lookup_section($section);
281 return "" unless defined $section_ptr;
282
283
284 #**output fileSection by sections
285 my $section_num ="1". $section;
286
287
288 #my $filePath = $doc_Dir . '/doctxt.xml';
289
290 my $filePath = 'doctxt.xml';
291
292 #**output the fileSection details
293 my $all_text = ' <mets:fileGrp ID="FILEGROUP_PRELUDE' . $section_num . '">'. "\n";
294 $all_text .= ' <mets:file MIMETYPE="text/xml" ID="FILE'.$section_num. '">'. "\n";
295 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$filePath.'#xpointer(/Section[';
296
297 my $xpath = "1".$section;
298
299 $xpath =~ s/\./]\/Section[/g;
300
301 $all_text .= $xpath;
302
303 $all_text .= ']/text())" ID="FILE'. $section_num. '" />' . "\n";
304 $all_text .= " </mets:file>\n";
305 $all_text .= " </mets:fileGrp>\n";
306
307
308 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
309 $all_text .= $self->buffer_mets_fileSection_section_xml("$section.$subsection",$doc_Dir);
310 }
311
312 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
313
314 return $all_text;
315}
316
317sub buffer_mets_fileWhole_section_xml(){
318 my $self = shift(@_);
319 my ($section) = @_;
320
321 my $section_ptr = $self-> _lookup_section($section);
322 return "" unless defined $section_ptr;
323
324 my $all_text="" unless defined $all_txt;
325
326 my ($dirPath)="" unless defined $dirPath;
327 my $fileID=0;
328
329 #** output the fileSection for the whole section
330 #*** get the sourcefile and associative file
331
332 foreach my $data (@{$section_ptr->{'metadata'}}){
333 my $escaped_value = &_escape_text($data->[1]);
334 if ($data->[0] eq "gsdlsourcefilename") {
335 ($dirPath) = $escaped_value =~ m/^(.*)[\/\\][^\/\\]*$/;
336
337 $all_text .= ' <mets:fileGrp ID="default">'."\n";
338 ++$fileID;
339 $all_text .= ' <mets:file MIMETYPE="text/xml" ID="default.'.$fileID.'">'. "\n";
340
341 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$data->[1].'" ID="default.'.$fileID.'" />'."\n";
342
343 $all_text .= " </mets:file>\n";
344 }
345
346 if ($data->[0] eq "gsdlassocfile"){
347 $escaped_value =~ m/^(.*?):(.*):$/;
348
349 my $assfilePath = $dirPath . '/'. $1;
350 ++$fileID;
351
352 $all_text .= ' <mets:file MIMETYPE="'.$2.'" ID="default.'.$fileID. '">'. "\n";
353 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$assfilePath.'" ID="default.'. $fileID.'" />'."\n";
354
355 $all_text .= " </mets:file>\n";
356 }
357 }
358 $all_text .= " </mets:fileGrp>\n";
359
360
361 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
362
363 return $all_text;
364}
365
366sub buffer_mets_StruMapSection_section_xml(){
367 my $self = shift(@_);
368 my ($section, $order_numref) = @_;
369
370 $section="" unless defined $section;
371
372
373 my $section_ptr=$self->_lookup_section($section);
374 return "" unless defined $section_ptr;
375
376
377 #***output fileSection by Sections
378 my $section_num ="1". $section;
379 my $dmd_num = $section_num;
380
381 #**output the dmdSection details
382 if ($section_num eq "1") {
383 $dmd_num = "0";
384 }
385
386 #**output the StruMap details
387
388 my $all_text = ' <mets:div ID="DS'. $section_num .'" TYPE="Section" ORDER="'.$$order_numref++.'" ORDERLABEL="'. $section_num .'" LABEL="';
389
390 $all_text .= $section_num . '" DMDID="DM'.$dmd_num.'">'. "\n";
391
392 $all_text .= ' <mets:fptr FILEID="FILEGROUP_PRELUDE'.$section_num.'" />'. "\n";
393
394
395 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
396 $all_text .= $self->buffer_mets_StruMapSection_section_xml("$section.$subsection", $order_numref);
397 }
398
399 $all_text .= " </mets:div>\n";
400
401 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
402
403 return $all_text;
404}
405
406
407sub buffer_mets_StruMapWhole_section_xml(){
408 my $self = shift(@_);
409 my ($section) = @_;
410
411 my $section_ptr = $self-> _lookup_section($section);
412 return "" unless defined $section_ptr;
413
414 my $all_text="" unless defined $all_txt;
415 my $fileID=0;
416
417 $all_text .= '<mets:structMap ID="All" TYPE="Whole Document" LABEL="All">'."\n";
418 $all_text .= ' <mets:div ID="All" TYPE="Document" ORDER="All" ORDERLABEL="All" LABEL="Whole Documemt" DMDID="DM1">' . "\n";
419
420
421 #** output the StruMapSection for the whole section
422 #*** get the sourcefile and associative file
423
424 foreach my $data (@{$section_ptr->{'metadata'}}){
425 my $escaped_value = &_escape_text($data->[1]);
426
427 if ($data->[0] eq "gsdlsourcefilename") {
428 ++$fileID;
429 $all_text .= ' <mets:fptr FILEID="default.'.$fileID.'" />'."\n";
430 }
431
432 if ($data->[0] eq "gsdlassocfile"){
433 ++$fileID;
434 $all_text .= ' <mets:fptr FILEID="default.'.$fileID. '" />'. "\n";
435 }
436 }
437 $all_text .= " </mets:div>\n";
438
439 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
440
441 return $all_text;
442}
443
444
445sub buffer_mets_dmdSection_section_xml(){
446 my $self = shift(@_);
447 my ($section) = @_;
448
449 $section="" unless defined $section;
450
451 my $section_ptr=$self->_lookup_section($section);
452 return "" unless defined $section_ptr;
453
454 #***convert section number
455 my $section_num ="1". $section;
456 my $dmd_num = $section_num;
457
458 #**output the dmdSection details
459 if ($section_num eq "1") {
460 $dmd_num = "0";
461 }
462 my $all_text = '<mets:dmdSec ID="DM'.$dmd_num.'" GROUPID="'.$section_num.'">'. "\n";
463 $all_text .= ' <mets:mdWrap MDType="gsdl3" ID="'.$section_num.'">'."\n";
464 $all_text .= " <mets:xmlData>\n";
465 foreach my $data (@{$section_ptr->{'metadata'}}){
466 my $escaped_value = &_escape_text($data->[1]);
467 $all_text .= ' <gsdl3:Metadata name="'. $data->[0].'">'. $escaped_value. "</gsdl3:Metadata>\n";
468 if ($data->[0] eq "dc.Title") {
469 $all_text .= ' <gsdl3:Metadata name="Title">'. $escaped_value."</gsdl3:Metadata>\n";
470 }
471 }
472
473 $all_text .= " </mets:xmlData>\n";
474 $all_text .= " </mets:mdWrap>\n";
475 $all_text .= "</mets:dmdSec>\n";
476
477 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
478 $all_text .= $self->buffer_mets_dmdSection_section_xml("$section.$subsection");
479 }
480
481 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
482
483 return $all_text;
484}
485
486sub output_section {
487 my $self = shift (@_);
488 my ($handle, $section) = @_;
489
490 print $handle $self->buffer_section_xml($section);
491}
492
493#*** print out DSpace dublin_core metadata section
494sub output_dspace_section {
495 my $self = shift (@_);
496 my ($handle, $section) = @_;
497
498 my $section_ptr = $self->_lookup_section ($section);
499 return "" unless defined $section_ptr;
500
501 my $all_text = "<Section>\n";
502 $all_text .= " <Description>\n";
503
504 # output metadata
505 foreach my $data (@{$section_ptr->{'metadata'}}) {
506 my $escaped_value = &_escape_text($data->[1]);
507 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
508 }
509
510 $all_text .= " </Description>\n";
511 $all_text .= "</Section>\n";
512
513 # make sure no nasty control characters have snuck through
514 # (XML::Parser will barf on anything it doesn't consider to be
515 # valid UTF-8 text, including things like \c@, \cC etc.)
516 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
517
518 return $all_text;
519}
520
521#*** print out doctxt.xml file
522sub output_txt_section {
523 my $self = shift (@_);
524 my ($handle, $section) = @_;
525
526 print $handle $self->buffer_txt_section_xml($section);
527}
528
529#*** print out docmets.xml file
530sub output_mets_section {
531 my $self = shift(@_);
532 my ($handle, $section, $doc_Dir) = @_;
533
534 #***print out the dmdSection
535 print $handle $self->buffer_mets_dmdSection_section_xml($section);
536
537 #***print out the fileSection by sections
538 print $handle "<mets:fileSec>\n";
539 print $handle $self->buffer_mets_fileSection_section_xml($section,$doc_Dir);
540
541 #***print out the whole fileSection
542 print $handle $self->buffer_mets_fileWhole_section_xml($section);
543 print $handle "</mets:fileSec>\n";
544
545 #***print out the StruMapSection by sections
546 print $handle '<mets:structMap ID="Section" TYPE="Section" LABEL="Section">' . "\n";
547 my $order_num=0;
548 print $handle $self->buffer_mets_StruMapSection_section_xml($section, \$order_num);
549 print $handle "</mets:structMap>\n";
550 print $handle $self->buffer_mets_StruMapWhole_section_xml($section);
551 print $handle "</mets:structMap>\n";
552}
553
554#*** print out dublin_core.xml file
555sub output_dc_section {
556 my $self = shift(@_);
557 my ($handle, $section, $doc_Dir) = @_;
558
559 #***print out the dublin_core
560 $section="" unless defined $section;
561
562 my $section_ptr=$self->_lookup_section($section);
563 return "" unless defined $section_ptr;
564 my $all_text="";
565 foreach my $data (@{$section_ptr->{'metadata'}}){
566 my $escaped_value = &_escape_text($data->[1]);
567 if ($data->[0]=~ /^dc/) {
568 #$all_text .= ' <dcvalue element="'. $data->[0].'" qualifier="#####">'. $escaped_value. "</dcvalue>\n";
569 $all_text .= ' <dcvalue element="'. $data->[0].'">'. $escaped_value. "</dcvalue>\n";
570 }
571 }
572 if ($all_text eq "") {
573 $all_text .= " There is no Dublin Core metatdata in this document\n";
574 }
575 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
576
577 print $handle $all_text;
578}
579
580# look up the reference to the a particular section
581sub _lookup_section {
582 my $self = shift (@_);
583 my ($section) = @_;
584
585 my ($num);
586 my $sectionref = $self;
587
588 while (defined $section && $section ne "") {
589 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
590 $num =~ s/^0+(\d)/$1/; # remove leading 0s
591 $section = "" unless defined $section;
592
593 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
594 $sectionref = $sectionref->{'subsections'}->{$num};
595 } else {
596 return undef;
597 }
598 }
599
600 return $sectionref;
601}
602
603# calculate OID by hashing the contents of the document
604sub _calc_OID {
605 my $self = shift (@_);
606 my ($filename) = @_;
607
608 my $osexe = &util::get_os_exe();
609
610 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
611 $ENV{'GSDLOS'},"hashfile$osexe");
612
613 my $result = "NULL";
614
615 if (-e "$hashfile_exe") {
616# $result = `\"$hashfile_exe\" \"$filename\"`;
617 $result = `hashfile$osexe \"$filename\"`;
618 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
619
620 } else {
621 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
622 }
623 return "HASH$result";
624}
625
626# methods dealing with OID, not groups of them.
627
628# if $OID is not provided one is calculated
629sub set_OID {
630 my $self = shift (@_);
631 my ($OID) = @_;
632
633 # if an OID wasn't provided claculate one
634 if (!defined $OID) {
635 $OID = "NULL";
636
637 if ($self->{'OIDtype'} eq "incremental") {
638 $OID = "D" . $OIDcount;
639 $OIDcount ++;
640
641 } elsif ($self->{'OIDtype'} eq "dirname") {
642 $OID = 'J';
643 my $filename = $self->get_source_filename();
644 if (defined($filename) && -e $filename) {
645 $OID = &File::Basename::dirname($filename);
646 if (defined $OID) {
647 $OID = 'J'.&File::Basename::basename($OID);
648 } else {
649 print STDERR "Failed to find base for filename ($filename).....\n";
650 die("\n");
651 }
652 } else {
653 print STDERR "Failed to find filename.....\n";
654 die("\n");
655 }
656
657 } else {
658 # "hash" OID - feed file to hashfile.exe
659 #my $filename = $self->get_source_filename();
660 # we want to use the converted file for hashing if available
661 # cos its quicker
662 my $filename = $self->get_filename_for_hashing();
663
664 if (defined($filename) && -e $filename) {
665 $OID = $self->_calc_OID ($filename);
666 } else {
667 $filename = &util::get_tmp_filename();
668 if (!open (OUTFILE, ">$filename")) {
669 print STDERR "doc::set_OID could not write to $filename\n";
670 } else {
671 $self->output_section('OUTFILE', $self->get_top_section(),
672 undef, 1);
673 close (OUTFILE);
674 }
675 $OID = $self->_calc_OID ($filename);
676 &util::rm ($filename);
677 }
678 }
679 }
680 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
681}
682
683# this uses hashdoc (embedded c thingy) which is faster but still
684# needs a little work to be suffiently stable
685sub ___set_OID {
686 my $self = shift (@_);
687 my ($OID) = @_;
688
689 # if an OID wasn't provided then calculate hash value based on document
690 if (!defined $OID)
691 {
692 my $hash_text = $self->buffer_section_gml($self->get_top_section(),
693 undef, 1);
694 my $hash_len = length($hash_text);
695
696 $OID = &hashdoc::buffer($hash_text,$hash_len);
697 }
698
699 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
700}
701
702# returns the OID for this document
703sub get_OID {
704 my $self = shift (@_);
705 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
706 return $OID if (defined $OID);
707 return "NULL";
708}
709
710sub delete_OID {
711 my $self = shift (@_);
712
713 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
714}
715
716
717# methods for manipulating section names
718
719# returns the name of the top-most section (the top
720# level of the document
721sub get_top_section {
722 my $self = shift (@_);
723
724 return "";
725}
726
727# returns a section
728sub get_parent_section {
729 my $self = shift (@_);
730 my ($section) = @_;
731
732 $section =~ s/(^|\.)\d+$//;
733
734 return $section;
735}
736
737# returns the first child section (or the end child
738# if there isn't any)
739sub get_begin_child {
740 my $self = shift (@_);
741 my ($section) = @_;
742
743 my $section_ptr = $self->_lookup_section($section);
744 return "" unless defined $section_ptr;
745
746 if (defined $section_ptr->{'subsection_order'}->[0]) {
747 return "$section.$section_ptr->{'subsection_order'}->[0]";
748 }
749
750 return $self->get_end_child ($section);
751}
752
753# returns the next child of a parent section
754sub get_next_child {
755 my $self = shift (@_);
756 my ($section) = @_;
757
758 my $parent_section = $self->get_parent_section($section);
759 my $parent_section_ptr = $self->_lookup_section($parent_section);
760 return undef unless defined $parent_section_ptr;
761
762 my ($section_num) = $section =~ /(\d+)$/;
763 return undef unless defined $section_num;
764
765 my $i = 0;
766 my $section_order = $parent_section_ptr->{'subsection_order'};
767 while ($i < scalar(@$section_order)) {
768 last if $section_order->[$i] eq $section_num;
769 $i++;
770 }
771
772 $i++; # the next child
773 if ($i < scalar(@$section_order)) {
774 return $section_order->[$i] if $parent_section eq "";
775 return "$parent_section.$section_order->[$i]";
776 }
777
778 # no more sections in this level
779 return undef;
780}
781
782# returns a reference to a list of children
783sub get_children {
784 my $self = shift (@_);
785 my ($section) = @_;
786
787 my $section_ptr = $self->_lookup_section($section);
788 return [] unless defined $section_ptr;
789
790 my @children = @{$section_ptr->{'subsection_order'}};
791
792 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
793 return \@children;
794}
795
796# returns the child section one past the last one (which
797# is coded as "0")
798sub get_end_child {
799 my $self = shift (@_);
800 my ($section) = @_;
801
802 return $section . ".0" unless $section eq "";
803 return "0";
804}
805
806# returns the next section in book order
807sub get_next_section {
808 my $self = shift (@_);
809 my ($section) = @_;
810
811 return undef unless defined $section;
812
813 my $section_ptr = $self->_lookup_section($section);
814 return undef unless defined $section_ptr;
815
816 # first try to find first child
817 if (defined $section_ptr->{'subsection_order'}->[0]) {
818 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
819 return "$section.$section_ptr->{'subsection_order'}->[0]";
820 }
821
822 do {
823 # try to find sibling
824 my $next_child = $self->get_next_child ($section);
825 return $next_child if (defined $next_child);
826
827 # move up one level
828 $section = $self->get_parent_section ($section);
829 } while $section =~ /\d/;
830
831 return undef;
832}
833
834sub is_leaf_section {
835 my $self = shift (@_);
836 my ($section) = @_;
837
838 my $section_ptr = $self->_lookup_section($section);
839 return 1 unless defined $section_ptr;
840
841 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
842}
843
844# methods for dealing with sections
845
846# returns the name of the inserted section
847sub insert_section {
848 my $self = shift (@_);
849 my ($before_section) = @_;
850
851 # get the child to insert before and its parent section
852 my $parent_section = "";
853 my $before_child = "0";
854 my @before_section = split (/\./, $before_section);
855 if (scalar(@before_section) > 0) {
856 $before_child = pop (@before_section);
857 $parent_section = join (".", @before_section);
858 }
859
860 my $parent_section_ptr = $self->_lookup_section($parent_section);
861 if (!defined $parent_section_ptr) {
862 print STDERR "doc::insert_section couldn't find parent section " .
863 "$parent_section\n";
864 return;
865 }
866
867 # get the next section number
868 my $section_num = $parent_section_ptr->{'next_subsection'}++;
869
870 my $i = 0;
871 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
872 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
873 $i++;
874 }
875
876 # insert the section number into the order list
877 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
878
879 # add this section to the parent section
880 my $section_ptr = {'subsection_order'=>[],
881 'next_subsection'=>1,
882 'subsections'=>{},
883 'metadata'=>[],
884 'text'=>""};
885 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
886
887 # work out the full section number
888 my $section = $parent_section;
889 $section .= "." unless $section eq "";
890 $section .= $section_num;
891
892 return $section;
893}
894
895# creates a pre-named section
896sub create_named_section {
897 my $self = shift (@_);
898 my ($mastersection) = @_;
899
900 my ($num);
901 my $section = $mastersection;
902 my $sectionref = $self;
903
904 while ($section ne "") {
905 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
906 $num =~ s/^0+(\d)/$1/; # remove leading 0s
907 $section = "" unless defined $section;
908
909 if (defined $num) {
910 if (!defined $sectionref->{'subsections'}->{$num}) {
911 push (@{$sectionref->{'subsection_order'}}, $num);
912 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
913 'next_subsection'=>1,
914 'subsections'=>{},
915 'metadata'=>[],
916 'text'=>""};
917 if ($num >= $sectionref->{'next_subsection'}) {
918 $sectionref->{'next_subsection'} = $num + 1;
919 }
920 }
921 $sectionref = $sectionref->{'subsections'}->{$num};
922
923 } else {
924 print STDERR "doc::create_named_section couldn't create section ";
925 print STDERR "$mastersection\n";
926 last;
927 }
928 }
929}
930
931# returns a reference to a list of subsections
932sub list_subsections {
933 my $self = shift (@_);
934 my ($section) = @_;
935
936 my $section_ptr = $self->_lookup_section ($section);
937 if (!defined $section_ptr) {
938 print STDERR "doc::list_subsections couldn't find section $section\n";
939 return [];
940 }
941
942 return [@{$section_ptr->{'subsection_order'}}];
943}
944
945sub delete_section {
946 my $self = shift (@_);
947 my ($section) = @_;
948
949# my $section_ptr = {'subsection_order'=>[],
950# 'next_subsection'=>1,
951# 'subsections'=>{},
952# 'metadata'=>[],
953# 'text'=>""};
954
955 # if this is the top section reset everything
956 if ($section eq "") {
957 $self->{'subsection_order'} = [];
958 $self->{'subsections'} = {};
959 $self->{'metadata'} = [];
960 $self->{'text'} = "";
961 return;
962 }
963
964 # find the parent of the section to delete
965 my $parent_section = "";
966 my $child = "0";
967 my @section = split (/\./, $section);
968 if (scalar(@section) > 0) {
969 $child = pop (@section);
970 $parent_section = join (".", @section);
971 }
972
973 my $parent_section_ptr = $self->_lookup_section($parent_section);
974 if (!defined $parent_section_ptr) {
975 print STDERR "doc::delete_section couldn't find parent section " .
976 "$parent_section\n";
977 return;
978 }
979
980 # remove this section from the subsection_order list
981 my $i = 0;
982 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
983 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
984 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
985 last;
986 }
987 $i++;
988 }
989
990 # remove this section from the subsection hash
991 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
992 undef $parent_section_ptr->{'subsections'}->{$child};
993 }
994}
995
996#--
997# methods for dealing with metadata
998
999# set_metadata_element and get_metadata_element are for metadata
1000# which should only have one value. add_meta_data and get_metadata
1001# are for metadata which can have more than one value.
1002
1003# returns the first metadata value which matches field
1004
1005# This version of get metadata element works much like the one above,
1006# except it allows for the namespace portion of a metadata element to
1007# be ignored, thus if you are searching for dc.Title, the first piece
1008# of matching metadata ending with the name Title (once any namespace
1009# is removed) would be returned.
1010# 28-11-2003 John Thompson
1011sub get_metadata_element {
1012 my $self = shift (@_);
1013 my ($section, $field, $ignore_namespace) = @_;
1014 my ($data);
1015
1016 $ignore_namespace = 0 unless defined $ignore_namespace;
1017
1018 my $section_ptr = $self->_lookup_section($section);
1019 if (!defined $section_ptr) {
1020 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
1021 return;
1022 }
1023
1024 # Remove the any namespace if we are being told to ignore them
1025 if($ignore_namespace) {
1026 $field =~ s/^\w*\.//;
1027 }
1028
1029 foreach $data (@{$section_ptr->{'metadata'}}) {
1030
1031 my $data_name = $data->[0];
1032 # Remove the any namespace if we are being told to ignore them
1033 if($ignore_namespace) {
1034 $data_name =~ s/^\w*\.//;
1035 }
1036
1037 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
1038 }
1039
1040 return undef; # was not found
1041}
1042
1043# returns a list of the form [value1, value2, ...]
1044sub get_metadata {
1045 my $self = shift (@_);
1046 my ($section, $field, $ignore_namespace) = @_;
1047 my ($data);
1048
1049 $ignore_namespace = 0 unless defined $ignore_namespace;
1050
1051 my $section_ptr = $self->_lookup_section($section);
1052 if (!defined $section_ptr) {
1053 print STDERR "doc::get_metadata couldn't find section ",
1054 $section, "\n";
1055 return;
1056 }
1057
1058 # Remove the any namespace if we are being told to ignore them
1059 if($ignore_namespace) {
1060 $field =~ s/^\w*\.//;
1061 }
1062
1063 my @metadata = ();
1064 foreach $data (@{$section_ptr->{'metadata'}}) {
1065
1066 my $data_name = $data->[0];
1067 # Remove the any namespace if we are being told to ignore them
1068 if($ignore_namespace) {
1069 $data_name =~ s/^\w*\.//;
1070 }
1071
1072 push (@metadata, $data->[1]) if ($data_name eq $field);
1073 }
1074
1075 return \@metadata;
1076}
1077
1078# returns a list of the form [[field,value],[field,value],...]
1079sub get_all_metadata {
1080 my $self = shift (@_);
1081 my ($section) = @_;
1082
1083 my $section_ptr = $self->_lookup_section($section);
1084 if (!defined $section_ptr) {
1085 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
1086 return;
1087 }
1088
1089 return $section_ptr->{'metadata'};
1090}
1091
1092# $value is optional
1093sub delete_metadata {
1094 my $self = shift (@_);
1095 my ($section, $field, $value) = @_;
1096
1097 my $section_ptr = $self->_lookup_section($section);
1098 if (!defined $section_ptr) {
1099 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
1100 return;
1101 }
1102
1103 my $i = 0;
1104 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1105 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1106 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1107 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1108 } else {
1109 $i++;
1110 }
1111 }
1112}
1113
1114sub delete_all_metadata {
1115 my $self = shift (@_);
1116 my ($section) = @_;
1117
1118 my $section_ptr = $self->_lookup_section($section);
1119 if (!defined $section_ptr) {
1120 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1121 return;
1122 }
1123
1124 $section_ptr->{'metadata'} = [];
1125}
1126
1127sub set_metadata_element {
1128 my $self = shift (@_);
1129 my ($section, $field, $value) = @_;
1130
1131 $self->set_utf8_metadata_element ($section, $field,
1132 &unicode::ascii2utf8(\$value));
1133}
1134
1135# set_utf8_metadata_element assumes the text has already been
1136# converted to the UTF-8 encoding.
1137sub set_utf8_metadata_element {
1138 my $self = shift (@_);
1139 my ($section, $field, $value) = @_;
1140
1141 $self->delete_metadata ($section, $field);
1142 $self->add_utf8_metadata ($section, $field, $value);
1143}
1144
1145
1146# add_metadata assumes the text is in (extended) ascii form. For
1147# text which has already been converted to the UTF-8 format use
1148# add_utf8_metadata.
1149sub add_metadata {
1150 my $self = shift (@_);
1151 my ($section, $field, $value) = @_;
1152
1153 $self->add_utf8_metadata ($section, $field,
1154 &unicode::ascii2utf8(\$value));
1155}
1156
1157sub add_utf8_metadata {
1158 my $self = shift (@_);
1159 my ($section, $field, $value) = @_;
1160
1161 my $section_ptr = $self->_lookup_section($section);
1162 if (!defined $section_ptr) {
1163 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1164 return;
1165 }
1166 if (!defined $value) {
1167 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1168 return;
1169 }
1170 if (!defined $field) {
1171 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1172 return;
1173 }
1174
1175 # double check that the value is utf-8
1176 if (unicode::ensure_utf8(\$value)) {
1177 print STDERR "doc::add_utf8_metadata: warning: '$field' wasn't utf8\n";
1178 }
1179
1180 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1181}
1182
1183
1184# methods for dealing with text
1185
1186# returns the text for a section
1187sub get_text {
1188 my $self = shift (@_);
1189 my ($section) = @_;
1190
1191 my $section_ptr = $self->_lookup_section($section);
1192 if (!defined $section_ptr) {
1193 print STDERR "doc::get_text couldn't find section " .
1194 "$section\n";
1195 return "";
1196 }
1197
1198 return $section_ptr->{'text'};
1199}
1200
1201# returns the (utf-8 encoded) length of the text for a section
1202sub get_text_length {
1203 my $self = shift (@_);
1204 my ($section) = @_;
1205
1206 my $section_ptr = $self->_lookup_section($section);
1207 if (!defined $section_ptr) {
1208 print STDERR "doc::get_text_length couldn't find section " .
1209 "$section\n";
1210 return 0;
1211 }
1212
1213 return length ($section_ptr->{'text'});
1214}
1215
1216sub delete_text {
1217 my $self = shift (@_);
1218 my ($section) = @_;
1219
1220 my $section_ptr = $self->_lookup_section($section);
1221 if (!defined $section_ptr) {
1222 print STDERR "doc::delete_text couldn't find section " .
1223 "$section\n";
1224 return;
1225 }
1226
1227 $section_ptr->{'text'} = "";
1228}
1229
1230# add_text assumes the text is in (extended) ascii form. For
1231# text which has been already converted to the UTF-8 format
1232# use add_utf8_text.
1233sub add_text {
1234 my $self = shift (@_);
1235 my ($section, $text) = @_;
1236
1237 # convert the text to UTF-8 encoded unicode characters
1238 # and add the text
1239 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1240}
1241
1242
1243# add_utf8_text assumes the text to be added has already
1244# been converted to the UTF-8 encoding. For ascii text use
1245# add_text
1246sub add_utf8_text {
1247 my $self = shift (@_);
1248 my ($section, $text) = @_;
1249
1250 my $section_ptr = $self->_lookup_section($section);
1251 if (!defined $section_ptr) {
1252 print STDERR "doc::add_utf8_text couldn't find section " .
1253 "$section\n";
1254 return;
1255 }
1256
1257 $section_ptr->{'text'} .= $text;
1258}
1259
1260
1261# methods for dealing with associated files
1262
1263# a file is associated with a document, NOT a section.
1264# if section is defined it is noted in the data structure
1265# only so that files associated from a particular section
1266# may be removed later (using delete_section_assoc_files)
1267sub associate_file {
1268 my $self = shift (@_);
1269 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1270 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1271
1272 # remove all associated files with the same name
1273 $self->delete_assoc_file ($assoc_filename);
1274
1275 push (@{$self->{'associated_files'}},
1276 [$real_filename, $assoc_filename, $mime_type, $section]);
1277}
1278
1279# returns a list of associated files in the form
1280# [[real_filename, assoc_filename, mimetype], ...]
1281sub get_assoc_files {
1282 my $self = shift (@_);
1283
1284 return $self->{'associated_files'};
1285}
1286
1287sub delete_section_assoc_files {
1288 my $self = shift (@_);
1289 my ($section) = @_;
1290
1291 my $i=0;
1292 while ($i < scalar (@{$self->{'associated_files'}})) {
1293 if (defined $self->{'associated_files'}->[$i]->[3] &&
1294 $self->{'associated_files'}->[$i]->[3] eq $section) {
1295 splice (@{$self->{'associated_files'}}, $i, 1);
1296 } else {
1297 $i++;
1298 }
1299 }
1300}
1301
1302sub delete_assoc_file {
1303 my $self = shift (@_);
1304 my ($assoc_filename) = @_;
1305
1306 my $i=0;
1307 while ($i < scalar (@{$self->{'associated_files'}})) {
1308 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1309 splice (@{$self->{'associated_files'}}, $i, 1);
1310 } else {
1311 $i++;
1312 }
1313 }
1314}
1315
1316sub reset_nextsection_ptr {
1317 my $self = shift (@_);
1318 my ($section) = @_;
1319
1320 my $section_ptr = $self->_lookup_section($section);
1321 $section_ptr->{'next_subsection'} = 1;
1322}
1323
13241;
Note: See TracBrowser for help on using the repository browser.