source: main/tags/2.51-fiji/gsdl/perllib/doc.pm@ 24574

Last change on this file since 24574 was 7929, checked in by davidb, 20 years ago

doc.pm modified so filename stored under gsdlsourcefilename is local
to collection.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 32.5 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use unicode;
37use util;
38use ghtml;
39##use hashdoc;
40
41# the document type may be indexed_doc, nonindexed_doc, or
42# classification
43
44my $OIDcount = 0;
45
46sub new {
47 my $class = shift (@_);
48 my ($source_filename, $doc_type) = @_;
49
50 my $self = bless {'associated_files'=>[],
51 'subsection_order'=>[],
52 'next_subsection'=>1,
53 'subsections'=>{},
54 'metadata'=>[],
55 'text'=>"",
56 'OIDtype'=>"hash"}, $class;
57
58 if (defined $source_filename) {
59 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
60
61 if (defined $collect_dir) {
62 my $dirsep = &util::get_dirsep();
63 if ($collect_dir !~ m/$dirsep$/) {
64 $collect_dir .= $dirsep;
65 }
66
67 $collect_dir =~ s/\\/\\\\/g; # escape DOS style file separator
68
69 if ($source_filename =~ /^$collect_dir(.*)$/) {
70 $source_filename = $1;
71 }
72 }
73
74 $self->set_source_filename ($source_filename);
75 }
76
77 $self->set_doc_type ($doc_type) if defined $doc_type;
78
79 return $self;
80}
81
82# clone the $self object
83sub duplicate {
84 my $self = shift (@_);
85
86 my $newobj = {};
87
88 foreach $k (keys %$self) {
89 $newobj->{$k} = &clone ($self->{$k});
90 }
91
92 bless $newobj, ref($self);
93 return $newobj;
94}
95
96sub clone {
97 my ($from) = @_;
98 my $type = ref ($from);
99
100 if ($type eq "HASH") {
101 my $to = {};
102 foreach $key (keys %$from) {
103 $to->{$key} = &clone ($from->{$key});
104 }
105 return $to;
106 } elsif ($type eq "ARRAY") {
107 my $to = [];
108 foreach $v (@$from) {
109 push (@$to, &clone ($v));
110 }
111 return $to;
112 } else {
113 return $from;
114 }
115}
116
117sub set_OIDtype {
118 my $self = shift (@_);
119 my ($type) = @_;
120
121 if ($type eq "incremental") {
122 $self->{'OIDtype'} = $type;
123 } else {
124 $self->{'OIDtype'} = "hash";
125 }
126}
127
128sub set_source_filename {
129 my $self = shift (@_);
130 my ($source_filename) = @_;
131
132 $self->set_metadata_element ($self->get_top_section(),
133 "gsdlsourcefilename",
134 $source_filename);
135}
136
137sub set_converted_filename {
138 my $self = shift (@_);
139 my ($converted_filename) = @_;
140
141 $self->set_metadata_element ($self->get_top_section(),
142 "gsdlconvertedfilename",
143 $converted_filename);
144}
145
146
147# returns the source_filename as it was provided
148sub get_source_filename {
149 my $self = shift (@_);
150
151 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
152}
153
154# returns converted filename if available else returns source filename
155sub get_filename_for_hashing {
156 my $self = shift (@_);
157
158 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
159
160 if (!defined $filename) {
161 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
162 }
163 return $filename;
164}
165
166sub set_doc_type {
167 my $self = shift (@_);
168 my ($doc_type) = @_;
169
170 $self->set_metadata_element ($self->get_top_section(),
171 "gsdldoctype",
172 $doc_type);
173}
174
175# returns the source_filename as it was provided
176# the default of "indexed_doc" is used if no document
177# type was provided
178sub get_doc_type {
179 my $self = shift (@_);
180
181 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
182 return $doc_type if (defined $doc_type);
183 return "indexed_doc";
184}
185
186sub _escape_text {
187 my ($text) = @_;
188
189 # special characters in the gml encoding
190 $text =~ s/&/&/g; # this has to be first...
191 $text =~ s/</&lt;/g;
192 $text =~ s/>/&gt;/g;
193 $text =~ s/\"/&quot;/g;
194
195 return $text;
196}
197
198sub buffer_section_xml {
199 my $self = shift (@_);
200 my ($section) = @_;
201
202 my $section_ptr = $self->_lookup_section ($section);
203 return "" unless defined $section_ptr;
204
205 my $all_text = "<Section>\n";
206 $all_text .= " <Description>\n";
207
208 # output metadata
209 foreach my $data (@{$section_ptr->{'metadata'}}) {
210 my $escaped_value = &_escape_text($data->[1]);
211 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
212 }
213
214 $all_text .= " </Description>\n";
215
216 # output the text
217 $all_text .= " <Content>";
218 $all_text .= &_escape_text($section_ptr->{'text'});
219 $all_text .= "</Content>\n";
220
221 # output all the subsections
222 foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
223 $all_text .= $self->buffer_section_xml("$section.$subsection");
224 }
225
226 $all_text .= "</Section>\n";
227
228 # make sure no nasty control characters have snuck through
229 # (XML::Parser will barf on anything it doesn't consider to be
230 # valid UTF-8 text, including things like \c@, \cC etc.)
231 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
232
233 return $all_text;
234}
235
236sub buffer_txt_section_xml {
237 my $self = shift(@_);
238 my ($section) = @_;
239
240 my $section_ptr = $self->_lookup_section ($section);
241
242 return "" unless defined $section_ptr;
243
244 my $all_text = "<Section>\n";
245
246 ##output the text
247 #$all_text .= " <Content>";
248 $all_text .= &_escape_text($section_ptr->{'text'});
249 #$all_text .= " </Content>\n";
250
251
252 #output all the subsections
253 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
254 $all_text .= $self->buffer_txt_section_xml("$section.$subsection");
255 }
256
257 $all_text .= "</Section>\n";
258
259
260 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
261 return $all_text;
262}
263
264sub buffer_mets_fileSection_section_xml() {
265 my $self = shift(@_);
266 my ($section, $doc_Dir) = @_;
267
268 $section="" unless defined $section;
269
270
271 my $section_ptr=$self->_lookup_section($section);
272 return "" unless defined $section_ptr;
273
274
275 #**output fileSection by sections
276 my $section_num ="1". $section;
277
278
279 my $filePath = $doc_Dir . '/doctxt.xml';
280
281 #**output the fileSection details
282 my $all_text = ' <mets:fileGrp ID="FILEGROUP_PRELUDE' . $section_num . '">'. "\n";
283 $all_text .= ' <mets:file MIMETYPE="text/xml" ID="FILE'.$section_num. '">'. "\n";
284 $all_text .= ' <mets:FLocate LOCTYPE="URL" xlink:href="file:'.$filePath.'#xpointer(/Section[';
285
286 my $xpath = "1".$section;
287
288 $xpath =~ s/\./]\/Section[/g;
289
290 $all_text .= $xpath;
291
292 $all_text .= ']/text())" ID="FILE'. $section_num. '" />' . "\n";
293 $all_text .= " </mets:file>\n";
294 $all_text .= " </mets:fileGrp>\n";
295
296
297 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
298 $all_text .= $self->buffer_mets_fileSection_section_xml("$section.$subsection",$doc_Dir);
299 }
300
301 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
302
303 return $all_text;
304}
305
306sub buffer_mets_fileWhole_section_xml(){
307 my $self = shift(@_);
308 my ($section) = @_;
309
310 my $section_ptr = $self-> _lookup_section($section);
311 return "" unless defined $section_ptr;
312
313 my $all_text="" unless defined $all_txt;
314
315 my ($dirPath)="" unless defined $dirPath;
316 my $fileID=0;
317
318 #** output the fileSection for the whole section
319 #*** get the sourcefile and associative file
320
321 foreach my $data (@{$section_ptr->{'metadata'}}){
322 my $escaped_value = &_escape_text($data->[1]);
323 if ($data->[0] eq "gsdlsourcefilename") {
324 ($dirPath) = $escaped_value =~ m/^(.*)[\/\\][^\/\\]*$/;
325
326 $all_text .= ' <mets:fileGrp ID="default">'."\n";
327 ++$fileID;
328 $all_text .= ' <mets:file MIMETYPE="text/xml" ID="default.'.$fileID.'">'. "\n";
329 $all_text .= ' <mets:FLocate LOCTYPE="URL" xlink:href="file:'.$data->[1].'" ID="default.'.$fileID.'" />'."\n";
330
331 $all_text .= " </mets:file>\n";
332 }
333
334 if ($data->[0] eq "gsdlassocfile"){
335 $escaped_value =~ m/^(.*?):(.*):$/;
336
337 my $assfilePath = $dirPath . '/'. $1;
338 ++$fileID;
339 $all_text .= ' <mets:file MIMETYPE="'.$2.'" ID="default.'.$fileID. '">'. "\n";
340 $all_text .= ' <mets:FLocate LOCTYPE="URL" xlink:href="file:'.$assfilePath.'" ID="default.'. $fileID.'" />'."\n";
341
342 $all_text .= " </mets:file>\n";
343 }
344 }
345 $all_text .= " </mets:fileGrp>\n";
346
347
348 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
349
350 return $all_text;
351}
352
353sub buffer_mets_StruMapSection_section_xml(){
354 my $self = shift(@_);
355 my ($section, $order_numref) = @_;
356
357 $section="" unless defined $section;
358
359
360 my $section_ptr=$self->_lookup_section($section);
361 return "" unless defined $section_ptr;
362
363 #**output fileSection by sections
364 my $section_num ="1". $section;
365
366 #**output the StruMap details
367
368 my $all_text = ' <mets:div ID="DS'. $section_num .'" TYPE="Section" ORDER="'.$$order_numref++.'" ORDERLABEL="'. $section_num .'" LABEL="';
369 $all_text .= $section_num . '" DMDID="DM'.$section_num.'">'. "\n";
370
371 $all_text .= ' <mets:fptr FILEID="FILEGROUP_PRELUDE'.$section_num.'" />'. "\n";
372
373
374 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
375 $all_text .= $self->buffer_mets_StruMapSection_section_xml("$section.$subsection", $order_numref);
376 }
377
378 $all_text .= " </mets:div>\n";
379
380 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
381
382 return $all_text;
383}
384
385
386sub buffer_mets_StruMapWhole_section_xml(){
387 my $self = shift(@_);
388 my ($section) = @_;
389
390 my $section_ptr = $self-> _lookup_section($section);
391 return "" unless defined $section_ptr;
392
393 my $all_text="" unless defined $all_txt;
394 my $fileID=0;
395
396 $all_text .= '<mets:structMap ID="All" TYPE="Whole Document" LABEL="All">'."\n";
397 $all_text .= ' <mets:div ID="All" TYPE="Document" ORDER="All" ORDERLABEL="All" LABEL="Whole Documemt" DMDID="DM1">' . "\n";
398
399
400 #** output the StruMapSection for the whole section
401 #*** get the sourcefile and associative file
402
403 foreach my $data (@{$section_ptr->{'metadata'}}){
404 my $escaped_value = &_escape_text($data->[1]);
405
406 if ($data->[0] eq "gsdlsourcefilename") {
407 ++$fileID;
408 $all_text .= ' <mets:fptr FILEID="default.'.$fileID.'" />'."\n";
409 }
410
411 if ($data->[0] eq "gsdlassocfile"){
412 ++$fileID;
413 $all_text .= ' <mets:fptr FILEID="default.'.$fileID. '" />'. "\n";
414 }
415 }
416 $all_text .= " </mets:div>\n";
417
418 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
419
420 return $all_text;
421}
422
423
424sub buffer_mets_dmdSection_section_xml(){
425 my $self = shift(@_);
426 my ($section) = @_;
427
428 $section="" unless defined $section;
429
430 my $section_ptr=$self->_lookup_section($section);
431 return "" unless defined $section_ptr;
432
433 #***convert section number
434 my $section_num ="1". $section;
435
436 #**output the dmdSection details
437 my $all_text = '<mets:dmdSec ID="DM'.$section_num.'" GROUPID="'.$section_num.'">'. "\n";
438 $all_text .= ' <mets:mdWrap MDType="gsdl" ID="'.$section_num.'">'."\n";
439 $all_text .= " <mets:xmlData>\n";
440 foreach my $data (@{$section_ptr->{'metadata'}}){
441 my $escaped_value = &_escape_text($data->[1]);
442 $all_text .= ' <gsdl:Metadata name="'. $data->[0].'">'. $escaped_value. "</gsdl:Metadata>\n";
443 }
444 $all_text .= " </mets:xmlData>\n";
445 $all_text .= " </mets:mdWrap>\n";
446 $all_text .= "</mets:dmdSec>\n";
447
448 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
449 $all_text .= $self->buffer_mets_dmdSection_section_xml("$section.$subsection");
450 }
451
452 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
453
454 return $all_text;
455}
456
457sub output_section {
458 my $self = shift (@_);
459 my ($handle, $section) = @_;
460
461 print $handle $self->buffer_section_xml($section);
462}
463
464
465#*** print out doctxt.xml file
466sub output_txt_section {
467 my $self = shift (@_);
468 my ($handle, $section) = @_;
469
470 print $handle $self->buffer_txt_section_xml($section);
471}
472
473#*** print out docmets.xml file
474sub output_mets_section {
475 my $self = shift(@_);
476 my ($handle, $section, $doc_Dir) = @_;
477
478 #***print out the dmdSection
479 print $handle $self->buffer_mets_dmdSection_section_xml($section);
480
481 #***print out the fileSection by sections
482 print $handle "<mets:fileSec>\n";
483 print $handle $self->buffer_mets_fileSection_section_xml($section,$doc_Dir);
484 #***print out the whole fileSection
485 print $handle $self->buffer_mets_fileWhole_section_xml($section);
486 print $handle "</mets:fileSec>\n";
487
488 #***print out the StruMapSection by sections
489 print $handle '<mets:structMap ID="Section" TYPE="Section" LABEL="Section">' . "\n";
490 my $order_num=0;
491 print $handle $self->buffer_mets_StruMapSection_section_xml($section, \$order_num);
492 print $handle "</mets:structMap>\n";
493 print $handle $self->buffer_mets_StruMapWhole_section_xml($section);
494 print $handle "</mets:structMap>\n";
495}
496
497
498# look up the reference to the a particular section
499sub _lookup_section {
500 my $self = shift (@_);
501 my ($section) = @_;
502
503 my ($num);
504 my $sectionref = $self;
505
506 while (defined $section && $section ne "") {
507 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
508 $num =~ s/^0+(\d)/$1/; # remove leading 0s
509 $section = "" unless defined $section;
510
511 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
512 $sectionref = $sectionref->{'subsections'}->{$num};
513 } else {
514 return undef;
515 }
516 }
517
518 return $sectionref;
519}
520
521# calculate OID by hashing the contents of the document
522sub _calc_OID {
523 my $self = shift (@_);
524 my ($filename) = @_;
525
526 my $osexe = &util::get_os_exe();
527
528 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
529 $ENV{'GSDLOS'},"hashfile$osexe");
530 my $result = "NULL";
531
532 if (-e "$hashfile_exe") {
533# $result = `\"$hashfile_exe\" \"$filename\"`;
534 $result = `hashfile$osexe \"$filename\"`;
535 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
536
537 } else {
538 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
539 }
540
541 return "HASH$result";
542}
543
544# methods dealing with OID, not groups of them.
545
546# if $OID is not provided one is calculated
547sub set_OID {
548 my $self = shift (@_);
549 my ($OID) = @_;
550
551 # if an OID wasn't provided claculate one
552 if (!defined $OID) {
553 $OID = "NULL";
554
555 if ($self->{'OIDtype'} eq "incremental") {
556 $OID = "D" . $OIDcount;
557 $OIDcount ++;
558
559 } else {
560 # "hash" OID - feed file to hashfile.exe
561 #my $filename = $self->get_source_filename();
562 # we want to use the converted file for hashing if available
563 # cos its quicker
564 my $filename = $self->get_filename_for_hashing();
565 if (defined($filename) && -e $filename) {
566 $OID = $self->_calc_OID ($filename);
567 } else {
568 $filename = &util::get_tmp_filename();
569 if (!open (OUTFILE, ">$filename")) {
570 print STDERR "doc::set_OID could not write to $filename\n";
571 } else {
572 $self->output_section('OUTFILE', $self->get_top_section(),
573 undef, 1);
574 close (OUTFILE);
575 }
576
577 $OID = $self->_calc_OID ($filename);
578 &util::rm ($filename);
579 }
580 }
581 }
582 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
583}
584
585# this uses hashdoc (embedded c thingy) which is faster but still
586# needs a little work to be suffiently stable
587sub ___set_OID {
588 my $self = shift (@_);
589 my ($OID) = @_;
590
591 # if an OID wasn't provided then calculate hash value based on document
592 if (!defined $OID)
593 {
594 my $hash_text = $self->buffer_section_gml($self->get_top_section(),
595 undef, 1);
596 my $hash_len = length($hash_text);
597
598 $OID = &hashdoc::buffer($hash_text,$hash_len);
599 }
600
601 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
602}
603
604# returns the OID for this document
605sub get_OID {
606 my $self = shift (@_);
607 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
608 return $OID if (defined $OID);
609 return "NULL";
610}
611
612sub delete_OID {
613 my $self = shift (@_);
614
615 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
616}
617
618
619# methods for manipulating section names
620
621# returns the name of the top-most section (the top
622# level of the document
623sub get_top_section {
624 my $self = shift (@_);
625
626 return "";
627}
628
629# returns a section
630sub get_parent_section {
631 my $self = shift (@_);
632 my ($section) = @_;
633
634 $section =~ s/(^|\.)\d+$//;
635
636 return $section;
637}
638
639# returns the first child section (or the end child
640# if there isn't any)
641sub get_begin_child {
642 my $self = shift (@_);
643 my ($section) = @_;
644
645 my $section_ptr = $self->_lookup_section($section);
646 return "" unless defined $section_ptr;
647
648 if (defined $section_ptr->{'subsection_order'}->[0]) {
649 return "$section.$section_ptr->{'subsection_order'}->[0]";
650 }
651
652 return $self->get_end_child ($section);
653}
654
655# returns the next child of a parent section
656sub get_next_child {
657 my $self = shift (@_);
658 my ($section) = @_;
659
660 my $parent_section = $self->get_parent_section($section);
661 my $parent_section_ptr = $self->_lookup_section($parent_section);
662 return undef unless defined $parent_section_ptr;
663
664 my ($section_num) = $section =~ /(\d+)$/;
665 return undef unless defined $section_num;
666
667 my $i = 0;
668 my $section_order = $parent_section_ptr->{'subsection_order'};
669 while ($i < scalar(@$section_order)) {
670 last if $section_order->[$i] eq $section_num;
671 $i++;
672 }
673
674 $i++; # the next child
675 if ($i < scalar(@$section_order)) {
676 return $section_order->[$i] if $parent_section eq "";
677 return "$parent_section.$section_order->[$i]";
678 }
679
680 # no more sections in this level
681 return undef;
682}
683
684# returns a reference to a list of children
685sub get_children {
686 my $self = shift (@_);
687 my ($section) = @_;
688
689 my $section_ptr = $self->_lookup_section($section);
690 return [] unless defined $section_ptr;
691
692 my @children = @{$section_ptr->{'subsection_order'}};
693
694 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
695 return \@children;
696}
697
698# returns the child section one past the last one (which
699# is coded as "0")
700sub get_end_child {
701 my $self = shift (@_);
702 my ($section) = @_;
703
704 return $section . ".0" unless $section eq "";
705 return "0";
706}
707
708# returns the next section in book order
709sub get_next_section {
710 my $self = shift (@_);
711 my ($section) = @_;
712
713 return undef unless defined $section;
714
715 my $section_ptr = $self->_lookup_section($section);
716 return undef unless defined $section_ptr;
717
718 # first try to find first child
719 if (defined $section_ptr->{'subsection_order'}->[0]) {
720 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
721 return "$section.$section_ptr->{'subsection_order'}->[0]";
722 }
723
724 do {
725 # try to find sibling
726 my $next_child = $self->get_next_child ($section);
727 return $next_child if (defined $next_child);
728
729 # move up one level
730 $section = $self->get_parent_section ($section);
731 } while $section =~ /\d/;
732
733 return undef;
734}
735
736sub is_leaf_section {
737 my $self = shift (@_);
738 my ($section) = @_;
739
740 my $section_ptr = $self->_lookup_section($section);
741 return 1 unless defined $section_ptr;
742
743 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
744}
745
746# methods for dealing with sections
747
748# returns the name of the inserted section
749sub insert_section {
750 my $self = shift (@_);
751 my ($before_section) = @_;
752
753 # get the child to insert before and its parent section
754 my $parent_section = "";
755 my $before_child = "0";
756 my @before_section = split (/\./, $before_section);
757 if (scalar(@before_section) > 0) {
758 $before_child = pop (@before_section);
759 $parent_section = join (".", @before_section);
760 }
761
762 my $parent_section_ptr = $self->_lookup_section($parent_section);
763 if (!defined $parent_section_ptr) {
764 print STDERR "doc::insert_section couldn't find parent section " .
765 "$parent_section\n";
766 return;
767 }
768
769 # get the next section number
770 my $section_num = $parent_section_ptr->{'next_subsection'}++;
771
772 my $i = 0;
773 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
774 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
775 $i++;
776 }
777
778 # insert the section number into the order list
779 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
780
781 # add this section to the parent section
782 my $section_ptr = {'subsection_order'=>[],
783 'next_subsection'=>1,
784 'subsections'=>{},
785 'metadata'=>[],
786 'text'=>""};
787 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
788
789 # work out the full section number
790 my $section = $parent_section;
791 $section .= "." unless $section eq "";
792 $section .= $section_num;
793
794 return $section;
795}
796
797# creates a pre-named section
798sub create_named_section {
799 my $self = shift (@_);
800 my ($mastersection) = @_;
801
802 my ($num);
803 my $section = $mastersection;
804 my $sectionref = $self;
805
806 while ($section ne "") {
807 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
808 $num =~ s/^0+(\d)/$1/; # remove leading 0s
809 $section = "" unless defined $section;
810
811 if (defined $num) {
812 if (!defined $sectionref->{'subsections'}->{$num}) {
813 push (@{$sectionref->{'subsection_order'}}, $num);
814 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
815 'next_subsection'=>1,
816 'subsections'=>{},
817 'metadata'=>[],
818 'text'=>""};
819 if ($num >= $sectionref->{'next_subsection'}) {
820 $sectionref->{'next_subsection'} = $num + 1;
821 }
822 }
823 $sectionref = $sectionref->{'subsections'}->{$num};
824
825 } else {
826 print STDERR "doc::create_named_section couldn't create section ";
827 print STDERR "$mastersection\n";
828 last;
829 }
830 }
831}
832
833# returns a reference to a list of subsections
834sub list_subsections {
835 my $self = shift (@_);
836 my ($section) = @_;
837
838 my $section_ptr = $self->_lookup_section ($section);
839 if (!defined $section_ptr) {
840 print STDERR "doc::list_subsections couldn't find section $section\n";
841 return [];
842 }
843
844 return [@{$section_ptr->{'subsection_order'}}];
845}
846
847sub delete_section {
848 my $self = shift (@_);
849 my ($section) = @_;
850
851# my $section_ptr = {'subsection_order'=>[],
852# 'next_subsection'=>1,
853# 'subsections'=>{},
854# 'metadata'=>[],
855# 'text'=>""};
856
857 # if this is the top section reset everything
858 if ($section eq "") {
859 $self->{'subsection_order'} = [];
860 $self->{'subsections'} = {};
861 $self->{'metadata'} = [];
862 $self->{'text'} = "";
863 return;
864 }
865
866 # find the parent of the section to delete
867 my $parent_section = "";
868 my $child = "0";
869 my @section = split (/\./, $section);
870 if (scalar(@section) > 0) {
871 $child = pop (@section);
872 $parent_section = join (".", @section);
873 }
874
875 my $parent_section_ptr = $self->_lookup_section($parent_section);
876 if (!defined $parent_section_ptr) {
877 print STDERR "doc::delete_section couldn't find parent section " .
878 "$parent_section\n";
879 return;
880 }
881
882 # remove this section from the subsection_order list
883 my $i = 0;
884 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
885 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
886 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
887 last;
888 }
889 $i++;
890 }
891
892 # remove this section from the subsection hash
893 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
894 undef $parent_section_ptr->{'subsections'}->{$child};
895 }
896}
897
898#--
899# methods for dealing with metadata
900
901# set_metadata_element and get_metadata_element are for metadata
902# which should only have one value. add_meta_data and get_metadata
903# are for metadata which can have more than one value.
904
905# returns the first metadata value which matches field
906
907# This version of get metadata element works much like the one above,
908# except it allows for the namespace portion of a metadata element to
909# be ignored, thus if you are searching for dc.Title, the first piece
910# of matching metadata ending with the name Title (once any namespace
911# is removed) would be returned.
912# 28-11-2003 John Thompson
913sub get_metadata_element {
914 my $self = shift (@_);
915 my ($section, $field, $ignore_namespace) = @_;
916 my ($data);
917
918 $ignore_namespace = 0 unless defined $ignore_namespace;
919
920 my $section_ptr = $self->_lookup_section($section);
921 if (!defined $section_ptr) {
922 print STDERR "doc::get_metadata_element couldn't find section " .
923 "$section\n";
924 return;
925 }
926
927 # Remove the any namespace if we are being told to ignore them
928 if($ignore_namespace) {
929 $field =~ s/^\w*\.//;
930 }
931
932 foreach $data (@{$section_ptr->{'metadata'}}) {
933
934 my $data_name = $data->[0];
935 # Remove the any namespace if we are being told to ignore them
936 if($ignore_namespace) {
937 $data_name =~ s/^\w*\.//;
938 }
939
940 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
941 }
942
943 return undef; # was not found
944}
945
946# returns a list of the form [value1, value2, ...]
947sub get_metadata {
948 my $self = shift (@_);
949 my ($section, $field, $ignore_namespace) = @_;
950 my ($data);
951
952 $ignore_namespace = 0 unless defined $ignore_namespace;
953
954 my $section_ptr = $self->_lookup_section($section);
955 if (!defined $section_ptr) {
956 print STDERR "doc::get_metadata couldn't find section " .
957 "$section\n";
958 return;
959 }
960
961 # Remove the any namespace if we are being told to ignore them
962 if($ignore_namespace) {
963 $field =~ s/^\w*\.//;
964 }
965
966 my @metadata = ();
967 foreach $data (@{$section_ptr->{'metadata'}}) {
968
969 my $data_name = $data->[0];
970 # Remove the any namespace if we are being told to ignore them
971 if($ignore_namespace) {
972 $data_name =~ s/^\w*\.//;
973 }
974
975 push (@metadata, $data->[1]) if ($data_name eq $field);
976 }
977
978 return \@metadata;
979}
980
981# returns a list of the form [[field,value],[field,value],...]
982sub get_all_metadata {
983 my $self = shift (@_);
984 my ($section) = @_;
985
986 my $section_ptr = $self->_lookup_section($section);
987 if (!defined $section_ptr) {
988 print STDERR "doc::get_all_metadata couldn't find section " .
989 "$section\n";
990 return;
991 }
992
993 return $section_ptr->{'metadata'};
994}
995
996# $value is optional
997sub delete_metadata {
998 my $self = shift (@_);
999 my ($section, $field, $value) = @_;
1000
1001 my $section_ptr = $self->_lookup_section($section);
1002 if (!defined $section_ptr) {
1003 print STDERR "doc::delete_metadata couldn't find section " .
1004 "$section\n";
1005 return;
1006 }
1007
1008 my $i = 0;
1009 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1010 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1011 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1012 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1013 } else {
1014 $i++;
1015 }
1016 }
1017}
1018
1019sub delete_all_metadata {
1020 my $self = shift (@_);
1021 my ($section) = @_;
1022
1023 my $section_ptr = $self->_lookup_section($section);
1024 if (!defined $section_ptr) {
1025 print STDERR "doc::delete_all_metadata couldn't find section " .
1026 "$section\n";
1027 return;
1028 }
1029
1030 $section_ptr->{'metadata'} = [];
1031}
1032
1033sub set_metadata_element {
1034 my $self = shift (@_);
1035 my ($section, $field, $value) = @_;
1036
1037 $self->set_utf8_metadata_element ($section, $field,
1038 &unicode::ascii2utf8(\$value));
1039}
1040
1041# set_utf8_metadata_element assumes the text has already been
1042# converted to the UTF-8 encoding.
1043sub set_utf8_metadata_element {
1044 my $self = shift (@_);
1045 my ($section, $field, $value) = @_;
1046
1047 $self->delete_metadata ($section, $field);
1048 $self->add_utf8_metadata ($section, $field, $value);
1049}
1050
1051
1052# add_metadata assumes the text is in (extended) ascii form. For
1053# text which hash been already converted to the UTF-8 format use
1054# add_utf8_metadata.
1055sub add_metadata {
1056 my $self = shift (@_);
1057 my ($section, $field, $value) = @_;
1058
1059 $self->add_utf8_metadata ($section, $field,
1060 &unicode::ascii2utf8(\$value));
1061}
1062
1063sub add_utf8_metadata {
1064 my $self = shift (@_);
1065 my ($section, $field, $value) = @_;
1066
1067 my $section_ptr = $self->_lookup_section($section);
1068 if (!defined $section_ptr) {
1069 print STDERR "doc::add_utf8_metadata couldn't find section " .
1070 "$section\n";
1071 return;
1072 }
1073 if (!defined $value) {
1074 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1075 return;
1076 }
1077 if (!defined $field) {
1078 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1079 return;
1080 }
1081
1082 # double check that the value is utf-8
1083 if (unicode::ensure_utf8(\$value)) {
1084 print STDERR "doc::add_utf8_metadata: warning: '$field' wasn't utf8\n";
1085 }
1086
1087 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1088}
1089
1090
1091# methods for dealing with text
1092
1093# returns the text for a section
1094sub get_text {
1095 my $self = shift (@_);
1096 my ($section) = @_;
1097
1098 my $section_ptr = $self->_lookup_section($section);
1099 if (!defined $section_ptr) {
1100 print STDERR "doc::get_text couldn't find section " .
1101 "$section\n";
1102 return "";
1103 }
1104
1105 return $section_ptr->{'text'};
1106}
1107
1108# returns the (utf-8 encoded) length of the text for a section
1109sub get_text_length {
1110 my $self = shift (@_);
1111 my ($section) = @_;
1112
1113 my $section_ptr = $self->_lookup_section($section);
1114 if (!defined $section_ptr) {
1115 print STDERR "doc::get_text_length couldn't find section " .
1116 "$section\n";
1117 return 0;
1118 }
1119
1120 return length ($section_ptr->{'text'});
1121}
1122
1123sub delete_text {
1124 my $self = shift (@_);
1125 my ($section) = @_;
1126
1127 my $section_ptr = $self->_lookup_section($section);
1128 if (!defined $section_ptr) {
1129 print STDERR "doc::delete_text couldn't find section " .
1130 "$section\n";
1131 return;
1132 }
1133
1134 $section_ptr->{'text'} = "";
1135}
1136
1137# add_text assumes the text is in (extended) ascii form. For
1138# text which has been already converted to the UTF-8 format
1139# use add_utf8_text.
1140sub add_text {
1141 my $self = shift (@_);
1142 my ($section, $text) = @_;
1143
1144 # convert the text to UTF-8 encoded unicode characters
1145 # and add the text
1146 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1147}
1148
1149
1150# add_utf8_text assumes the text to be added has already
1151# been converted to the UTF-8 encoding. For ascii text use
1152# add_text
1153sub add_utf8_text {
1154 my $self = shift (@_);
1155 my ($section, $text) = @_;
1156
1157 my $section_ptr = $self->_lookup_section($section);
1158 if (!defined $section_ptr) {
1159 print STDERR "doc::add_utf8_text couldn't find section " .
1160 "$section\n";
1161 return;
1162 }
1163
1164 $section_ptr->{'text'} .= $text;
1165}
1166
1167
1168# methods for dealing with associated files
1169
1170# a file is associated with a document, NOT a section.
1171# if section is defined it is noted in the data structure
1172# only so that files associated from a particular section
1173# may be removed later (using delete_section_assoc_files)
1174sub associate_file {
1175 my $self = shift (@_);
1176 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1177 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1178
1179 # remove all associated files with the same name
1180 $self->delete_assoc_file ($assoc_filename);
1181
1182 push (@{$self->{'associated_files'}},
1183 [$real_filename, $assoc_filename, $mime_type, $section]);
1184}
1185
1186# returns a list of associated files in the form
1187# [[real_filename, assoc_filename, mimetype], ...]
1188sub get_assoc_files {
1189 my $self = shift (@_);
1190
1191 return $self->{'associated_files'};
1192}
1193
1194sub delete_section_assoc_files {
1195 my $self = shift (@_);
1196 my ($section) = @_;
1197
1198 my $i=0;
1199 while ($i < scalar (@{$self->{'associated_files'}})) {
1200 if (defined $self->{'associated_files'}->[$i]->[3] &&
1201 $self->{'associated_files'}->[$i]->[3] eq $section) {
1202 splice (@{$self->{'associated_files'}}, $i, 1);
1203 } else {
1204 $i++;
1205 }
1206 }
1207}
1208
1209sub delete_assoc_file {
1210 my $self = shift (@_);
1211 my ($assoc_filename) = @_;
1212
1213 my $i=0;
1214 while ($i < scalar (@{$self->{'associated_files'}})) {
1215 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1216 splice (@{$self->{'associated_files'}}, $i, 1);
1217 } else {
1218 $i++;
1219 }
1220 }
1221}
1222
1223sub reset_nextsection_ptr {
1224 my $self = shift (@_);
1225 my ($section) = @_;
1226
1227 my $section_ptr = $self->_lookup_section($section);
1228 $section_ptr->{'next_subsection'} = 1;
1229}
1230
12311;
Note: See TracBrowser for help on using the repository browser.