source: main/tags/2.52/gsdl/perllib/doc.pm@ 25422

Last change on this file since 25422 was 8220, checked in by cs025, 20 years ago

Extensions to underpin OAI - e.g. creation of the OAI classifier, adding
modified file dates and ensuring that documents know the parent classifiers
to which they belong.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 32.8 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use unicode;
37use util;
38use ghtml;
39use File::stat;
40##use hashdoc;
41
42# the document type may be indexed_doc, nonindexed_doc, or
43# classification
44
45my $OIDcount = 0;
46
47sub new {
48 my $class = shift (@_);
49 my ($source_filename, $doc_type) = @_;
50
51 my $self = bless {'associated_files'=>[],
52 'subsection_order'=>[],
53 'next_subsection'=>1,
54 'subsections'=>{},
55 'metadata'=>[],
56 'text'=>"",
57 'OIDtype'=>"hash"}, $class;
58
59 #GRB: Added code to set lastmodified for OAI purposes
60 if ((defined $doc_type) && (defined $source_filename) && (-e $source_filename)) {
61 my $file_stat = stat($source_filename);
62 my $mtime = $file_stat->mtime;
63 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
64 }
65 #GRB: end inserted code
66
67 if (defined $source_filename) {
68 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
69
70 if (defined $collect_dir) {
71 my $dirsep = &util::get_dirsep();
72 if ($collect_dir !~ m/$dirsep$/) {
73 $collect_dir .= $dirsep;
74 }
75
76 $collect_dir =~ s/\\/\\\\/g; # escape DOS style file separator
77
78 if ($source_filename =~ /^$collect_dir(.*)$/) {
79 $source_filename = $1;
80 }
81 }
82
83 $self->set_source_filename ($source_filename);
84 }
85
86 $self->set_doc_type ($doc_type) if defined $doc_type;
87
88 return $self;
89}
90
91# clone the $self object
92sub duplicate {
93 my $self = shift (@_);
94
95 my $newobj = {};
96
97 foreach $k (keys %$self) {
98 $newobj->{$k} = &clone ($self->{$k});
99 }
100
101 bless $newobj, ref($self);
102 return $newobj;
103}
104
105sub clone {
106 my ($from) = @_;
107 my $type = ref ($from);
108
109 if ($type eq "HASH") {
110 my $to = {};
111 foreach $key (keys %$from) {
112 $to->{$key} = &clone ($from->{$key});
113 }
114 return $to;
115 } elsif ($type eq "ARRAY") {
116 my $to = [];
117 foreach $v (@$from) {
118 push (@$to, &clone ($v));
119 }
120 return $to;
121 } else {
122 return $from;
123 }
124}
125
126sub set_OIDtype {
127 my $self = shift (@_);
128 my ($type) = @_;
129
130 if ($type eq "incremental") {
131 $self->{'OIDtype'} = $type;
132 } else {
133 $self->{'OIDtype'} = "hash";
134 }
135}
136
137sub set_source_filename {
138 my $self = shift (@_);
139 my ($source_filename) = @_;
140
141 $self->set_metadata_element ($self->get_top_section(),
142 "gsdlsourcefilename",
143 $source_filename);
144}
145
146sub set_converted_filename {
147 my $self = shift (@_);
148 my ($converted_filename) = @_;
149
150 $self->set_metadata_element ($self->get_top_section(),
151 "gsdlconvertedfilename",
152 $converted_filename);
153}
154
155
156# returns the source_filename as it was provided
157sub get_source_filename {
158 my $self = shift (@_);
159
160 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
161}
162
163# returns converted filename if available else returns source filename
164sub get_filename_for_hashing {
165 my $self = shift (@_);
166
167 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
168
169 if (!defined $filename) {
170 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
171 }
172 return $filename;
173}
174
175sub set_doc_type {
176 my $self = shift (@_);
177 my ($doc_type) = @_;
178
179 $self->set_metadata_element ($self->get_top_section(),
180 "gsdldoctype",
181 $doc_type);
182}
183
184# returns the source_filename as it was provided
185# the default of "indexed_doc" is used if no document
186# type was provided
187sub get_doc_type {
188 my $self = shift (@_);
189
190 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
191 return $doc_type if (defined $doc_type);
192 return "indexed_doc";
193}
194
195sub _escape_text {
196 my ($text) = @_;
197
198 # special characters in the gml encoding
199 $text =~ s/&/&/g; # this has to be first...
200 $text =~ s/</&lt;/g;
201 $text =~ s/>/&gt;/g;
202 $text =~ s/\"/&quot;/g;
203
204 return $text;
205}
206
207sub buffer_section_xml {
208 my $self = shift (@_);
209 my ($section) = @_;
210
211 my $section_ptr = $self->_lookup_section ($section);
212 return "" unless defined $section_ptr;
213
214 my $all_text = "<Section>\n";
215 $all_text .= " <Description>\n";
216
217 # output metadata
218 foreach my $data (@{$section_ptr->{'metadata'}}) {
219 my $escaped_value = &_escape_text($data->[1]);
220 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
221 }
222
223 $all_text .= " </Description>\n";
224
225 # output the text
226 $all_text .= " <Content>";
227 $all_text .= &_escape_text($section_ptr->{'text'});
228 $all_text .= "</Content>\n";
229
230 # output all the subsections
231 foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
232 $all_text .= $self->buffer_section_xml("$section.$subsection");
233 }
234
235 $all_text .= "</Section>\n";
236
237 # make sure no nasty control characters have snuck through
238 # (XML::Parser will barf on anything it doesn't consider to be
239 # valid UTF-8 text, including things like \c@, \cC etc.)
240 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
241
242 return $all_text;
243}
244
245sub buffer_txt_section_xml {
246 my $self = shift(@_);
247 my ($section) = @_;
248
249 my $section_ptr = $self->_lookup_section ($section);
250
251 return "" unless defined $section_ptr;
252
253 my $all_text = "<Section>\n";
254
255 ##output the text
256 #$all_text .= " <Content>";
257 $all_text .= &_escape_text($section_ptr->{'text'});
258 #$all_text .= " </Content>\n";
259
260
261 #output all the subsections
262 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
263 $all_text .= $self->buffer_txt_section_xml("$section.$subsection");
264 }
265
266 $all_text .= "</Section>\n";
267
268
269 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
270 return $all_text;
271}
272
273sub buffer_mets_fileSection_section_xml() {
274 my $self = shift(@_);
275 my ($section, $doc_Dir) = @_;
276
277 $section="" unless defined $section;
278
279
280 my $section_ptr=$self->_lookup_section($section);
281 return "" unless defined $section_ptr;
282
283
284 #**output fileSection by sections
285 my $section_num ="1". $section;
286
287
288 my $filePath = $doc_Dir . '/doctxt.xml';
289
290 #**output the fileSection details
291 my $all_text = ' <mets:fileGrp ID="FILEGROUP_PRELUDE' . $section_num . '">'. "\n";
292 $all_text .= ' <mets:file MIMETYPE="text/xml" ID="FILE'.$section_num. '">'. "\n";
293 $all_text .= ' <mets:FLocate LOCTYPE="URL" xlink:href="file:'.$filePath.'#xpointer(/Section[';
294
295 my $xpath = "1".$section;
296
297 $xpath =~ s/\./]\/Section[/g;
298
299 $all_text .= $xpath;
300
301 $all_text .= ']/text())" ID="FILE'. $section_num. '" />' . "\n";
302 $all_text .= " </mets:file>\n";
303 $all_text .= " </mets:fileGrp>\n";
304
305
306 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
307 $all_text .= $self->buffer_mets_fileSection_section_xml("$section.$subsection",$doc_Dir);
308 }
309
310 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
311
312 return $all_text;
313}
314
315sub buffer_mets_fileWhole_section_xml(){
316 my $self = shift(@_);
317 my ($section) = @_;
318
319 my $section_ptr = $self-> _lookup_section($section);
320 return "" unless defined $section_ptr;
321
322 my $all_text="" unless defined $all_txt;
323
324 my ($dirPath)="" unless defined $dirPath;
325 my $fileID=0;
326
327 #** output the fileSection for the whole section
328 #*** get the sourcefile and associative file
329
330 foreach my $data (@{$section_ptr->{'metadata'}}){
331 my $escaped_value = &_escape_text($data->[1]);
332 if ($data->[0] eq "gsdlsourcefilename") {
333 ($dirPath) = $escaped_value =~ m/^(.*)[\/\\][^\/\\]*$/;
334
335 $all_text .= ' <mets:fileGrp ID="default">'."\n";
336 ++$fileID;
337 $all_text .= ' <mets:file MIMETYPE="text/xml" ID="default.'.$fileID.'">'. "\n";
338 $all_text .= ' <mets:FLocate LOCTYPE="URL" xlink:href="file:'.$data->[1].'" ID="default.'.$fileID.'" />'."\n";
339
340 $all_text .= " </mets:file>\n";
341 }
342
343 if ($data->[0] eq "gsdlassocfile"){
344 $escaped_value =~ m/^(.*?):(.*):$/;
345
346 my $assfilePath = $dirPath . '/'. $1;
347 ++$fileID;
348 $all_text .= ' <mets:file MIMETYPE="'.$2.'" ID="default.'.$fileID. '">'. "\n";
349 $all_text .= ' <mets:FLocate LOCTYPE="URL" xlink:href="file:'.$assfilePath.'" ID="default.'. $fileID.'" />'."\n";
350
351 $all_text .= " </mets:file>\n";
352 }
353 }
354 $all_text .= " </mets:fileGrp>\n";
355
356
357 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
358
359 return $all_text;
360}
361
362sub buffer_mets_StruMapSection_section_xml(){
363 my $self = shift(@_);
364 my ($section, $order_numref) = @_;
365
366 $section="" unless defined $section;
367
368
369 my $section_ptr=$self->_lookup_section($section);
370 return "" unless defined $section_ptr;
371
372 #**output fileSection by sections
373 my $section_num ="1". $section;
374
375 #**output the StruMap details
376
377 my $all_text = ' <mets:div ID="DS'. $section_num .'" TYPE="Section" ORDER="'.$$order_numref++.'" ORDERLABEL="'. $section_num .'" LABEL="';
378 $all_text .= $section_num . '" DMDID="DM'.$section_num.'">'. "\n";
379
380 $all_text .= ' <mets:fptr FILEID="FILEGROUP_PRELUDE'.$section_num.'" />'. "\n";
381
382
383 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
384 $all_text .= $self->buffer_mets_StruMapSection_section_xml("$section.$subsection", $order_numref);
385 }
386
387 $all_text .= " </mets:div>\n";
388
389 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
390
391 return $all_text;
392}
393
394
395sub buffer_mets_StruMapWhole_section_xml(){
396 my $self = shift(@_);
397 my ($section) = @_;
398
399 my $section_ptr = $self-> _lookup_section($section);
400 return "" unless defined $section_ptr;
401
402 my $all_text="" unless defined $all_txt;
403 my $fileID=0;
404
405 $all_text .= '<mets:structMap ID="All" TYPE="Whole Document" LABEL="All">'."\n";
406 $all_text .= ' <mets:div ID="All" TYPE="Document" ORDER="All" ORDERLABEL="All" LABEL="Whole Documemt" DMDID="DM1">' . "\n";
407
408
409 #** output the StruMapSection for the whole section
410 #*** get the sourcefile and associative file
411
412 foreach my $data (@{$section_ptr->{'metadata'}}){
413 my $escaped_value = &_escape_text($data->[1]);
414
415 if ($data->[0] eq "gsdlsourcefilename") {
416 ++$fileID;
417 $all_text .= ' <mets:fptr FILEID="default.'.$fileID.'" />'."\n";
418 }
419
420 if ($data->[0] eq "gsdlassocfile"){
421 ++$fileID;
422 $all_text .= ' <mets:fptr FILEID="default.'.$fileID. '" />'. "\n";
423 }
424 }
425 $all_text .= " </mets:div>\n";
426
427 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
428
429 return $all_text;
430}
431
432
433sub buffer_mets_dmdSection_section_xml(){
434 my $self = shift(@_);
435 my ($section) = @_;
436
437 $section="" unless defined $section;
438
439 my $section_ptr=$self->_lookup_section($section);
440 return "" unless defined $section_ptr;
441
442 #***convert section number
443 my $section_num ="1". $section;
444
445 #**output the dmdSection details
446 my $all_text = '<mets:dmdSec ID="DM'.$section_num.'" GROUPID="'.$section_num.'">'. "\n";
447 $all_text .= ' <mets:mdWrap MDType="gsdl" ID="'.$section_num.'">'."\n";
448 $all_text .= " <mets:xmlData>\n";
449 foreach my $data (@{$section_ptr->{'metadata'}}){
450 my $escaped_value = &_escape_text($data->[1]);
451 $all_text .= ' <gsdl:Metadata name="'. $data->[0].'">'. $escaped_value. "</gsdl:Metadata>\n";
452 }
453 $all_text .= " </mets:xmlData>\n";
454 $all_text .= " </mets:mdWrap>\n";
455 $all_text .= "</mets:dmdSec>\n";
456
457 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
458 $all_text .= $self->buffer_mets_dmdSection_section_xml("$section.$subsection");
459 }
460
461 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
462
463 return $all_text;
464}
465
466sub output_section {
467 my $self = shift (@_);
468 my ($handle, $section) = @_;
469
470 print $handle $self->buffer_section_xml($section);
471}
472
473
474#*** print out doctxt.xml file
475sub output_txt_section {
476 my $self = shift (@_);
477 my ($handle, $section) = @_;
478
479 print $handle $self->buffer_txt_section_xml($section);
480}
481
482#*** print out docmets.xml file
483sub output_mets_section {
484 my $self = shift(@_);
485 my ($handle, $section, $doc_Dir) = @_;
486
487 #***print out the dmdSection
488 print $handle $self->buffer_mets_dmdSection_section_xml($section);
489
490 #***print out the fileSection by sections
491 print $handle "<mets:fileSec>\n";
492 print $handle $self->buffer_mets_fileSection_section_xml($section,$doc_Dir);
493 #***print out the whole fileSection
494 print $handle $self->buffer_mets_fileWhole_section_xml($section);
495 print $handle "</mets:fileSec>\n";
496
497 #***print out the StruMapSection by sections
498 print $handle '<mets:structMap ID="Section" TYPE="Section" LABEL="Section">' . "\n";
499 my $order_num=0;
500 print $handle $self->buffer_mets_StruMapSection_section_xml($section, \$order_num);
501 print $handle "</mets:structMap>\n";
502 print $handle $self->buffer_mets_StruMapWhole_section_xml($section);
503 print $handle "</mets:structMap>\n";
504}
505
506
507# look up the reference to the a particular section
508sub _lookup_section {
509 my $self = shift (@_);
510 my ($section) = @_;
511
512 my ($num);
513 my $sectionref = $self;
514
515 while (defined $section && $section ne "") {
516 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
517 $num =~ s/^0+(\d)/$1/; # remove leading 0s
518 $section = "" unless defined $section;
519
520 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
521 $sectionref = $sectionref->{'subsections'}->{$num};
522 } else {
523 return undef;
524 }
525 }
526
527 return $sectionref;
528}
529
530# calculate OID by hashing the contents of the document
531sub _calc_OID {
532 my $self = shift (@_);
533 my ($filename) = @_;
534
535 my $osexe = &util::get_os_exe();
536
537 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
538 $ENV{'GSDLOS'},"hashfile$osexe");
539 my $result = "NULL";
540
541 if (-e "$hashfile_exe") {
542# $result = `\"$hashfile_exe\" \"$filename\"`;
543 $result = `hashfile$osexe \"$filename\"`;
544 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
545
546 } else {
547 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
548 }
549
550 return "HASH$result";
551}
552
553# methods dealing with OID, not groups of them.
554
555# if $OID is not provided one is calculated
556sub set_OID {
557 my $self = shift (@_);
558 my ($OID) = @_;
559
560 # if an OID wasn't provided claculate one
561 if (!defined $OID) {
562 $OID = "NULL";
563
564 if ($self->{'OIDtype'} eq "incremental") {
565 $OID = "D" . $OIDcount;
566 $OIDcount ++;
567
568 } else {
569 # "hash" OID - feed file to hashfile.exe
570 #my $filename = $self->get_source_filename();
571 # we want to use the converted file for hashing if available
572 # cos its quicker
573 my $filename = $self->get_filename_for_hashing();
574 if (defined($filename) && -e $filename) {
575 $OID = $self->_calc_OID ($filename);
576 } else {
577 $filename = &util::get_tmp_filename();
578 if (!open (OUTFILE, ">$filename")) {
579 print STDERR "doc::set_OID could not write to $filename\n";
580 } else {
581 $self->output_section('OUTFILE', $self->get_top_section(),
582 undef, 1);
583 close (OUTFILE);
584 }
585
586 $OID = $self->_calc_OID ($filename);
587 &util::rm ($filename);
588 }
589 }
590 }
591 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
592}
593
594# this uses hashdoc (embedded c thingy) which is faster but still
595# needs a little work to be suffiently stable
596sub ___set_OID {
597 my $self = shift (@_);
598 my ($OID) = @_;
599
600 # if an OID wasn't provided then calculate hash value based on document
601 if (!defined $OID)
602 {
603 my $hash_text = $self->buffer_section_gml($self->get_top_section(),
604 undef, 1);
605 my $hash_len = length($hash_text);
606
607 $OID = &hashdoc::buffer($hash_text,$hash_len);
608 }
609
610 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
611}
612
613# returns the OID for this document
614sub get_OID {
615 my $self = shift (@_);
616 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
617 return $OID if (defined $OID);
618 return "NULL";
619}
620
621sub delete_OID {
622 my $self = shift (@_);
623
624 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
625}
626
627
628# methods for manipulating section names
629
630# returns the name of the top-most section (the top
631# level of the document
632sub get_top_section {
633 my $self = shift (@_);
634
635 return "";
636}
637
638# returns a section
639sub get_parent_section {
640 my $self = shift (@_);
641 my ($section) = @_;
642
643 $section =~ s/(^|\.)\d+$//;
644
645 return $section;
646}
647
648# returns the first child section (or the end child
649# if there isn't any)
650sub get_begin_child {
651 my $self = shift (@_);
652 my ($section) = @_;
653
654 my $section_ptr = $self->_lookup_section($section);
655 return "" unless defined $section_ptr;
656
657 if (defined $section_ptr->{'subsection_order'}->[0]) {
658 return "$section.$section_ptr->{'subsection_order'}->[0]";
659 }
660
661 return $self->get_end_child ($section);
662}
663
664# returns the next child of a parent section
665sub get_next_child {
666 my $self = shift (@_);
667 my ($section) = @_;
668
669 my $parent_section = $self->get_parent_section($section);
670 my $parent_section_ptr = $self->_lookup_section($parent_section);
671 return undef unless defined $parent_section_ptr;
672
673 my ($section_num) = $section =~ /(\d+)$/;
674 return undef unless defined $section_num;
675
676 my $i = 0;
677 my $section_order = $parent_section_ptr->{'subsection_order'};
678 while ($i < scalar(@$section_order)) {
679 last if $section_order->[$i] eq $section_num;
680 $i++;
681 }
682
683 $i++; # the next child
684 if ($i < scalar(@$section_order)) {
685 return $section_order->[$i] if $parent_section eq "";
686 return "$parent_section.$section_order->[$i]";
687 }
688
689 # no more sections in this level
690 return undef;
691}
692
693# returns a reference to a list of children
694sub get_children {
695 my $self = shift (@_);
696 my ($section) = @_;
697
698 my $section_ptr = $self->_lookup_section($section);
699 return [] unless defined $section_ptr;
700
701 my @children = @{$section_ptr->{'subsection_order'}};
702
703 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
704 return \@children;
705}
706
707# returns the child section one past the last one (which
708# is coded as "0")
709sub get_end_child {
710 my $self = shift (@_);
711 my ($section) = @_;
712
713 return $section . ".0" unless $section eq "";
714 return "0";
715}
716
717# returns the next section in book order
718sub get_next_section {
719 my $self = shift (@_);
720 my ($section) = @_;
721
722 return undef unless defined $section;
723
724 my $section_ptr = $self->_lookup_section($section);
725 return undef unless defined $section_ptr;
726
727 # first try to find first child
728 if (defined $section_ptr->{'subsection_order'}->[0]) {
729 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
730 return "$section.$section_ptr->{'subsection_order'}->[0]";
731 }
732
733 do {
734 # try to find sibling
735 my $next_child = $self->get_next_child ($section);
736 return $next_child if (defined $next_child);
737
738 # move up one level
739 $section = $self->get_parent_section ($section);
740 } while $section =~ /\d/;
741
742 return undef;
743}
744
745sub is_leaf_section {
746 my $self = shift (@_);
747 my ($section) = @_;
748
749 my $section_ptr = $self->_lookup_section($section);
750 return 1 unless defined $section_ptr;
751
752 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
753}
754
755# methods for dealing with sections
756
757# returns the name of the inserted section
758sub insert_section {
759 my $self = shift (@_);
760 my ($before_section) = @_;
761
762 # get the child to insert before and its parent section
763 my $parent_section = "";
764 my $before_child = "0";
765 my @before_section = split (/\./, $before_section);
766 if (scalar(@before_section) > 0) {
767 $before_child = pop (@before_section);
768 $parent_section = join (".", @before_section);
769 }
770
771 my $parent_section_ptr = $self->_lookup_section($parent_section);
772 if (!defined $parent_section_ptr) {
773 print STDERR "doc::insert_section couldn't find parent section " .
774 "$parent_section\n";
775 return;
776 }
777
778 # get the next section number
779 my $section_num = $parent_section_ptr->{'next_subsection'}++;
780
781 my $i = 0;
782 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
783 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
784 $i++;
785 }
786
787 # insert the section number into the order list
788 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
789
790 # add this section to the parent section
791 my $section_ptr = {'subsection_order'=>[],
792 'next_subsection'=>1,
793 'subsections'=>{},
794 'metadata'=>[],
795 'text'=>""};
796 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
797
798 # work out the full section number
799 my $section = $parent_section;
800 $section .= "." unless $section eq "";
801 $section .= $section_num;
802
803 return $section;
804}
805
806# creates a pre-named section
807sub create_named_section {
808 my $self = shift (@_);
809 my ($mastersection) = @_;
810
811 my ($num);
812 my $section = $mastersection;
813 my $sectionref = $self;
814
815 while ($section ne "") {
816 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
817 $num =~ s/^0+(\d)/$1/; # remove leading 0s
818 $section = "" unless defined $section;
819
820 if (defined $num) {
821 if (!defined $sectionref->{'subsections'}->{$num}) {
822 push (@{$sectionref->{'subsection_order'}}, $num);
823 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
824 'next_subsection'=>1,
825 'subsections'=>{},
826 'metadata'=>[],
827 'text'=>""};
828 if ($num >= $sectionref->{'next_subsection'}) {
829 $sectionref->{'next_subsection'} = $num + 1;
830 }
831 }
832 $sectionref = $sectionref->{'subsections'}->{$num};
833
834 } else {
835 print STDERR "doc::create_named_section couldn't create section ";
836 print STDERR "$mastersection\n";
837 last;
838 }
839 }
840}
841
842# returns a reference to a list of subsections
843sub list_subsections {
844 my $self = shift (@_);
845 my ($section) = @_;
846
847 my $section_ptr = $self->_lookup_section ($section);
848 if (!defined $section_ptr) {
849 print STDERR "doc::list_subsections couldn't find section $section\n";
850 return [];
851 }
852
853 return [@{$section_ptr->{'subsection_order'}}];
854}
855
856sub delete_section {
857 my $self = shift (@_);
858 my ($section) = @_;
859
860# my $section_ptr = {'subsection_order'=>[],
861# 'next_subsection'=>1,
862# 'subsections'=>{},
863# 'metadata'=>[],
864# 'text'=>""};
865
866 # if this is the top section reset everything
867 if ($section eq "") {
868 $self->{'subsection_order'} = [];
869 $self->{'subsections'} = {};
870 $self->{'metadata'} = [];
871 $self->{'text'} = "";
872 return;
873 }
874
875 # find the parent of the section to delete
876 my $parent_section = "";
877 my $child = "0";
878 my @section = split (/\./, $section);
879 if (scalar(@section) > 0) {
880 $child = pop (@section);
881 $parent_section = join (".", @section);
882 }
883
884 my $parent_section_ptr = $self->_lookup_section($parent_section);
885 if (!defined $parent_section_ptr) {
886 print STDERR "doc::delete_section couldn't find parent section " .
887 "$parent_section\n";
888 return;
889 }
890
891 # remove this section from the subsection_order list
892 my $i = 0;
893 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
894 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
895 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
896 last;
897 }
898 $i++;
899 }
900
901 # remove this section from the subsection hash
902 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
903 undef $parent_section_ptr->{'subsections'}->{$child};
904 }
905}
906
907#--
908# methods for dealing with metadata
909
910# set_metadata_element and get_metadata_element are for metadata
911# which should only have one value. add_meta_data and get_metadata
912# are for metadata which can have more than one value.
913
914# returns the first metadata value which matches field
915
916# This version of get metadata element works much like the one above,
917# except it allows for the namespace portion of a metadata element to
918# be ignored, thus if you are searching for dc.Title, the first piece
919# of matching metadata ending with the name Title (once any namespace
920# is removed) would be returned.
921# 28-11-2003 John Thompson
922sub get_metadata_element {
923 my $self = shift (@_);
924 my ($section, $field, $ignore_namespace) = @_;
925 my ($data);
926
927 $ignore_namespace = 0 unless defined $ignore_namespace;
928
929 my $section_ptr = $self->_lookup_section($section);
930 if (!defined $section_ptr) {
931 print STDERR "doc::get_metadata_element couldn't find section " .
932 "$section\n";
933 return;
934 }
935
936 # Remove the any namespace if we are being told to ignore them
937 if($ignore_namespace) {
938 $field =~ s/^\w*\.//;
939 }
940
941 foreach $data (@{$section_ptr->{'metadata'}}) {
942
943 my $data_name = $data->[0];
944 # Remove the any namespace if we are being told to ignore them
945 if($ignore_namespace) {
946 $data_name =~ s/^\w*\.//;
947 }
948
949 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
950 }
951
952 return undef; # was not found
953}
954
955# returns a list of the form [value1, value2, ...]
956sub get_metadata {
957 my $self = shift (@_);
958 my ($section, $field, $ignore_namespace) = @_;
959 my ($data);
960
961 $ignore_namespace = 0 unless defined $ignore_namespace;
962
963 my $section_ptr = $self->_lookup_section($section);
964 if (!defined $section_ptr) {
965 print STDERR "doc::get_metadata couldn't find section " .
966 "$section\n";
967 return;
968 }
969
970 # Remove the any namespace if we are being told to ignore them
971 if($ignore_namespace) {
972 $field =~ s/^\w*\.//;
973 }
974
975 my @metadata = ();
976 foreach $data (@{$section_ptr->{'metadata'}}) {
977
978 my $data_name = $data->[0];
979 # Remove the any namespace if we are being told to ignore them
980 if($ignore_namespace) {
981 $data_name =~ s/^\w*\.//;
982 }
983
984 push (@metadata, $data->[1]) if ($data_name eq $field);
985 }
986
987 return \@metadata;
988}
989
990# returns a list of the form [[field,value],[field,value],...]
991sub get_all_metadata {
992 my $self = shift (@_);
993 my ($section) = @_;
994
995 my $section_ptr = $self->_lookup_section($section);
996 if (!defined $section_ptr) {
997 print STDERR "doc::get_all_metadata couldn't find section " .
998 "$section\n";
999 return;
1000 }
1001
1002 return $section_ptr->{'metadata'};
1003}
1004
1005# $value is optional
1006sub delete_metadata {
1007 my $self = shift (@_);
1008 my ($section, $field, $value) = @_;
1009
1010 my $section_ptr = $self->_lookup_section($section);
1011 if (!defined $section_ptr) {
1012 print STDERR "doc::delete_metadata couldn't find section " .
1013 "$section\n";
1014 return;
1015 }
1016
1017 my $i = 0;
1018 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1019 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1020 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1021 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1022 } else {
1023 $i++;
1024 }
1025 }
1026}
1027
1028sub delete_all_metadata {
1029 my $self = shift (@_);
1030 my ($section) = @_;
1031
1032 my $section_ptr = $self->_lookup_section($section);
1033 if (!defined $section_ptr) {
1034 print STDERR "doc::delete_all_metadata couldn't find section " .
1035 "$section\n";
1036 return;
1037 }
1038
1039 $section_ptr->{'metadata'} = [];
1040}
1041
1042sub set_metadata_element {
1043 my $self = shift (@_);
1044 my ($section, $field, $value) = @_;
1045
1046 $self->set_utf8_metadata_element ($section, $field,
1047 &unicode::ascii2utf8(\$value));
1048}
1049
1050# set_utf8_metadata_element assumes the text has already been
1051# converted to the UTF-8 encoding.
1052sub set_utf8_metadata_element {
1053 my $self = shift (@_);
1054 my ($section, $field, $value) = @_;
1055
1056 $self->delete_metadata ($section, $field);
1057 $self->add_utf8_metadata ($section, $field, $value);
1058}
1059
1060
1061# add_metadata assumes the text is in (extended) ascii form. For
1062# text which has already been converted to the UTF-8 format use
1063# add_utf8_metadata.
1064sub add_metadata {
1065 my $self = shift (@_);
1066 my ($section, $field, $value) = @_;
1067
1068 $self->add_utf8_metadata ($section, $field,
1069 &unicode::ascii2utf8(\$value));
1070}
1071
1072sub add_utf8_metadata {
1073 my $self = shift (@_);
1074 my ($section, $field, $value) = @_;
1075
1076 my $section_ptr = $self->_lookup_section($section);
1077 if (!defined $section_ptr) {
1078 print STDERR "doc::add_utf8_metadata couldn't find section " .
1079 "$section\n";
1080 return;
1081 }
1082 if (!defined $value) {
1083 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1084 return;
1085 }
1086 if (!defined $field) {
1087 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1088 return;
1089 }
1090
1091 # double check that the value is utf-8
1092 if (unicode::ensure_utf8(\$value)) {
1093 print STDERR "doc::add_utf8_metadata: warning: '$field' wasn't utf8\n";
1094 }
1095
1096 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1097}
1098
1099
1100# methods for dealing with text
1101
1102# returns the text for a section
1103sub get_text {
1104 my $self = shift (@_);
1105 my ($section) = @_;
1106
1107 my $section_ptr = $self->_lookup_section($section);
1108 if (!defined $section_ptr) {
1109 print STDERR "doc::get_text couldn't find section " .
1110 "$section\n";
1111 return "";
1112 }
1113
1114 return $section_ptr->{'text'};
1115}
1116
1117# returns the (utf-8 encoded) length of the text for a section
1118sub get_text_length {
1119 my $self = shift (@_);
1120 my ($section) = @_;
1121
1122 my $section_ptr = $self->_lookup_section($section);
1123 if (!defined $section_ptr) {
1124 print STDERR "doc::get_text_length couldn't find section " .
1125 "$section\n";
1126 return 0;
1127 }
1128
1129 return length ($section_ptr->{'text'});
1130}
1131
1132sub delete_text {
1133 my $self = shift (@_);
1134 my ($section) = @_;
1135
1136 my $section_ptr = $self->_lookup_section($section);
1137 if (!defined $section_ptr) {
1138 print STDERR "doc::delete_text couldn't find section " .
1139 "$section\n";
1140 return;
1141 }
1142
1143 $section_ptr->{'text'} = "";
1144}
1145
1146# add_text assumes the text is in (extended) ascii form. For
1147# text which has been already converted to the UTF-8 format
1148# use add_utf8_text.
1149sub add_text {
1150 my $self = shift (@_);
1151 my ($section, $text) = @_;
1152
1153 # convert the text to UTF-8 encoded unicode characters
1154 # and add the text
1155 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1156}
1157
1158
1159# add_utf8_text assumes the text to be added has already
1160# been converted to the UTF-8 encoding. For ascii text use
1161# add_text
1162sub add_utf8_text {
1163 my $self = shift (@_);
1164 my ($section, $text) = @_;
1165
1166 my $section_ptr = $self->_lookup_section($section);
1167 if (!defined $section_ptr) {
1168 print STDERR "doc::add_utf8_text couldn't find section " .
1169 "$section\n";
1170 return;
1171 }
1172
1173 $section_ptr->{'text'} .= $text;
1174}
1175
1176
1177# methods for dealing with associated files
1178
1179# a file is associated with a document, NOT a section.
1180# if section is defined it is noted in the data structure
1181# only so that files associated from a particular section
1182# may be removed later (using delete_section_assoc_files)
1183sub associate_file {
1184 my $self = shift (@_);
1185 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1186 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1187
1188 # remove all associated files with the same name
1189 $self->delete_assoc_file ($assoc_filename);
1190
1191 push (@{$self->{'associated_files'}},
1192 [$real_filename, $assoc_filename, $mime_type, $section]);
1193}
1194
1195# returns a list of associated files in the form
1196# [[real_filename, assoc_filename, mimetype], ...]
1197sub get_assoc_files {
1198 my $self = shift (@_);
1199
1200 return $self->{'associated_files'};
1201}
1202
1203sub delete_section_assoc_files {
1204 my $self = shift (@_);
1205 my ($section) = @_;
1206
1207 my $i=0;
1208 while ($i < scalar (@{$self->{'associated_files'}})) {
1209 if (defined $self->{'associated_files'}->[$i]->[3] &&
1210 $self->{'associated_files'}->[$i]->[3] eq $section) {
1211 splice (@{$self->{'associated_files'}}, $i, 1);
1212 } else {
1213 $i++;
1214 }
1215 }
1216}
1217
1218sub delete_assoc_file {
1219 my $self = shift (@_);
1220 my ($assoc_filename) = @_;
1221
1222 my $i=0;
1223 while ($i < scalar (@{$self->{'associated_files'}})) {
1224 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1225 splice (@{$self->{'associated_files'}}, $i, 1);
1226 } else {
1227 $i++;
1228 }
1229 }
1230}
1231
1232sub reset_nextsection_ptr {
1233 my $self = shift (@_);
1234 my ($section) = @_;
1235
1236 my $section_ptr = $self->_lookup_section($section);
1237 $section_ptr->{'next_subsection'} = 1;
1238}
1239
12401;
Note: See TracBrowser for help on using the repository browser.