source: trunk/gsdl/perllib/doc.pm@ 9241

Last change on this file since 9241 was 9241, checked in by davidb, 19 years ago

Incorrect parsing of gsdlassocfile in METS code. Can have a directory
after final ":" but code had assumed this would be empty. Now fixed.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 36.3 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use unicode;
37use util;
38use ghtml;
39use File::stat;
40##use hashdoc;
41
42# the document type may be indexed_doc, nonindexed_doc, or
43# classification
44
45my $OIDcount = 0;
46
47sub new {
48 my $class = shift (@_);
49 my ($source_filename, $doc_type) = @_;
50
51 my $self = bless {'associated_files'=>[],
52 'subsection_order'=>[],
53 'next_subsection'=>1,
54 'subsections'=>{},
55 'metadata'=>[],
56 'text'=>"",
57 'OIDtype'=>"hash"}, $class;
58
59 #GRB: Added code to set lastmodified for OAI purposes
60 if ((defined $doc_type) && (defined $source_filename) && (-e $source_filename)) {
61 my $file_stat = stat($source_filename);
62 my $mtime = $file_stat->mtime;
63 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
64 }
65 #GRB: end inserted code
66
67 if (defined $source_filename) {
68 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
69
70 if (defined $collect_dir) {
71 my $dirsep = &util::get_dirsep();
72 if ($collect_dir !~ m/$dirsep$/) {
73 $collect_dir .= $dirsep;
74 }
75
76 $collect_dir =~ s/\\/\\\\/g; # escape DOS style file separator
77
78 if ($source_filename =~ /^$collect_dir(.*)$/) {
79 $source_filename = $1;
80 }
81 }
82
83 $self->set_source_filename ($source_filename);
84 }
85
86 $self->set_doc_type ($doc_type) if defined $doc_type;
87
88 return $self;
89}
90
91# clone the $self object
92sub duplicate {
93 my $self = shift (@_);
94
95 my $newobj = {};
96
97 foreach my $k (keys %$self) {
98 $newobj->{$k} = &clone ($self->{$k});
99 }
100
101 bless $newobj, ref($self);
102 return $newobj;
103}
104
105sub clone {
106 my ($from) = @_;
107 my $type = ref ($from);
108
109 if ($type eq "HASH") {
110 my $to = {};
111 foreach my $key (keys %$from) {
112 $to->{$key} = &clone ($from->{$key});
113 }
114 return $to;
115 } elsif ($type eq "ARRAY") {
116 my $to = [];
117 foreach my $v (@$from) {
118 push (@$to, &clone ($v));
119 }
120 return $to;
121 } else {
122 return $from;
123 }
124}
125
126sub set_OIDtype {
127 my $self = shift (@_);
128 my ($type) = @_;
129
130 if ($type =~ /^(hash|incremental|dirname|assigned)$/) {
131 $self->{'OIDtype'} = $type;
132 } else {
133 $self->{'OIDtype'} = "hash";
134 }
135}
136
137sub set_source_filename {
138 my $self = shift (@_);
139 my ($source_filename) = @_;
140
141 $self->set_metadata_element ($self->get_top_section(),
142 "gsdlsourcefilename",
143 $source_filename);
144}
145
146sub set_converted_filename {
147 my $self = shift (@_);
148 my ($converted_filename) = @_;
149
150 $self->set_metadata_element ($self->get_top_section(),
151 "gsdlconvertedfilename",
152 $converted_filename);
153}
154
155
156# returns the source_filename as it was provided
157sub get_source_filename {
158 my $self = shift (@_);
159
160 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
161}
162
163# returns converted filename if available else returns source filename
164sub get_filename_for_hashing {
165 my $self = shift (@_);
166
167 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
168
169 if (!defined $filename) {
170 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
171 }
172 return $filename;
173}
174
175sub set_doc_type {
176 my $self = shift (@_);
177 my ($doc_type) = @_;
178
179 $self->set_metadata_element ($self->get_top_section(),
180 "gsdldoctype",
181 $doc_type);
182}
183
184# returns the source_filename as it was provided
185# the default of "indexed_doc" is used if no document
186# type was provided
187sub get_doc_type {
188 my $self = shift (@_);
189
190 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
191 return $doc_type if (defined $doc_type);
192 return "indexed_doc";
193}
194
195sub _escape_text {
196 my ($text) = @_;
197
198 # special characters in the gml encoding
199 $text =~ s/&/&/g; # this has to be first...
200 $text =~ s/</&lt;/g;
201 $text =~ s/>/&gt;/g;
202 $text =~ s/\"/&quot;/g;
203
204 return $text;
205}
206
207sub buffer_section_xml {
208 my $self = shift (@_);
209 my ($section) = @_;
210
211 my $section_ptr = $self->_lookup_section ($section);
212 return "" unless defined $section_ptr;
213
214 my $all_text = "<Section>\n";
215 $all_text .= " <Description>\n";
216
217 # output metadata
218 foreach my $data (@{$section_ptr->{'metadata'}}) {
219 my $escaped_value = &_escape_text($data->[1]);
220 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
221 }
222
223 $all_text .= " </Description>\n";
224
225 # output the text
226 $all_text .= " <Content>";
227 $all_text .= &_escape_text($section_ptr->{'text'});
228 $all_text .= "</Content>\n";
229
230 # output all the subsections
231 foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
232 $all_text .= $self->buffer_section_xml("$section.$subsection");
233 }
234
235 $all_text .= "</Section>\n";
236
237 # make sure no nasty control characters have snuck through
238 # (XML::Parser will barf on anything it doesn't consider to be
239 # valid UTF-8 text, including things like \c@, \cC etc.)
240 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
241
242 return $all_text;
243}
244
245sub buffer_txt_section_xml {
246 my $self = shift(@_);
247 my ($section) = @_;
248
249 my $section_ptr = $self->_lookup_section ($section);
250
251 return "" unless defined $section_ptr;
252
253 my $all_text = "<Section>\n";
254
255 ##output the text
256 #$all_text .= " <Content>";
257 $all_text .= &_escape_text($section_ptr->{'text'});
258 #$all_text .= " </Content>\n";
259
260
261 #output all the subsections
262 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
263 $all_text .= $self->buffer_txt_section_xml("$section.$subsection");
264 }
265
266 $all_text .= "</Section>\n";
267
268
269 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
270 return $all_text;
271}
272
273sub buffer_mets_fileSection_section_xml() {
274 my $self = shift(@_);
275 my ($section) = @_;
276
277 #$section="" unless defined $section;
278
279
280 my $section_ptr=$self->_lookup_section($section);
281 return "" unless defined $section_ptr;
282
283
284 #**output fileSection by sections
285 my $section_num ="1". $section;
286
287
288 #my $filePath = $doc_Dir . '/doctxt.xml';
289
290 my $filePath = 'doctxt.xml';
291
292 #**output the fileSection details
293 my $all_text = ' <mets:fileGrp ID="FILEGROUP_PRELUDE' . $section_num . '">'. "\n";
294 $all_text .= ' <mets:file MIMETYPE="text/xml" ID="FILE'.$section_num. '">'. "\n";
295 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$filePath.'#xpointer(/Section[';
296
297 my $xpath = "1".$section;
298
299 $xpath =~ s/\./]\/Section[/g;
300
301 $all_text .= $xpath;
302
303 $all_text .= ']/text())" />' . "\n";
304 $all_text .= " </mets:file>\n";
305 $all_text .= " </mets:fileGrp>\n";
306
307
308 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
309 $all_text .= $self->buffer_mets_fileSection_section_xml("$section.$subsection");
310 }
311
312 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
313
314 return $all_text;
315}
316
317sub buffer_mets_fileWhole_section_xml(){
318 my $self = shift(@_);
319 my ($section) = @_;
320
321 my $section_ptr = $self-> _lookup_section($section);
322 return "" unless defined $section_ptr;
323
324 my $all_text="" unless defined $all_txt;
325
326 my ($dirPath)="" unless defined $dirPath;
327 my $fileID=0;
328
329 #** output the fileSection for the whole section
330 #*** get the sourcefile and associative file
331
332 foreach my $data (@{$section_ptr->{'metadata'}}){
333 my $escaped_value = &_escape_text($data->[1]);
334 if ($data->[0] eq "gsdlsourcefilename") {
335 ($dirPath) = $escaped_value =~ m/^(.*)[\/\\][^\/\\]*$/;
336
337 $all_text .= ' <mets:fileGrp ID="default">'."\n";
338 ++$fileID;
339 $all_text .= ' <mets:file MIMETYPE="text/xml" ID="default.'.$fileID.'">'. "\n";
340
341 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$data->[1].'" />'."\n";
342
343 $all_text .= " </mets:file>\n";
344 }
345
346 if ($data->[0] eq "gsdlassocfile"){
347
348 $escaped_value =~ m/^(.*?):(.*):(.*)$/;
349 my $assoc_file = $1;
350 my $mime_type = $2;
351 my $assoc_dir = $3;
352
353 my $assfilePath = $dirPath . '/'. $assoc_file;
354 ++$fileID;
355
356 $all_text .= ' <mets:file MIMETYPE="'.$mime_type.'" ID="default.'.$fileID. '">'. "\n";
357 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$assfilePath.'" />'."\n";
358
359 $all_text .= " </mets:file>\n";
360 }
361 }
362 $all_text .= " </mets:fileGrp>\n";
363
364
365 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
366
367 return $all_text;
368}
369
370sub buffer_mets_StruMapSection_section_xml(){
371 my $self = shift(@_);
372 my ($section, $order_numref) = @_;
373
374 $section="" unless defined $section;
375
376
377 my $section_ptr=$self->_lookup_section($section);
378 return "" unless defined $section_ptr;
379
380
381 #***output fileSection by Sections
382 my $section_num ="1". $section;
383 my $dmd_num = $section_num;
384
385 ##**output the dmdSection details
386 #if ($section_num eq "1") {
387 # $dmd_num = "0";
388 #}
389
390 #**output the StruMap details
391
392 my $all_text = ' <mets:div ID="DS'. $section_num .'" TYPE="Section" ORDER="'.$$order_numref++.'" ORDERLABEL="'. $section_num .'" LABEL="';
393
394 $all_text .= $section_num . '" DMDID="DM'.$dmd_num.'">'. "\n";
395
396 $all_text .= ' <mets:fptr FILEID="FILEGROUP_PRELUDE'.$section_num.'" />'. "\n";
397
398
399 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
400 $all_text .= $self->buffer_mets_StruMapSection_section_xml("$section.$subsection", $order_numref);
401 }
402
403 $all_text .= " </mets:div>\n";
404
405 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
406
407 return $all_text;
408}
409
410
411sub buffer_mets_StruMapWhole_section_xml(){
412 my $self = shift(@_);
413 my ($section) = @_;
414
415 my $section_ptr = $self-> _lookup_section($section);
416 return "" unless defined $section_ptr;
417
418 my $all_text="" unless defined $all_txt;
419 my $fileID=0;
420 my $order_num = 0;
421
422 $all_text .= '<mets:structMap ID="All" TYPE="Whole Document" LABEL="All">'."\n";
423 $all_text .= ' <mets:div ID="DSAll" TYPE="Document" ORDER="'.$order_num.'" ORDERLABEL="All" LABEL="Whole Documemt" DMDID="DM1">' . "\n";
424
425
426 #** output the StruMapSection for the whole section
427 #*** get the sourcefile and associative file
428
429 foreach my $data (@{$section_ptr->{'metadata'}}){
430 my $escaped_value = &_escape_text($data->[1]);
431
432 if ($data->[0] eq "gsdlsourcefilename") {
433 ++$fileID;
434 $all_text .= ' <mets:fptr FILEID="default.'.$fileID.'" />'."\n";
435 }
436
437 if ($data->[0] eq "gsdlassocfile"){
438 ++$fileID;
439 $all_text .= ' <mets:fptr FILEID="default.'.$fileID. '" />'. "\n";
440 }
441 }
442 $all_text .= " </mets:div>\n";
443
444 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
445
446 return $all_text;
447}
448
449
450sub buffer_mets_dmdSection_section_xml(){
451 my $self = shift(@_);
452 my ($section) = @_;
453
454 $section="" unless defined $section;
455
456 my $section_ptr=$self->_lookup_section($section);
457 return "" unless defined $section_ptr;
458
459 #***convert section number
460 my $section_num ="1". $section;
461 my $dmd_num = $section_num;
462
463 # #**output the dmdSection details
464 # if ($section_num eq "1") {
465 # $dmd_num = "0";
466 # }
467 my $all_text = '<mets:dmdSec ID="DM'.$dmd_num.'" GROUPID="'.$section_num.'">'. "\n";
468 $all_text .= ' <mets:mdWrap MDTYPE="OTHER" OTHERMDTYPE="gsdl3" ID="gsdl'.$section_num.'">'."\n";
469 $all_text .= " <mets:xmlData>\n";
470 foreach my $data (@{$section_ptr->{'metadata'}}){
471 my $escaped_value = &_escape_text($data->[1]);
472 $all_text .= ' <gsdl3:Metadata name="'. $data->[0].'">'. $escaped_value. "</gsdl3:Metadata>\n";
473 if ($data->[0] eq "dc.Title") {
474 $all_text .= ' <gsdl3:Metadata name="Title">'. $escaped_value."</gsdl3:Metadata>\n";
475 }
476 }
477
478 $all_text .= " </mets:xmlData>\n";
479 $all_text .= " </mets:mdWrap>\n";
480 $all_text .= "</mets:dmdSec>\n";
481
482 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
483 $all_text .= $self->buffer_mets_dmdSection_section_xml("$section.$subsection");
484 }
485
486 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
487
488 return $all_text;
489}
490
491sub output_section {
492 my $self = shift (@_);
493 my ($handle, $section) = @_;
494
495 print $handle $self->buffer_section_xml($section);
496}
497
498#*** print out DSpace dublin_core metadata section
499sub output_dspace_section {
500 my $self = shift (@_);
501 my ($handle, $section) = @_;
502
503 my $section_ptr = $self->_lookup_section ($section);
504 return "" unless defined $section_ptr;
505
506 my $all_text = "<Section>\n";
507 $all_text .= " <Description>\n";
508
509 # output metadata
510 foreach my $data (@{$section_ptr->{'metadata'}}) {
511 my $escaped_value = &_escape_text($data->[1]);
512 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
513 }
514
515 $all_text .= " </Description>\n";
516 $all_text .= "</Section>\n";
517
518 # make sure no nasty control characters have snuck through
519 # (XML::Parser will barf on anything it doesn't consider to be
520 # valid UTF-8 text, including things like \c@, \cC etc.)
521 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
522
523 return $all_text;
524}
525
526#*** print out doctxt.xml file
527sub output_txt_section {
528 my $self = shift (@_);
529 my ($handle, $section) = @_;
530
531 print $handle $self->buffer_txt_section_xml($section);
532}
533
534#*** print out docmets.xml file
535sub output_mets_section {
536 my $self = shift(@_);
537 my ($handle, $section) = @_;
538
539 #***print out the dmdSection
540 print $handle $self->buffer_mets_dmdSection_section_xml($section);
541
542 #***print out the fileSection by sections
543 print $handle "<mets:fileSec>\n";
544 print $handle $self->buffer_mets_fileSection_section_xml($section);
545
546 #***print out the whole fileSection
547 print $handle $self->buffer_mets_fileWhole_section_xml($section);
548 print $handle "</mets:fileSec>\n";
549
550 #***print out the StruMapSection by sections
551 print $handle '<mets:structMap ID="Section" TYPE="Section" LABEL="Section">' . "\n";
552 my $order_num=0;
553 print $handle $self->buffer_mets_StruMapSection_section_xml($section, \$order_num);
554 print $handle "</mets:structMap>\n";
555 print $handle $self->buffer_mets_StruMapWhole_section_xml($section);
556 print $handle "</mets:structMap>\n";
557}
558
559#*** print out dublin_core.xml file
560sub output_dc_section {
561 my $self = shift(@_);
562 my ($handle, $section, $doc_Dir) = @_;
563
564 #***print out the dublin_core
565 $section="" unless defined $section;
566
567 my $section_ptr=$self->_lookup_section($section);
568 return "" unless defined $section_ptr;
569 my $all_text="";
570 foreach my $data (@{$section_ptr->{'metadata'}}){
571 my $escaped_value = &_escape_text($data->[1]);
572 if ($data->[0]=~ /^dc/) {
573 $data->[0] =~ tr/[A-Z]/[a-z]/;
574 $data->[0] =~ /^dc\.(.*)/;
575 my $dc_element = $1;
576 #$all_text .= ' <dcvalue element="'. $data->[0].'" qualifier="#####">'. $escaped_value. "</dcvalue>\n";
577 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
578 }
579 }
580 if ($all_text eq "") {
581 $all_text .= " There is no Dublin Core metatdata in this document\n";
582 }
583 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
584
585 print $handle $all_text;
586}
587
588# look up the reference to the a particular section
589sub _lookup_section {
590 my $self = shift (@_);
591 my ($section) = @_;
592
593 my ($num);
594 my $sectionref = $self;
595
596 while (defined $section && $section ne "") {
597 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
598 $num =~ s/^0+(\d)/$1/; # remove leading 0s
599 $section = "" unless defined $section;
600
601 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
602 $sectionref = $sectionref->{'subsections'}->{$num};
603 } else {
604 return undef;
605 }
606 }
607
608 return $sectionref;
609}
610
611# calculate OID by hashing the contents of the document
612sub _calc_OID {
613 my $self = shift (@_);
614 my ($filename) = @_;
615
616 my $osexe = &util::get_os_exe();
617
618 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
619 $ENV{'GSDLOS'},"hashfile$osexe");
620
621 my $result = "NULL";
622
623 if (-e "$hashfile_exe") {
624# $result = `\"$hashfile_exe\" \"$filename\"`;
625 $result = `hashfile$osexe \"$filename\"`;
626 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
627
628 } else {
629 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
630 }
631 return "HASH$result";
632}
633
634# methods dealing with OID, not groups of them.
635
636# if $OID is not provided one is calculated
637sub set_OID {
638 my $self = shift (@_);
639 my ($OID) = @_;
640
641 my $use_hash_oid = 0;
642 # if an OID wasn't provided claculate one
643 if (!defined $OID) {
644 $OID = "NULL";
645 if ($self->{'OIDtype'} eq "hash") {
646 $use_hash_oid = 1;
647 } elsif ($self->{'OIDtype'} eq "incremental") {
648 $OID = "D" . $OIDcount;
649 $OIDcount ++;
650
651 } elsif ($self->{'OIDtype'} eq "dirname") {
652 $OID = 'J';
653 my $filename = $self->get_source_filename();
654 if (defined($filename)) { # && -e $filename) {
655 $OID = &File::Basename::dirname($filename);
656 if (defined $OID) {
657 $OID = 'J'.&File::Basename::basename($OID);
658 $OID =~ s/\.//; #remove any periods
659 } else {
660 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
661 $use_hash_oid = 1;
662 }
663 } else {
664 print STDERR "Failed to find filename, generating hash id\n";
665 $use_hash_oid = 1;
666 }
667
668 } elsif ($self->{'OIDtype'} eq "assigned") {
669 my $identifier = $self->get_metadata_element ($self->get_top_section(), "dc.Identifier");
670 if (defined $identifier && $identifier ne "") {
671 $OID = "D" . $self->get_metadata_element ($self->get_top_section(), "dc.Identifier");
672 $OID =~ s/\.//; #remove any periods
673 } else {
674 # need a hash id
675 print STDERR "no dc.Identifier found, generating hash id\n";
676 $use_hash_oid = 1;
677 }
678
679 } else {
680 $use_hash_oid = 1;
681 }
682
683 if ($use_hash_oid) {
684
685 # "hash" OID - feed file to hashfile.exe
686 #my $filename = $self->get_source_filename();
687 # we want to use the converted file for hashing if available
688 # cos its quicker
689 my $filename = $self->get_filename_for_hashing();
690
691 # -z: don't want to hash on the file if it is zero size
692 if (defined($filename) && -e $filename && !-z $filename) {
693 $OID = $self->_calc_OID ($filename);
694 } else {
695 $filename = &util::get_tmp_filename();
696 if (!open (OUTFILE, ">$filename")) {
697 print STDERR "doc::set_OID could not write to $filename\n";
698 } else {
699 $self->output_section('OUTFILE', $self->get_top_section(),
700 undef, 1);
701 close (OUTFILE);
702 }
703 $OID = $self->_calc_OID ($filename);
704 &util::rm ($filename);
705 }
706 }
707 }
708 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
709}
710
711# this uses hashdoc (embedded c thingy) which is faster but still
712# needs a little work to be suffiently stable
713sub ___set_OID {
714 my $self = shift (@_);
715 my ($OID) = @_;
716
717 # if an OID wasn't provided then calculate hash value based on document
718 if (!defined $OID)
719 {
720 my $hash_text = $self->buffer_section_gml($self->get_top_section(),
721 undef, 1);
722 my $hash_len = length($hash_text);
723
724 $OID = &hashdoc::buffer($hash_text,$hash_len);
725 }
726
727 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
728}
729
730# returns the OID for this document
731sub get_OID {
732 my $self = shift (@_);
733 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
734 return $OID if (defined $OID);
735 return "NULL";
736}
737
738sub delete_OID {
739 my $self = shift (@_);
740
741 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
742}
743
744
745# methods for manipulating section names
746
747# returns the name of the top-most section (the top
748# level of the document
749sub get_top_section {
750 my $self = shift (@_);
751
752 return "";
753}
754
755# returns a section
756sub get_parent_section {
757 my $self = shift (@_);
758 my ($section) = @_;
759
760 $section =~ s/(^|\.)\d+$//;
761
762 return $section;
763}
764
765# returns the first child section (or the end child
766# if there isn't any)
767sub get_begin_child {
768 my $self = shift (@_);
769 my ($section) = @_;
770
771 my $section_ptr = $self->_lookup_section($section);
772 return "" unless defined $section_ptr;
773
774 if (defined $section_ptr->{'subsection_order'}->[0]) {
775 return "$section.$section_ptr->{'subsection_order'}->[0]";
776 }
777
778 return $self->get_end_child ($section);
779}
780
781# returns the next child of a parent section
782sub get_next_child {
783 my $self = shift (@_);
784 my ($section) = @_;
785
786 my $parent_section = $self->get_parent_section($section);
787 my $parent_section_ptr = $self->_lookup_section($parent_section);
788 return undef unless defined $parent_section_ptr;
789
790 my ($section_num) = $section =~ /(\d+)$/;
791 return undef unless defined $section_num;
792
793 my $i = 0;
794 my $section_order = $parent_section_ptr->{'subsection_order'};
795 while ($i < scalar(@$section_order)) {
796 last if $section_order->[$i] eq $section_num;
797 $i++;
798 }
799
800 $i++; # the next child
801 if ($i < scalar(@$section_order)) {
802 return $section_order->[$i] if $parent_section eq "";
803 return "$parent_section.$section_order->[$i]";
804 }
805
806 # no more sections in this level
807 return undef;
808}
809
810# returns a reference to a list of children
811sub get_children {
812 my $self = shift (@_);
813 my ($section) = @_;
814
815 my $section_ptr = $self->_lookup_section($section);
816 return [] unless defined $section_ptr;
817
818 my @children = @{$section_ptr->{'subsection_order'}};
819
820 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
821 return \@children;
822}
823
824# returns the child section one past the last one (which
825# is coded as "0")
826sub get_end_child {
827 my $self = shift (@_);
828 my ($section) = @_;
829
830 return $section . ".0" unless $section eq "";
831 return "0";
832}
833
834# returns the next section in book order
835sub get_next_section {
836 my $self = shift (@_);
837 my ($section) = @_;
838
839 return undef unless defined $section;
840
841 my $section_ptr = $self->_lookup_section($section);
842 return undef unless defined $section_ptr;
843
844 # first try to find first child
845 if (defined $section_ptr->{'subsection_order'}->[0]) {
846 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
847 return "$section.$section_ptr->{'subsection_order'}->[0]";
848 }
849
850 do {
851 # try to find sibling
852 my $next_child = $self->get_next_child ($section);
853 return $next_child if (defined $next_child);
854
855 # move up one level
856 $section = $self->get_parent_section ($section);
857 } while $section =~ /\d/;
858
859 return undef;
860}
861
862sub is_leaf_section {
863 my $self = shift (@_);
864 my ($section) = @_;
865
866 my $section_ptr = $self->_lookup_section($section);
867 return 1 unless defined $section_ptr;
868
869 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
870}
871
872# methods for dealing with sections
873
874# returns the name of the inserted section
875sub insert_section {
876 my $self = shift (@_);
877 my ($before_section) = @_;
878
879 # get the child to insert before and its parent section
880 my $parent_section = "";
881 my $before_child = "0";
882 my @before_section = split (/\./, $before_section);
883 if (scalar(@before_section) > 0) {
884 $before_child = pop (@before_section);
885 $parent_section = join (".", @before_section);
886 }
887
888 my $parent_section_ptr = $self->_lookup_section($parent_section);
889 if (!defined $parent_section_ptr) {
890 print STDERR "doc::insert_section couldn't find parent section " .
891 "$parent_section\n";
892 return;
893 }
894
895 # get the next section number
896 my $section_num = $parent_section_ptr->{'next_subsection'}++;
897
898 my $i = 0;
899 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
900 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
901 $i++;
902 }
903
904 # insert the section number into the order list
905 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
906
907 # add this section to the parent section
908 my $section_ptr = {'subsection_order'=>[],
909 'next_subsection'=>1,
910 'subsections'=>{},
911 'metadata'=>[],
912 'text'=>""};
913 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
914
915 # work out the full section number
916 my $section = $parent_section;
917 $section .= "." unless $section eq "";
918 $section .= $section_num;
919
920 return $section;
921}
922
923# creates a pre-named section
924sub create_named_section {
925 my $self = shift (@_);
926 my ($mastersection) = @_;
927
928 my ($num);
929 my $section = $mastersection;
930 my $sectionref = $self;
931
932 while ($section ne "") {
933 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
934 $num =~ s/^0+(\d)/$1/; # remove leading 0s
935 $section = "" unless defined $section;
936
937 if (defined $num) {
938 if (!defined $sectionref->{'subsections'}->{$num}) {
939 push (@{$sectionref->{'subsection_order'}}, $num);
940 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
941 'next_subsection'=>1,
942 'subsections'=>{},
943 'metadata'=>[],
944 'text'=>""};
945 if ($num >= $sectionref->{'next_subsection'}) {
946 $sectionref->{'next_subsection'} = $num + 1;
947 }
948 }
949 $sectionref = $sectionref->{'subsections'}->{$num};
950
951 } else {
952 print STDERR "doc::create_named_section couldn't create section ";
953 print STDERR "$mastersection\n";
954 last;
955 }
956 }
957}
958
959# returns a reference to a list of subsections
960sub list_subsections {
961 my $self = shift (@_);
962 my ($section) = @_;
963
964 my $section_ptr = $self->_lookup_section ($section);
965 if (!defined $section_ptr) {
966 print STDERR "doc::list_subsections couldn't find section $section\n";
967 return [];
968 }
969
970 return [@{$section_ptr->{'subsection_order'}}];
971}
972
973sub delete_section {
974 my $self = shift (@_);
975 my ($section) = @_;
976
977# my $section_ptr = {'subsection_order'=>[],
978# 'next_subsection'=>1,
979# 'subsections'=>{},
980# 'metadata'=>[],
981# 'text'=>""};
982
983 # if this is the top section reset everything
984 if ($section eq "") {
985 $self->{'subsection_order'} = [];
986 $self->{'subsections'} = {};
987 $self->{'metadata'} = [];
988 $self->{'text'} = "";
989 return;
990 }
991
992 # find the parent of the section to delete
993 my $parent_section = "";
994 my $child = "0";
995 my @section = split (/\./, $section);
996 if (scalar(@section) > 0) {
997 $child = pop (@section);
998 $parent_section = join (".", @section);
999 }
1000
1001 my $parent_section_ptr = $self->_lookup_section($parent_section);
1002 if (!defined $parent_section_ptr) {
1003 print STDERR "doc::delete_section couldn't find parent section " .
1004 "$parent_section\n";
1005 return;
1006 }
1007
1008 # remove this section from the subsection_order list
1009 my $i = 0;
1010 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
1011 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
1012 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
1013 last;
1014 }
1015 $i++;
1016 }
1017
1018 # remove this section from the subsection hash
1019 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
1020 undef $parent_section_ptr->{'subsections'}->{$child};
1021 }
1022}
1023
1024#--
1025# methods for dealing with metadata
1026
1027# set_metadata_element and get_metadata_element are for metadata
1028# which should only have one value. add_meta_data and get_metadata
1029# are for metadata which can have more than one value.
1030
1031# returns the first metadata value which matches field
1032
1033# This version of get metadata element works much like the one above,
1034# except it allows for the namespace portion of a metadata element to
1035# be ignored, thus if you are searching for dc.Title, the first piece
1036# of matching metadata ending with the name Title (once any namespace
1037# is removed) would be returned.
1038# 28-11-2003 John Thompson
1039sub get_metadata_element {
1040 my $self = shift (@_);
1041 my ($section, $field, $ignore_namespace) = @_;
1042 my ($data);
1043
1044 $ignore_namespace = 0 unless defined $ignore_namespace;
1045
1046 my $section_ptr = $self->_lookup_section($section);
1047 if (!defined $section_ptr) {
1048 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
1049 return;
1050 }
1051
1052 # Remove the any namespace if we are being told to ignore them
1053 if($ignore_namespace) {
1054 $field =~ s/^\w*\.//;
1055 }
1056
1057 foreach $data (@{$section_ptr->{'metadata'}}) {
1058
1059 my $data_name = $data->[0];
1060 # Remove the any namespace if we are being told to ignore them
1061 if($ignore_namespace) {
1062 $data_name =~ s/^\w*\.//;
1063 }
1064
1065 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
1066 }
1067
1068 return undef; # was not found
1069}
1070
1071# returns a list of the form [value1, value2, ...]
1072sub get_metadata {
1073 my $self = shift (@_);
1074 my ($section, $field, $ignore_namespace) = @_;
1075 my ($data);
1076
1077 $ignore_namespace = 0 unless defined $ignore_namespace;
1078
1079 my $section_ptr = $self->_lookup_section($section);
1080 if (!defined $section_ptr) {
1081 print STDERR "doc::get_metadata couldn't find section ",
1082 $section, "\n";
1083 return;
1084 }
1085
1086 # Remove the any namespace if we are being told to ignore them
1087 if($ignore_namespace) {
1088 $field =~ s/^\w*\.//;
1089 }
1090
1091 my @metadata = ();
1092 foreach $data (@{$section_ptr->{'metadata'}}) {
1093
1094 my $data_name = $data->[0];
1095 # Remove the any namespace if we are being told to ignore them
1096 if($ignore_namespace) {
1097 $data_name =~ s/^\w*\.//;
1098 }
1099
1100 push (@metadata, $data->[1]) if ($data_name eq $field);
1101 }
1102
1103 return \@metadata;
1104}
1105
1106# returns a list of the form [[field,value],[field,value],...]
1107sub get_all_metadata {
1108 my $self = shift (@_);
1109 my ($section) = @_;
1110
1111 my $section_ptr = $self->_lookup_section($section);
1112 if (!defined $section_ptr) {
1113 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
1114 return;
1115 }
1116
1117 return $section_ptr->{'metadata'};
1118}
1119
1120# $value is optional
1121sub delete_metadata {
1122 my $self = shift (@_);
1123 my ($section, $field, $value) = @_;
1124
1125 my $section_ptr = $self->_lookup_section($section);
1126 if (!defined $section_ptr) {
1127 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
1128 return;
1129 }
1130
1131 my $i = 0;
1132 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1133 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1134 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1135 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1136 } else {
1137 $i++;
1138 }
1139 }
1140}
1141
1142sub delete_all_metadata {
1143 my $self = shift (@_);
1144 my ($section) = @_;
1145
1146 my $section_ptr = $self->_lookup_section($section);
1147 if (!defined $section_ptr) {
1148 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1149 return;
1150 }
1151
1152 $section_ptr->{'metadata'} = [];
1153}
1154
1155sub set_metadata_element {
1156 my $self = shift (@_);
1157 my ($section, $field, $value) = @_;
1158
1159 $self->set_utf8_metadata_element ($section, $field,
1160 &unicode::ascii2utf8(\$value));
1161}
1162
1163# set_utf8_metadata_element assumes the text has already been
1164# converted to the UTF-8 encoding.
1165sub set_utf8_metadata_element {
1166 my $self = shift (@_);
1167 my ($section, $field, $value) = @_;
1168
1169 $self->delete_metadata ($section, $field);
1170 $self->add_utf8_metadata ($section, $field, $value);
1171}
1172
1173
1174# add_metadata assumes the text is in (extended) ascii form. For
1175# text which has already been converted to the UTF-8 format use
1176# add_utf8_metadata.
1177sub add_metadata {
1178 my $self = shift (@_);
1179 my ($section, $field, $value) = @_;
1180
1181 $self->add_utf8_metadata ($section, $field,
1182 &unicode::ascii2utf8(\$value));
1183}
1184
1185sub add_utf8_metadata {
1186 my $self = shift (@_);
1187 my ($section, $field, $value) = @_;
1188
1189 my $section_ptr = $self->_lookup_section($section);
1190 if (!defined $section_ptr) {
1191 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1192 return;
1193 }
1194 if (!defined $value) {
1195 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1196 return;
1197 }
1198 if (!defined $field) {
1199 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1200 return;
1201 }
1202
1203 #print STDERR "###$field=$value\n";
1204 # double check that the value is utf-8
1205 if (unicode::ensure_utf8(\$value)) {
1206 print STDERR "doc::add_utf8_metadata: warning: '$field' wasn't utf8\n";
1207 }
1208
1209 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1210}
1211
1212
1213# methods for dealing with text
1214
1215# returns the text for a section
1216sub get_text {
1217 my $self = shift (@_);
1218 my ($section) = @_;
1219
1220 my $section_ptr = $self->_lookup_section($section);
1221 if (!defined $section_ptr) {
1222 print STDERR "doc::get_text couldn't find section " .
1223 "$section\n";
1224 return "";
1225 }
1226
1227 return $section_ptr->{'text'};
1228}
1229
1230# returns the (utf-8 encoded) length of the text for a section
1231sub get_text_length {
1232 my $self = shift (@_);
1233 my ($section) = @_;
1234
1235 my $section_ptr = $self->_lookup_section($section);
1236 if (!defined $section_ptr) {
1237 print STDERR "doc::get_text_length couldn't find section " .
1238 "$section\n";
1239 return 0;
1240 }
1241
1242 return length ($section_ptr->{'text'});
1243}
1244
1245sub delete_text {
1246 my $self = shift (@_);
1247 my ($section) = @_;
1248
1249 my $section_ptr = $self->_lookup_section($section);
1250 if (!defined $section_ptr) {
1251 print STDERR "doc::delete_text couldn't find section " .
1252 "$section\n";
1253 return;
1254 }
1255
1256 $section_ptr->{'text'} = "";
1257}
1258
1259# add_text assumes the text is in (extended) ascii form. For
1260# text which has been already converted to the UTF-8 format
1261# use add_utf8_text.
1262sub add_text {
1263 my $self = shift (@_);
1264 my ($section, $text) = @_;
1265
1266 # convert the text to UTF-8 encoded unicode characters
1267 # and add the text
1268 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1269}
1270
1271
1272# add_utf8_text assumes the text to be added has already
1273# been converted to the UTF-8 encoding. For ascii text use
1274# add_text
1275sub add_utf8_text {
1276 my $self = shift (@_);
1277 my ($section, $text) = @_;
1278
1279 my $section_ptr = $self->_lookup_section($section);
1280 if (!defined $section_ptr) {
1281 print STDERR "doc::add_utf8_text couldn't find section " .
1282 "$section\n";
1283 return;
1284 }
1285
1286 $section_ptr->{'text'} .= $text;
1287}
1288
1289
1290# methods for dealing with associated files
1291
1292# a file is associated with a document, NOT a section.
1293# if section is defined it is noted in the data structure
1294# only so that files associated from a particular section
1295# may be removed later (using delete_section_assoc_files)
1296sub associate_file {
1297 my $self = shift (@_);
1298 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1299 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1300
1301 # remove all associated files with the same name
1302 $self->delete_assoc_file ($assoc_filename);
1303
1304 push (@{$self->{'associated_files'}},
1305 [$real_filename, $assoc_filename, $mime_type, $section]);
1306}
1307
1308# returns a list of associated files in the form
1309# [[real_filename, assoc_filename, mimetype], ...]
1310sub get_assoc_files {
1311 my $self = shift (@_);
1312
1313 return $self->{'associated_files'};
1314}
1315
1316sub delete_section_assoc_files {
1317 my $self = shift (@_);
1318 my ($section) = @_;
1319
1320 my $i=0;
1321 while ($i < scalar (@{$self->{'associated_files'}})) {
1322 if (defined $self->{'associated_files'}->[$i]->[3] &&
1323 $self->{'associated_files'}->[$i]->[3] eq $section) {
1324 splice (@{$self->{'associated_files'}}, $i, 1);
1325 } else {
1326 $i++;
1327 }
1328 }
1329}
1330
1331sub delete_assoc_file {
1332 my $self = shift (@_);
1333 my ($assoc_filename) = @_;
1334
1335 my $i=0;
1336 while ($i < scalar (@{$self->{'associated_files'}})) {
1337 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1338 splice (@{$self->{'associated_files'}}, $i, 1);
1339 } else {
1340 $i++;
1341 }
1342 }
1343}
1344
1345sub reset_nextsection_ptr {
1346 my $self = shift (@_);
1347 my ($section) = @_;
1348
1349 my $section_ptr = $self->_lookup_section($section);
1350 $section_ptr->{'next_subsection'} = 1;
1351}
1352
13531;
Note: See TracBrowser for help on using the repository browser.