source: trunk/gsdl/perllib/doc.pm@ 8902

Last change on this file since 8902 was 8894, checked in by chi, 19 years ago

Modification of the validated METS format in the docmets.xml

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 36.1 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use unicode;
37use util;
38use ghtml;
39use File::stat;
40##use hashdoc;
41
42# the document type may be indexed_doc, nonindexed_doc, or
43# classification
44
45my $OIDcount = 0;
46
47sub new {
48 my $class = shift (@_);
49 my ($source_filename, $doc_type) = @_;
50
51 my $self = bless {'associated_files'=>[],
52 'subsection_order'=>[],
53 'next_subsection'=>1,
54 'subsections'=>{},
55 'metadata'=>[],
56 'text'=>"",
57 'OIDtype'=>"hash"}, $class;
58
59 #GRB: Added code to set lastmodified for OAI purposes
60 if ((defined $doc_type) && (defined $source_filename) && (-e $source_filename)) {
61 my $file_stat = stat($source_filename);
62 my $mtime = $file_stat->mtime;
63 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
64 }
65 #GRB: end inserted code
66
67 if (defined $source_filename) {
68 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
69
70 if (defined $collect_dir) {
71 my $dirsep = &util::get_dirsep();
72 if ($collect_dir !~ m/$dirsep$/) {
73 $collect_dir .= $dirsep;
74 }
75
76 $collect_dir =~ s/\\/\\\\/g; # escape DOS style file separator
77
78 if ($source_filename =~ /^$collect_dir(.*)$/) {
79 $source_filename = $1;
80 }
81 }
82
83 $self->set_source_filename ($source_filename);
84 }
85
86 $self->set_doc_type ($doc_type) if defined $doc_type;
87
88 return $self;
89}
90
91# clone the $self object
92sub duplicate {
93 my $self = shift (@_);
94
95 my $newobj = {};
96
97 foreach my $k (keys %$self) {
98 $newobj->{$k} = &clone ($self->{$k});
99 }
100
101 bless $newobj, ref($self);
102 return $newobj;
103}
104
105sub clone {
106 my ($from) = @_;
107 my $type = ref ($from);
108
109 if ($type eq "HASH") {
110 my $to = {};
111 foreach my $key (keys %$from) {
112 $to->{$key} = &clone ($from->{$key});
113 }
114 return $to;
115 } elsif ($type eq "ARRAY") {
116 my $to = [];
117 foreach my $v (@$from) {
118 push (@$to, &clone ($v));
119 }
120 return $to;
121 } else {
122 return $from;
123 }
124}
125
126sub set_OIDtype {
127 my $self = shift (@_);
128 my ($type) = @_;
129
130 if ($type =~ /^(hash|incremental|dirname|assigned)$/) {
131 $self->{'OIDtype'} = $type;
132 } else {
133 $self->{'OIDtype'} = "hash";
134 }
135}
136
137sub set_source_filename {
138 my $self = shift (@_);
139 my ($source_filename) = @_;
140
141 $self->set_metadata_element ($self->get_top_section(),
142 "gsdlsourcefilename",
143 $source_filename);
144}
145
146sub set_converted_filename {
147 my $self = shift (@_);
148 my ($converted_filename) = @_;
149
150 $self->set_metadata_element ($self->get_top_section(),
151 "gsdlconvertedfilename",
152 $converted_filename);
153}
154
155
156# returns the source_filename as it was provided
157sub get_source_filename {
158 my $self = shift (@_);
159
160 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
161}
162
163# returns converted filename if available else returns source filename
164sub get_filename_for_hashing {
165 my $self = shift (@_);
166
167 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
168
169 if (!defined $filename) {
170 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
171 }
172 return $filename;
173}
174
175sub set_doc_type {
176 my $self = shift (@_);
177 my ($doc_type) = @_;
178
179 $self->set_metadata_element ($self->get_top_section(),
180 "gsdldoctype",
181 $doc_type);
182}
183
184# returns the source_filename as it was provided
185# the default of "indexed_doc" is used if no document
186# type was provided
187sub get_doc_type {
188 my $self = shift (@_);
189
190 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
191 return $doc_type if (defined $doc_type);
192 return "indexed_doc";
193}
194
195sub _escape_text {
196 my ($text) = @_;
197
198 # special characters in the gml encoding
199 $text =~ s/&/&/g; # this has to be first...
200 $text =~ s/</&lt;/g;
201 $text =~ s/>/&gt;/g;
202 $text =~ s/\"/&quot;/g;
203
204 return $text;
205}
206
207sub buffer_section_xml {
208 my $self = shift (@_);
209 my ($section) = @_;
210
211 my $section_ptr = $self->_lookup_section ($section);
212 return "" unless defined $section_ptr;
213
214 my $all_text = "<Section>\n";
215 $all_text .= " <Description>\n";
216
217 # output metadata
218 foreach my $data (@{$section_ptr->{'metadata'}}) {
219 my $escaped_value = &_escape_text($data->[1]);
220 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
221 }
222
223 $all_text .= " </Description>\n";
224
225 # output the text
226 $all_text .= " <Content>";
227 $all_text .= &_escape_text($section_ptr->{'text'});
228 $all_text .= "</Content>\n";
229
230 # output all the subsections
231 foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
232 $all_text .= $self->buffer_section_xml("$section.$subsection");
233 }
234
235 $all_text .= "</Section>\n";
236
237 # make sure no nasty control characters have snuck through
238 # (XML::Parser will barf on anything it doesn't consider to be
239 # valid UTF-8 text, including things like \c@, \cC etc.)
240 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
241
242 return $all_text;
243}
244
245sub buffer_txt_section_xml {
246 my $self = shift(@_);
247 my ($section) = @_;
248
249 my $section_ptr = $self->_lookup_section ($section);
250
251 return "" unless defined $section_ptr;
252
253 my $all_text = "<Section>\n";
254
255 ##output the text
256 #$all_text .= " <Content>";
257 $all_text .= &_escape_text($section_ptr->{'text'});
258 #$all_text .= " </Content>\n";
259
260
261 #output all the subsections
262 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
263 $all_text .= $self->buffer_txt_section_xml("$section.$subsection");
264 }
265
266 $all_text .= "</Section>\n";
267
268
269 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
270 return $all_text;
271}
272
273sub buffer_mets_fileSection_section_xml() {
274 my $self = shift(@_);
275 my ($section) = @_;
276
277 #$section="" unless defined $section;
278
279
280 my $section_ptr=$self->_lookup_section($section);
281 return "" unless defined $section_ptr;
282
283
284 #**output fileSection by sections
285 my $section_num ="1". $section;
286
287
288 #my $filePath = $doc_Dir . '/doctxt.xml';
289
290 my $filePath = 'doctxt.xml';
291
292 #**output the fileSection details
293 my $all_text = ' <mets:fileGrp ID="FILEGROUP_PRELUDE' . $section_num . '">'. "\n";
294 $all_text .= ' <mets:file MIMETYPE="text/xml" ID="FILE'.$section_num. '">'. "\n";
295 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$filePath.'#xpointer(/Section[';
296
297 my $xpath = "1".$section;
298
299 $xpath =~ s/\./]\/Section[/g;
300
301 $all_text .= $xpath;
302
303 $all_text .= ']/text())" />' . "\n";
304 $all_text .= " </mets:file>\n";
305 $all_text .= " </mets:fileGrp>\n";
306
307
308 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
309 $all_text .= $self->buffer_mets_fileSection_section_xml("$section.$subsection");
310 }
311
312 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
313
314 return $all_text;
315}
316
317sub buffer_mets_fileWhole_section_xml(){
318 my $self = shift(@_);
319 my ($section) = @_;
320
321 my $section_ptr = $self-> _lookup_section($section);
322 return "" unless defined $section_ptr;
323
324 my $all_text="" unless defined $all_txt;
325
326 my ($dirPath)="" unless defined $dirPath;
327 my $fileID=0;
328
329 #** output the fileSection for the whole section
330 #*** get the sourcefile and associative file
331
332 foreach my $data (@{$section_ptr->{'metadata'}}){
333 my $escaped_value = &_escape_text($data->[1]);
334 if ($data->[0] eq "gsdlsourcefilename") {
335 ($dirPath) = $escaped_value =~ m/^(.*)[\/\\][^\/\\]*$/;
336
337 $all_text .= ' <mets:fileGrp ID="default">'."\n";
338 ++$fileID;
339 $all_text .= ' <mets:file MIMETYPE="text/xml" ID="default.'.$fileID.'">'. "\n";
340
341 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$data->[1].'" />'."\n";
342
343 $all_text .= " </mets:file>\n";
344 }
345
346 if ($data->[0] eq "gsdlassocfile"){
347 $escaped_value =~ m/^(.*?):(.*):$/;
348
349 my $assfilePath = $dirPath . '/'. $1;
350 ++$fileID;
351
352 $all_text .= ' <mets:file MIMETYPE="'.$2.'" ID="default.'.$fileID. '">'. "\n";
353 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$assfilePath.'" />'."\n";
354
355 $all_text .= " </mets:file>\n";
356 }
357 }
358 $all_text .= " </mets:fileGrp>\n";
359
360
361 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
362
363 return $all_text;
364}
365
366sub buffer_mets_StruMapSection_section_xml(){
367 my $self = shift(@_);
368 my ($section, $order_numref) = @_;
369
370 $section="" unless defined $section;
371
372
373 my $section_ptr=$self->_lookup_section($section);
374 return "" unless defined $section_ptr;
375
376
377 #***output fileSection by Sections
378 my $section_num ="1". $section;
379 my $dmd_num = $section_num;
380
381 ##**output the dmdSection details
382 #if ($section_num eq "1") {
383 # $dmd_num = "0";
384 #}
385
386 #**output the StruMap details
387
388 my $all_text = ' <mets:div ID="DS'. $section_num .'" TYPE="Section" ORDER="'.$$order_numref++.'" ORDERLABEL="'. $section_num .'" LABEL="';
389
390 $all_text .= $section_num . '" DMDID="DM'.$dmd_num.'">'. "\n";
391
392 $all_text .= ' <mets:fptr FILEID="FILEGROUP_PRELUDE'.$section_num.'" />'. "\n";
393
394
395 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
396 $all_text .= $self->buffer_mets_StruMapSection_section_xml("$section.$subsection", $order_numref);
397 }
398
399 $all_text .= " </mets:div>\n";
400
401 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
402
403 return $all_text;
404}
405
406
407sub buffer_mets_StruMapWhole_section_xml(){
408 my $self = shift(@_);
409 my ($section) = @_;
410
411 my $section_ptr = $self-> _lookup_section($section);
412 return "" unless defined $section_ptr;
413
414 my $all_text="" unless defined $all_txt;
415 my $fileID=0;
416 my $order_num = 0;
417
418 $all_text .= '<mets:structMap ID="All" TYPE="Whole Document" LABEL="All">'."\n";
419 $all_text .= ' <mets:div ID="DSAll" TYPE="Document" ORDER="'.$order_num.'" ORDERLABEL="All" LABEL="Whole Documemt" DMDID="DM1">' . "\n";
420
421
422 #** output the StruMapSection for the whole section
423 #*** get the sourcefile and associative file
424
425 foreach my $data (@{$section_ptr->{'metadata'}}){
426 my $escaped_value = &_escape_text($data->[1]);
427
428 if ($data->[0] eq "gsdlsourcefilename") {
429 ++$fileID;
430 $all_text .= ' <mets:fptr FILEID="default.'.$fileID.'" />'."\n";
431 }
432
433 if ($data->[0] eq "gsdlassocfile"){
434 ++$fileID;
435 $all_text .= ' <mets:fptr FILEID="default.'.$fileID. '" />'. "\n";
436 }
437 }
438 $all_text .= " </mets:div>\n";
439
440 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
441
442 return $all_text;
443}
444
445
446sub buffer_mets_dmdSection_section_xml(){
447 my $self = shift(@_);
448 my ($section) = @_;
449
450 $section="" unless defined $section;
451
452 my $section_ptr=$self->_lookup_section($section);
453 return "" unless defined $section_ptr;
454
455 #***convert section number
456 my $section_num ="1". $section;
457 my $dmd_num = $section_num;
458
459 # #**output the dmdSection details
460 # if ($section_num eq "1") {
461 # $dmd_num = "0";
462 # }
463 my $all_text = '<mets:dmdSec ID="DM'.$dmd_num.'" GROUPID="'.$section_num.'">'. "\n";
464 $all_text .= ' <mets:mdWrap MDTYPE="OTHER" OTHERMDTYPE="gsdl3" ID="gsdl'.$section_num.'">'."\n";
465 $all_text .= " <mets:xmlData>\n";
466 foreach my $data (@{$section_ptr->{'metadata'}}){
467 my $escaped_value = &_escape_text($data->[1]);
468 $all_text .= ' <gsdl3:Metadata name="'. $data->[0].'">'. $escaped_value. "</gsdl3:Metadata>\n";
469 if ($data->[0] eq "dc.Title") {
470 $all_text .= ' <gsdl3:Metadata name="Title">'. $escaped_value."</gsdl3:Metadata>\n";
471 }
472 }
473
474 $all_text .= " </mets:xmlData>\n";
475 $all_text .= " </mets:mdWrap>\n";
476 $all_text .= "</mets:dmdSec>\n";
477
478 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
479 $all_text .= $self->buffer_mets_dmdSection_section_xml("$section.$subsection");
480 }
481
482 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
483
484 return $all_text;
485}
486
487sub output_section {
488 my $self = shift (@_);
489 my ($handle, $section) = @_;
490
491 print $handle $self->buffer_section_xml($section);
492}
493
494#*** print out DSpace dublin_core metadata section
495sub output_dspace_section {
496 my $self = shift (@_);
497 my ($handle, $section) = @_;
498
499 my $section_ptr = $self->_lookup_section ($section);
500 return "" unless defined $section_ptr;
501
502 my $all_text = "<Section>\n";
503 $all_text .= " <Description>\n";
504
505 # output metadata
506 foreach my $data (@{$section_ptr->{'metadata'}}) {
507 my $escaped_value = &_escape_text($data->[1]);
508 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
509 }
510
511 $all_text .= " </Description>\n";
512 $all_text .= "</Section>\n";
513
514 # make sure no nasty control characters have snuck through
515 # (XML::Parser will barf on anything it doesn't consider to be
516 # valid UTF-8 text, including things like \c@, \cC etc.)
517 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
518
519 return $all_text;
520}
521
522#*** print out doctxt.xml file
523sub output_txt_section {
524 my $self = shift (@_);
525 my ($handle, $section) = @_;
526
527 print $handle $self->buffer_txt_section_xml($section);
528}
529
530#*** print out docmets.xml file
531sub output_mets_section {
532 my $self = shift(@_);
533 my ($handle, $section) = @_;
534
535 #***print out the dmdSection
536 print $handle $self->buffer_mets_dmdSection_section_xml($section);
537
538 #***print out the fileSection by sections
539 print $handle "<mets:fileSec>\n";
540 print $handle $self->buffer_mets_fileSection_section_xml($section);
541
542 #***print out the whole fileSection
543 print $handle $self->buffer_mets_fileWhole_section_xml($section);
544 print $handle "</mets:fileSec>\n";
545
546 #***print out the StruMapSection by sections
547 print $handle '<mets:structMap ID="Section" TYPE="Section" LABEL="Section">' . "\n";
548 my $order_num=0;
549 print $handle $self->buffer_mets_StruMapSection_section_xml($section, \$order_num);
550 print $handle "</mets:structMap>\n";
551 print $handle $self->buffer_mets_StruMapWhole_section_xml($section);
552 print $handle "</mets:structMap>\n";
553}
554
555#*** print out dublin_core.xml file
556sub output_dc_section {
557 my $self = shift(@_);
558 my ($handle, $section, $doc_Dir) = @_;
559
560 #***print out the dublin_core
561 $section="" unless defined $section;
562
563 my $section_ptr=$self->_lookup_section($section);
564 return "" unless defined $section_ptr;
565 my $all_text="";
566 foreach my $data (@{$section_ptr->{'metadata'}}){
567 my $escaped_value = &_escape_text($data->[1]);
568 if ($data->[0]=~ /^dc/) {
569 #$all_text .= ' <dcvalue element="'. $data->[0].'" qualifier="#####">'. $escaped_value. "</dcvalue>\n";
570 $all_text .= ' <dcvalue element="'. $data->[0].'">'. $escaped_value. "</dcvalue>\n";
571 }
572 }
573 if ($all_text eq "") {
574 $all_text .= " There is no Dublin Core metatdata in this document\n";
575 }
576 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
577
578 print $handle $all_text;
579}
580
581# look up the reference to the a particular section
582sub _lookup_section {
583 my $self = shift (@_);
584 my ($section) = @_;
585
586 my ($num);
587 my $sectionref = $self;
588
589 while (defined $section && $section ne "") {
590 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
591 $num =~ s/^0+(\d)/$1/; # remove leading 0s
592 $section = "" unless defined $section;
593
594 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
595 $sectionref = $sectionref->{'subsections'}->{$num};
596 } else {
597 return undef;
598 }
599 }
600
601 return $sectionref;
602}
603
604# calculate OID by hashing the contents of the document
605sub _calc_OID {
606 my $self = shift (@_);
607 my ($filename) = @_;
608
609 my $osexe = &util::get_os_exe();
610
611 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
612 $ENV{'GSDLOS'},"hashfile$osexe");
613
614 my $result = "NULL";
615
616 if (-e "$hashfile_exe") {
617# $result = `\"$hashfile_exe\" \"$filename\"`;
618 $result = `hashfile$osexe \"$filename\"`;
619 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
620
621 } else {
622 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
623 }
624 return "HASH$result";
625}
626
627# methods dealing with OID, not groups of them.
628
629# if $OID is not provided one is calculated
630sub set_OID {
631 my $self = shift (@_);
632 my ($OID) = @_;
633
634 my $use_hash_oid = 0;
635 # if an OID wasn't provided claculate one
636 if (!defined $OID) {
637 $OID = "NULL";
638 if ($self->{'OIDtype'} eq "hash") {
639 $use_hash_oid = 1;
640 } elsif ($self->{'OIDtype'} eq "incremental") {
641 $OID = "D" . $OIDcount;
642 $OIDcount ++;
643
644 } elsif ($self->{'OIDtype'} eq "dirname") {
645 $OID = 'J';
646 my $filename = $self->get_source_filename();
647 if (defined($filename)) { # && -e $filename) {
648 $OID = &File::Basename::dirname($filename);
649 if (defined $OID) {
650 $OID = 'J'.&File::Basename::basename($OID);
651 $OID =~ s/\.//; #remove any periods
652 } else {
653 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
654 $use_hash_oid = 1;
655 }
656 } else {
657 print STDERR "Failed to find filename, generating hash id\n";
658 $use_hash_oid = 1;
659 }
660
661 } elsif ($self->{'OIDtype'} eq "assigned") {
662 my $identifier = $self->get_metadata_element ($self->get_top_section(), "dc.Identifier");
663 if (defined $identifier && $identifier ne "") {
664 $OID = "D" . $self->get_metadata_element ($self->get_top_section(), "dc.Identifier");
665 $OID =~ s/\.//; #remove any periods
666 } else {
667 # need a hash id
668 print STDERR "no dc.Identifier found, generating hash id\n";
669 $use_hash_oid = 1;
670 }
671
672 } else {
673 $use_hash_oid = 1;
674 }
675
676 if ($use_hash_oid) {
677
678 # "hash" OID - feed file to hashfile.exe
679 #my $filename = $self->get_source_filename();
680 # we want to use the converted file for hashing if available
681 # cos its quicker
682 my $filename = $self->get_filename_for_hashing();
683
684 if (defined($filename) && -e $filename) {
685 $OID = $self->_calc_OID ($filename);
686 } else {
687 $filename = &util::get_tmp_filename();
688 if (!open (OUTFILE, ">$filename")) {
689 print STDERR "doc::set_OID could not write to $filename\n";
690 } else {
691 $self->output_section('OUTFILE', $self->get_top_section(),
692 undef, 1);
693 close (OUTFILE);
694 }
695 $OID = $self->_calc_OID ($filename);
696 &util::rm ($filename);
697 }
698 }
699 }
700 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
701}
702
703# this uses hashdoc (embedded c thingy) which is faster but still
704# needs a little work to be suffiently stable
705sub ___set_OID {
706 my $self = shift (@_);
707 my ($OID) = @_;
708
709 # if an OID wasn't provided then calculate hash value based on document
710 if (!defined $OID)
711 {
712 my $hash_text = $self->buffer_section_gml($self->get_top_section(),
713 undef, 1);
714 my $hash_len = length($hash_text);
715
716 $OID = &hashdoc::buffer($hash_text,$hash_len);
717 }
718
719 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
720}
721
722# returns the OID for this document
723sub get_OID {
724 my $self = shift (@_);
725 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
726 return $OID if (defined $OID);
727 return "NULL";
728}
729
730sub delete_OID {
731 my $self = shift (@_);
732
733 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
734}
735
736
737# methods for manipulating section names
738
739# returns the name of the top-most section (the top
740# level of the document
741sub get_top_section {
742 my $self = shift (@_);
743
744 return "";
745}
746
747# returns a section
748sub get_parent_section {
749 my $self = shift (@_);
750 my ($section) = @_;
751
752 $section =~ s/(^|\.)\d+$//;
753
754 return $section;
755}
756
757# returns the first child section (or the end child
758# if there isn't any)
759sub get_begin_child {
760 my $self = shift (@_);
761 my ($section) = @_;
762
763 my $section_ptr = $self->_lookup_section($section);
764 return "" unless defined $section_ptr;
765
766 if (defined $section_ptr->{'subsection_order'}->[0]) {
767 return "$section.$section_ptr->{'subsection_order'}->[0]";
768 }
769
770 return $self->get_end_child ($section);
771}
772
773# returns the next child of a parent section
774sub get_next_child {
775 my $self = shift (@_);
776 my ($section) = @_;
777
778 my $parent_section = $self->get_parent_section($section);
779 my $parent_section_ptr = $self->_lookup_section($parent_section);
780 return undef unless defined $parent_section_ptr;
781
782 my ($section_num) = $section =~ /(\d+)$/;
783 return undef unless defined $section_num;
784
785 my $i = 0;
786 my $section_order = $parent_section_ptr->{'subsection_order'};
787 while ($i < scalar(@$section_order)) {
788 last if $section_order->[$i] eq $section_num;
789 $i++;
790 }
791
792 $i++; # the next child
793 if ($i < scalar(@$section_order)) {
794 return $section_order->[$i] if $parent_section eq "";
795 return "$parent_section.$section_order->[$i]";
796 }
797
798 # no more sections in this level
799 return undef;
800}
801
802# returns a reference to a list of children
803sub get_children {
804 my $self = shift (@_);
805 my ($section) = @_;
806
807 my $section_ptr = $self->_lookup_section($section);
808 return [] unless defined $section_ptr;
809
810 my @children = @{$section_ptr->{'subsection_order'}};
811
812 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
813 return \@children;
814}
815
816# returns the child section one past the last one (which
817# is coded as "0")
818sub get_end_child {
819 my $self = shift (@_);
820 my ($section) = @_;
821
822 return $section . ".0" unless $section eq "";
823 return "0";
824}
825
826# returns the next section in book order
827sub get_next_section {
828 my $self = shift (@_);
829 my ($section) = @_;
830
831 return undef unless defined $section;
832
833 my $section_ptr = $self->_lookup_section($section);
834 return undef unless defined $section_ptr;
835
836 # first try to find first child
837 if (defined $section_ptr->{'subsection_order'}->[0]) {
838 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
839 return "$section.$section_ptr->{'subsection_order'}->[0]";
840 }
841
842 do {
843 # try to find sibling
844 my $next_child = $self->get_next_child ($section);
845 return $next_child if (defined $next_child);
846
847 # move up one level
848 $section = $self->get_parent_section ($section);
849 } while $section =~ /\d/;
850
851 return undef;
852}
853
854sub is_leaf_section {
855 my $self = shift (@_);
856 my ($section) = @_;
857
858 my $section_ptr = $self->_lookup_section($section);
859 return 1 unless defined $section_ptr;
860
861 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
862}
863
864# methods for dealing with sections
865
866# returns the name of the inserted section
867sub insert_section {
868 my $self = shift (@_);
869 my ($before_section) = @_;
870
871 # get the child to insert before and its parent section
872 my $parent_section = "";
873 my $before_child = "0";
874 my @before_section = split (/\./, $before_section);
875 if (scalar(@before_section) > 0) {
876 $before_child = pop (@before_section);
877 $parent_section = join (".", @before_section);
878 }
879
880 my $parent_section_ptr = $self->_lookup_section($parent_section);
881 if (!defined $parent_section_ptr) {
882 print STDERR "doc::insert_section couldn't find parent section " .
883 "$parent_section\n";
884 return;
885 }
886
887 # get the next section number
888 my $section_num = $parent_section_ptr->{'next_subsection'}++;
889
890 my $i = 0;
891 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
892 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
893 $i++;
894 }
895
896 # insert the section number into the order list
897 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
898
899 # add this section to the parent section
900 my $section_ptr = {'subsection_order'=>[],
901 'next_subsection'=>1,
902 'subsections'=>{},
903 'metadata'=>[],
904 'text'=>""};
905 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
906
907 # work out the full section number
908 my $section = $parent_section;
909 $section .= "." unless $section eq "";
910 $section .= $section_num;
911
912 return $section;
913}
914
915# creates a pre-named section
916sub create_named_section {
917 my $self = shift (@_);
918 my ($mastersection) = @_;
919
920 my ($num);
921 my $section = $mastersection;
922 my $sectionref = $self;
923
924 while ($section ne "") {
925 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
926 $num =~ s/^0+(\d)/$1/; # remove leading 0s
927 $section = "" unless defined $section;
928
929 if (defined $num) {
930 if (!defined $sectionref->{'subsections'}->{$num}) {
931 push (@{$sectionref->{'subsection_order'}}, $num);
932 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
933 'next_subsection'=>1,
934 'subsections'=>{},
935 'metadata'=>[],
936 'text'=>""};
937 if ($num >= $sectionref->{'next_subsection'}) {
938 $sectionref->{'next_subsection'} = $num + 1;
939 }
940 }
941 $sectionref = $sectionref->{'subsections'}->{$num};
942
943 } else {
944 print STDERR "doc::create_named_section couldn't create section ";
945 print STDERR "$mastersection\n";
946 last;
947 }
948 }
949}
950
951# returns a reference to a list of subsections
952sub list_subsections {
953 my $self = shift (@_);
954 my ($section) = @_;
955
956 my $section_ptr = $self->_lookup_section ($section);
957 if (!defined $section_ptr) {
958 print STDERR "doc::list_subsections couldn't find section $section\n";
959 return [];
960 }
961
962 return [@{$section_ptr->{'subsection_order'}}];
963}
964
965sub delete_section {
966 my $self = shift (@_);
967 my ($section) = @_;
968
969# my $section_ptr = {'subsection_order'=>[],
970# 'next_subsection'=>1,
971# 'subsections'=>{},
972# 'metadata'=>[],
973# 'text'=>""};
974
975 # if this is the top section reset everything
976 if ($section eq "") {
977 $self->{'subsection_order'} = [];
978 $self->{'subsections'} = {};
979 $self->{'metadata'} = [];
980 $self->{'text'} = "";
981 return;
982 }
983
984 # find the parent of the section to delete
985 my $parent_section = "";
986 my $child = "0";
987 my @section = split (/\./, $section);
988 if (scalar(@section) > 0) {
989 $child = pop (@section);
990 $parent_section = join (".", @section);
991 }
992
993 my $parent_section_ptr = $self->_lookup_section($parent_section);
994 if (!defined $parent_section_ptr) {
995 print STDERR "doc::delete_section couldn't find parent section " .
996 "$parent_section\n";
997 return;
998 }
999
1000 # remove this section from the subsection_order list
1001 my $i = 0;
1002 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
1003 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
1004 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
1005 last;
1006 }
1007 $i++;
1008 }
1009
1010 # remove this section from the subsection hash
1011 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
1012 undef $parent_section_ptr->{'subsections'}->{$child};
1013 }
1014}
1015
1016#--
1017# methods for dealing with metadata
1018
1019# set_metadata_element and get_metadata_element are for metadata
1020# which should only have one value. add_meta_data and get_metadata
1021# are for metadata which can have more than one value.
1022
1023# returns the first metadata value which matches field
1024
1025# This version of get metadata element works much like the one above,
1026# except it allows for the namespace portion of a metadata element to
1027# be ignored, thus if you are searching for dc.Title, the first piece
1028# of matching metadata ending with the name Title (once any namespace
1029# is removed) would be returned.
1030# 28-11-2003 John Thompson
1031sub get_metadata_element {
1032 my $self = shift (@_);
1033 my ($section, $field, $ignore_namespace) = @_;
1034 my ($data);
1035
1036 $ignore_namespace = 0 unless defined $ignore_namespace;
1037
1038 my $section_ptr = $self->_lookup_section($section);
1039 if (!defined $section_ptr) {
1040 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
1041 return;
1042 }
1043
1044 # Remove the any namespace if we are being told to ignore them
1045 if($ignore_namespace) {
1046 $field =~ s/^\w*\.//;
1047 }
1048
1049 foreach $data (@{$section_ptr->{'metadata'}}) {
1050
1051 my $data_name = $data->[0];
1052 # Remove the any namespace if we are being told to ignore them
1053 if($ignore_namespace) {
1054 $data_name =~ s/^\w*\.//;
1055 }
1056
1057 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
1058 }
1059
1060 return undef; # was not found
1061}
1062
1063# returns a list of the form [value1, value2, ...]
1064sub get_metadata {
1065 my $self = shift (@_);
1066 my ($section, $field, $ignore_namespace) = @_;
1067 my ($data);
1068
1069 $ignore_namespace = 0 unless defined $ignore_namespace;
1070
1071 my $section_ptr = $self->_lookup_section($section);
1072 if (!defined $section_ptr) {
1073 print STDERR "doc::get_metadata couldn't find section ",
1074 $section, "\n";
1075 return;
1076 }
1077
1078 # Remove the any namespace if we are being told to ignore them
1079 if($ignore_namespace) {
1080 $field =~ s/^\w*\.//;
1081 }
1082
1083 my @metadata = ();
1084 foreach $data (@{$section_ptr->{'metadata'}}) {
1085
1086 my $data_name = $data->[0];
1087 # Remove the any namespace if we are being told to ignore them
1088 if($ignore_namespace) {
1089 $data_name =~ s/^\w*\.//;
1090 }
1091
1092 push (@metadata, $data->[1]) if ($data_name eq $field);
1093 }
1094
1095 return \@metadata;
1096}
1097
1098# returns a list of the form [[field,value],[field,value],...]
1099sub get_all_metadata {
1100 my $self = shift (@_);
1101 my ($section) = @_;
1102
1103 my $section_ptr = $self->_lookup_section($section);
1104 if (!defined $section_ptr) {
1105 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
1106 return;
1107 }
1108
1109 return $section_ptr->{'metadata'};
1110}
1111
1112# $value is optional
1113sub delete_metadata {
1114 my $self = shift (@_);
1115 my ($section, $field, $value) = @_;
1116
1117 my $section_ptr = $self->_lookup_section($section);
1118 if (!defined $section_ptr) {
1119 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
1120 return;
1121 }
1122
1123 my $i = 0;
1124 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1125 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1126 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1127 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1128 } else {
1129 $i++;
1130 }
1131 }
1132}
1133
1134sub delete_all_metadata {
1135 my $self = shift (@_);
1136 my ($section) = @_;
1137
1138 my $section_ptr = $self->_lookup_section($section);
1139 if (!defined $section_ptr) {
1140 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1141 return;
1142 }
1143
1144 $section_ptr->{'metadata'} = [];
1145}
1146
1147sub set_metadata_element {
1148 my $self = shift (@_);
1149 my ($section, $field, $value) = @_;
1150
1151 $self->set_utf8_metadata_element ($section, $field,
1152 &unicode::ascii2utf8(\$value));
1153}
1154
1155# set_utf8_metadata_element assumes the text has already been
1156# converted to the UTF-8 encoding.
1157sub set_utf8_metadata_element {
1158 my $self = shift (@_);
1159 my ($section, $field, $value) = @_;
1160
1161 $self->delete_metadata ($section, $field);
1162 $self->add_utf8_metadata ($section, $field, $value);
1163}
1164
1165
1166# add_metadata assumes the text is in (extended) ascii form. For
1167# text which has already been converted to the UTF-8 format use
1168# add_utf8_metadata.
1169sub add_metadata {
1170 my $self = shift (@_);
1171 my ($section, $field, $value) = @_;
1172
1173 $self->add_utf8_metadata ($section, $field,
1174 &unicode::ascii2utf8(\$value));
1175}
1176
1177sub add_utf8_metadata {
1178 my $self = shift (@_);
1179 my ($section, $field, $value) = @_;
1180
1181 my $section_ptr = $self->_lookup_section($section);
1182 if (!defined $section_ptr) {
1183 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1184 return;
1185 }
1186 if (!defined $value) {
1187 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1188 return;
1189 }
1190 if (!defined $field) {
1191 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1192 return;
1193 }
1194
1195 #print STDERR "###$field=$value\n";
1196 # double check that the value is utf-8
1197 if (unicode::ensure_utf8(\$value)) {
1198 print STDERR "doc::add_utf8_metadata: warning: '$field' wasn't utf8\n";
1199 }
1200
1201 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1202}
1203
1204
1205# methods for dealing with text
1206
1207# returns the text for a section
1208sub get_text {
1209 my $self = shift (@_);
1210 my ($section) = @_;
1211
1212 my $section_ptr = $self->_lookup_section($section);
1213 if (!defined $section_ptr) {
1214 print STDERR "doc::get_text couldn't find section " .
1215 "$section\n";
1216 return "";
1217 }
1218
1219 return $section_ptr->{'text'};
1220}
1221
1222# returns the (utf-8 encoded) length of the text for a section
1223sub get_text_length {
1224 my $self = shift (@_);
1225 my ($section) = @_;
1226
1227 my $section_ptr = $self->_lookup_section($section);
1228 if (!defined $section_ptr) {
1229 print STDERR "doc::get_text_length couldn't find section " .
1230 "$section\n";
1231 return 0;
1232 }
1233
1234 return length ($section_ptr->{'text'});
1235}
1236
1237sub delete_text {
1238 my $self = shift (@_);
1239 my ($section) = @_;
1240
1241 my $section_ptr = $self->_lookup_section($section);
1242 if (!defined $section_ptr) {
1243 print STDERR "doc::delete_text couldn't find section " .
1244 "$section\n";
1245 return;
1246 }
1247
1248 $section_ptr->{'text'} = "";
1249}
1250
1251# add_text assumes the text is in (extended) ascii form. For
1252# text which has been already converted to the UTF-8 format
1253# use add_utf8_text.
1254sub add_text {
1255 my $self = shift (@_);
1256 my ($section, $text) = @_;
1257
1258 # convert the text to UTF-8 encoded unicode characters
1259 # and add the text
1260 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1261}
1262
1263
1264# add_utf8_text assumes the text to be added has already
1265# been converted to the UTF-8 encoding. For ascii text use
1266# add_text
1267sub add_utf8_text {
1268 my $self = shift (@_);
1269 my ($section, $text) = @_;
1270
1271 my $section_ptr = $self->_lookup_section($section);
1272 if (!defined $section_ptr) {
1273 print STDERR "doc::add_utf8_text couldn't find section " .
1274 "$section\n";
1275 return;
1276 }
1277
1278 $section_ptr->{'text'} .= $text;
1279}
1280
1281
1282# methods for dealing with associated files
1283
1284# a file is associated with a document, NOT a section.
1285# if section is defined it is noted in the data structure
1286# only so that files associated from a particular section
1287# may be removed later (using delete_section_assoc_files)
1288sub associate_file {
1289 my $self = shift (@_);
1290 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1291 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1292
1293 # remove all associated files with the same name
1294 $self->delete_assoc_file ($assoc_filename);
1295
1296 push (@{$self->{'associated_files'}},
1297 [$real_filename, $assoc_filename, $mime_type, $section]);
1298}
1299
1300# returns a list of associated files in the form
1301# [[real_filename, assoc_filename, mimetype], ...]
1302sub get_assoc_files {
1303 my $self = shift (@_);
1304
1305 return $self->{'associated_files'};
1306}
1307
1308sub delete_section_assoc_files {
1309 my $self = shift (@_);
1310 my ($section) = @_;
1311
1312 my $i=0;
1313 while ($i < scalar (@{$self->{'associated_files'}})) {
1314 if (defined $self->{'associated_files'}->[$i]->[3] &&
1315 $self->{'associated_files'}->[$i]->[3] eq $section) {
1316 splice (@{$self->{'associated_files'}}, $i, 1);
1317 } else {
1318 $i++;
1319 }
1320 }
1321}
1322
1323sub delete_assoc_file {
1324 my $self = shift (@_);
1325 my ($assoc_filename) = @_;
1326
1327 my $i=0;
1328 while ($i < scalar (@{$self->{'associated_files'}})) {
1329 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1330 splice (@{$self->{'associated_files'}}, $i, 1);
1331 } else {
1332 $i++;
1333 }
1334 }
1335}
1336
1337sub reset_nextsection_ptr {
1338 my $self = shift (@_);
1339 my ($section) = @_;
1340
1341 my $section_ptr = $self->_lookup_section($section);
1342 $section_ptr->{'next_subsection'} = 1;
1343}
1344
13451;
Note: See TracBrowser for help on using the repository browser.