source: trunk/gsdl/perllib/doc.pm@ 9038

Last change on this file since 9038 was 9038, checked in by kjdon, 19 years ago

when doing hash ids, don't hash on the file if it has zero size

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 36.2 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use unicode;
37use util;
38use ghtml;
39use File::stat;
40##use hashdoc;
41
42# the document type may be indexed_doc, nonindexed_doc, or
43# classification
44
45my $OIDcount = 0;
46
47sub new {
48 my $class = shift (@_);
49 my ($source_filename, $doc_type) = @_;
50
51 my $self = bless {'associated_files'=>[],
52 'subsection_order'=>[],
53 'next_subsection'=>1,
54 'subsections'=>{},
55 'metadata'=>[],
56 'text'=>"",
57 'OIDtype'=>"hash"}, $class;
58
59 #GRB: Added code to set lastmodified for OAI purposes
60 if ((defined $doc_type) && (defined $source_filename) && (-e $source_filename)) {
61 my $file_stat = stat($source_filename);
62 my $mtime = $file_stat->mtime;
63 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
64 }
65 #GRB: end inserted code
66
67 if (defined $source_filename) {
68 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
69
70 if (defined $collect_dir) {
71 my $dirsep = &util::get_dirsep();
72 if ($collect_dir !~ m/$dirsep$/) {
73 $collect_dir .= $dirsep;
74 }
75
76 $collect_dir =~ s/\\/\\\\/g; # escape DOS style file separator
77
78 if ($source_filename =~ /^$collect_dir(.*)$/) {
79 $source_filename = $1;
80 }
81 }
82
83 $self->set_source_filename ($source_filename);
84 }
85
86 $self->set_doc_type ($doc_type) if defined $doc_type;
87
88 return $self;
89}
90
91# clone the $self object
92sub duplicate {
93 my $self = shift (@_);
94
95 my $newobj = {};
96
97 foreach my $k (keys %$self) {
98 $newobj->{$k} = &clone ($self->{$k});
99 }
100
101 bless $newobj, ref($self);
102 return $newobj;
103}
104
105sub clone {
106 my ($from) = @_;
107 my $type = ref ($from);
108
109 if ($type eq "HASH") {
110 my $to = {};
111 foreach my $key (keys %$from) {
112 $to->{$key} = &clone ($from->{$key});
113 }
114 return $to;
115 } elsif ($type eq "ARRAY") {
116 my $to = [];
117 foreach my $v (@$from) {
118 push (@$to, &clone ($v));
119 }
120 return $to;
121 } else {
122 return $from;
123 }
124}
125
126sub set_OIDtype {
127 my $self = shift (@_);
128 my ($type) = @_;
129
130 if ($type =~ /^(hash|incremental|dirname|assigned)$/) {
131 $self->{'OIDtype'} = $type;
132 } else {
133 $self->{'OIDtype'} = "hash";
134 }
135}
136
137sub set_source_filename {
138 my $self = shift (@_);
139 my ($source_filename) = @_;
140
141 $self->set_metadata_element ($self->get_top_section(),
142 "gsdlsourcefilename",
143 $source_filename);
144}
145
146sub set_converted_filename {
147 my $self = shift (@_);
148 my ($converted_filename) = @_;
149
150 $self->set_metadata_element ($self->get_top_section(),
151 "gsdlconvertedfilename",
152 $converted_filename);
153}
154
155
156# returns the source_filename as it was provided
157sub get_source_filename {
158 my $self = shift (@_);
159
160 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
161}
162
163# returns converted filename if available else returns source filename
164sub get_filename_for_hashing {
165 my $self = shift (@_);
166
167 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
168
169 if (!defined $filename) {
170 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
171 }
172 return $filename;
173}
174
175sub set_doc_type {
176 my $self = shift (@_);
177 my ($doc_type) = @_;
178
179 $self->set_metadata_element ($self->get_top_section(),
180 "gsdldoctype",
181 $doc_type);
182}
183
184# returns the source_filename as it was provided
185# the default of "indexed_doc" is used if no document
186# type was provided
187sub get_doc_type {
188 my $self = shift (@_);
189
190 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
191 return $doc_type if (defined $doc_type);
192 return "indexed_doc";
193}
194
195sub _escape_text {
196 my ($text) = @_;
197
198 # special characters in the gml encoding
199 $text =~ s/&/&/g; # this has to be first...
200 $text =~ s/</&lt;/g;
201 $text =~ s/>/&gt;/g;
202 $text =~ s/\"/&quot;/g;
203
204 return $text;
205}
206
207sub buffer_section_xml {
208 my $self = shift (@_);
209 my ($section) = @_;
210
211 my $section_ptr = $self->_lookup_section ($section);
212 return "" unless defined $section_ptr;
213
214 my $all_text = "<Section>\n";
215 $all_text .= " <Description>\n";
216
217 # output metadata
218 foreach my $data (@{$section_ptr->{'metadata'}}) {
219 my $escaped_value = &_escape_text($data->[1]);
220 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
221 }
222
223 $all_text .= " </Description>\n";
224
225 # output the text
226 $all_text .= " <Content>";
227 $all_text .= &_escape_text($section_ptr->{'text'});
228 $all_text .= "</Content>\n";
229
230 # output all the subsections
231 foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
232 $all_text .= $self->buffer_section_xml("$section.$subsection");
233 }
234
235 $all_text .= "</Section>\n";
236
237 # make sure no nasty control characters have snuck through
238 # (XML::Parser will barf on anything it doesn't consider to be
239 # valid UTF-8 text, including things like \c@, \cC etc.)
240 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
241
242 return $all_text;
243}
244
245sub buffer_txt_section_xml {
246 my $self = shift(@_);
247 my ($section) = @_;
248
249 my $section_ptr = $self->_lookup_section ($section);
250
251 return "" unless defined $section_ptr;
252
253 my $all_text = "<Section>\n";
254
255 ##output the text
256 #$all_text .= " <Content>";
257 $all_text .= &_escape_text($section_ptr->{'text'});
258 #$all_text .= " </Content>\n";
259
260
261 #output all the subsections
262 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
263 $all_text .= $self->buffer_txt_section_xml("$section.$subsection");
264 }
265
266 $all_text .= "</Section>\n";
267
268
269 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
270 return $all_text;
271}
272
273sub buffer_mets_fileSection_section_xml() {
274 my $self = shift(@_);
275 my ($section) = @_;
276
277 #$section="" unless defined $section;
278
279
280 my $section_ptr=$self->_lookup_section($section);
281 return "" unless defined $section_ptr;
282
283
284 #**output fileSection by sections
285 my $section_num ="1". $section;
286
287
288 #my $filePath = $doc_Dir . '/doctxt.xml';
289
290 my $filePath = 'doctxt.xml';
291
292 #**output the fileSection details
293 my $all_text = ' <mets:fileGrp ID="FILEGROUP_PRELUDE' . $section_num . '">'. "\n";
294 $all_text .= ' <mets:file MIMETYPE="text/xml" ID="FILE'.$section_num. '">'. "\n";
295 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$filePath.'#xpointer(/Section[';
296
297 my $xpath = "1".$section;
298
299 $xpath =~ s/\./]\/Section[/g;
300
301 $all_text .= $xpath;
302
303 $all_text .= ']/text())" />' . "\n";
304 $all_text .= " </mets:file>\n";
305 $all_text .= " </mets:fileGrp>\n";
306
307
308 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
309 $all_text .= $self->buffer_mets_fileSection_section_xml("$section.$subsection");
310 }
311
312 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
313
314 return $all_text;
315}
316
317sub buffer_mets_fileWhole_section_xml(){
318 my $self = shift(@_);
319 my ($section) = @_;
320
321 my $section_ptr = $self-> _lookup_section($section);
322 return "" unless defined $section_ptr;
323
324 my $all_text="" unless defined $all_txt;
325
326 my ($dirPath)="" unless defined $dirPath;
327 my $fileID=0;
328
329 #** output the fileSection for the whole section
330 #*** get the sourcefile and associative file
331
332 foreach my $data (@{$section_ptr->{'metadata'}}){
333 my $escaped_value = &_escape_text($data->[1]);
334 if ($data->[0] eq "gsdlsourcefilename") {
335 ($dirPath) = $escaped_value =~ m/^(.*)[\/\\][^\/\\]*$/;
336
337 $all_text .= ' <mets:fileGrp ID="default">'."\n";
338 ++$fileID;
339 $all_text .= ' <mets:file MIMETYPE="text/xml" ID="default.'.$fileID.'">'. "\n";
340
341 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$data->[1].'" />'."\n";
342
343 $all_text .= " </mets:file>\n";
344 }
345
346 if ($data->[0] eq "gsdlassocfile"){
347 $escaped_value =~ m/^(.*?):(.*):$/;
348
349 my $assfilePath = $dirPath . '/'. $1;
350 ++$fileID;
351
352 $all_text .= ' <mets:file MIMETYPE="'.$2.'" ID="default.'.$fileID. '">'. "\n";
353 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$assfilePath.'" />'."\n";
354
355 $all_text .= " </mets:file>\n";
356 }
357 }
358 $all_text .= " </mets:fileGrp>\n";
359
360
361 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
362
363 return $all_text;
364}
365
366sub buffer_mets_StruMapSection_section_xml(){
367 my $self = shift(@_);
368 my ($section, $order_numref) = @_;
369
370 $section="" unless defined $section;
371
372
373 my $section_ptr=$self->_lookup_section($section);
374 return "" unless defined $section_ptr;
375
376
377 #***output fileSection by Sections
378 my $section_num ="1". $section;
379 my $dmd_num = $section_num;
380
381 ##**output the dmdSection details
382 #if ($section_num eq "1") {
383 # $dmd_num = "0";
384 #}
385
386 #**output the StruMap details
387
388 my $all_text = ' <mets:div ID="DS'. $section_num .'" TYPE="Section" ORDER="'.$$order_numref++.'" ORDERLABEL="'. $section_num .'" LABEL="';
389
390 $all_text .= $section_num . '" DMDID="DM'.$dmd_num.'">'. "\n";
391
392 $all_text .= ' <mets:fptr FILEID="FILEGROUP_PRELUDE'.$section_num.'" />'. "\n";
393
394
395 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
396 $all_text .= $self->buffer_mets_StruMapSection_section_xml("$section.$subsection", $order_numref);
397 }
398
399 $all_text .= " </mets:div>\n";
400
401 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
402
403 return $all_text;
404}
405
406
407sub buffer_mets_StruMapWhole_section_xml(){
408 my $self = shift(@_);
409 my ($section) = @_;
410
411 my $section_ptr = $self-> _lookup_section($section);
412 return "" unless defined $section_ptr;
413
414 my $all_text="" unless defined $all_txt;
415 my $fileID=0;
416 my $order_num = 0;
417
418 $all_text .= '<mets:structMap ID="All" TYPE="Whole Document" LABEL="All">'."\n";
419 $all_text .= ' <mets:div ID="DSAll" TYPE="Document" ORDER="'.$order_num.'" ORDERLABEL="All" LABEL="Whole Documemt" DMDID="DM1">' . "\n";
420
421
422 #** output the StruMapSection for the whole section
423 #*** get the sourcefile and associative file
424
425 foreach my $data (@{$section_ptr->{'metadata'}}){
426 my $escaped_value = &_escape_text($data->[1]);
427
428 if ($data->[0] eq "gsdlsourcefilename") {
429 ++$fileID;
430 $all_text .= ' <mets:fptr FILEID="default.'.$fileID.'" />'."\n";
431 }
432
433 if ($data->[0] eq "gsdlassocfile"){
434 ++$fileID;
435 $all_text .= ' <mets:fptr FILEID="default.'.$fileID. '" />'. "\n";
436 }
437 }
438 $all_text .= " </mets:div>\n";
439
440 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
441
442 return $all_text;
443}
444
445
446sub buffer_mets_dmdSection_section_xml(){
447 my $self = shift(@_);
448 my ($section) = @_;
449
450 $section="" unless defined $section;
451
452 my $section_ptr=$self->_lookup_section($section);
453 return "" unless defined $section_ptr;
454
455 #***convert section number
456 my $section_num ="1". $section;
457 my $dmd_num = $section_num;
458
459 # #**output the dmdSection details
460 # if ($section_num eq "1") {
461 # $dmd_num = "0";
462 # }
463 my $all_text = '<mets:dmdSec ID="DM'.$dmd_num.'" GROUPID="'.$section_num.'">'. "\n";
464 $all_text .= ' <mets:mdWrap MDTYPE="OTHER" OTHERMDTYPE="gsdl3" ID="gsdl'.$section_num.'">'."\n";
465 $all_text .= " <mets:xmlData>\n";
466 foreach my $data (@{$section_ptr->{'metadata'}}){
467 my $escaped_value = &_escape_text($data->[1]);
468 $all_text .= ' <gsdl3:Metadata name="'. $data->[0].'">'. $escaped_value. "</gsdl3:Metadata>\n";
469 if ($data->[0] eq "dc.Title") {
470 $all_text .= ' <gsdl3:Metadata name="Title">'. $escaped_value."</gsdl3:Metadata>\n";
471 }
472 }
473
474 $all_text .= " </mets:xmlData>\n";
475 $all_text .= " </mets:mdWrap>\n";
476 $all_text .= "</mets:dmdSec>\n";
477
478 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
479 $all_text .= $self->buffer_mets_dmdSection_section_xml("$section.$subsection");
480 }
481
482 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
483
484 return $all_text;
485}
486
487sub output_section {
488 my $self = shift (@_);
489 my ($handle, $section) = @_;
490
491 print $handle $self->buffer_section_xml($section);
492}
493
494#*** print out DSpace dublin_core metadata section
495sub output_dspace_section {
496 my $self = shift (@_);
497 my ($handle, $section) = @_;
498
499 my $section_ptr = $self->_lookup_section ($section);
500 return "" unless defined $section_ptr;
501
502 my $all_text = "<Section>\n";
503 $all_text .= " <Description>\n";
504
505 # output metadata
506 foreach my $data (@{$section_ptr->{'metadata'}}) {
507 my $escaped_value = &_escape_text($data->[1]);
508 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
509 }
510
511 $all_text .= " </Description>\n";
512 $all_text .= "</Section>\n";
513
514 # make sure no nasty control characters have snuck through
515 # (XML::Parser will barf on anything it doesn't consider to be
516 # valid UTF-8 text, including things like \c@, \cC etc.)
517 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
518
519 return $all_text;
520}
521
522#*** print out doctxt.xml file
523sub output_txt_section {
524 my $self = shift (@_);
525 my ($handle, $section) = @_;
526
527 print $handle $self->buffer_txt_section_xml($section);
528}
529
530#*** print out docmets.xml file
531sub output_mets_section {
532 my $self = shift(@_);
533 my ($handle, $section) = @_;
534
535 #***print out the dmdSection
536 print $handle $self->buffer_mets_dmdSection_section_xml($section);
537
538 #***print out the fileSection by sections
539 print $handle "<mets:fileSec>\n";
540 print $handle $self->buffer_mets_fileSection_section_xml($section);
541
542 #***print out the whole fileSection
543 print $handle $self->buffer_mets_fileWhole_section_xml($section);
544 print $handle "</mets:fileSec>\n";
545
546 #***print out the StruMapSection by sections
547 print $handle '<mets:structMap ID="Section" TYPE="Section" LABEL="Section">' . "\n";
548 my $order_num=0;
549 print $handle $self->buffer_mets_StruMapSection_section_xml($section, \$order_num);
550 print $handle "</mets:structMap>\n";
551 print $handle $self->buffer_mets_StruMapWhole_section_xml($section);
552 print $handle "</mets:structMap>\n";
553}
554
555#*** print out dublin_core.xml file
556sub output_dc_section {
557 my $self = shift(@_);
558 my ($handle, $section, $doc_Dir) = @_;
559
560 #***print out the dublin_core
561 $section="" unless defined $section;
562
563 my $section_ptr=$self->_lookup_section($section);
564 return "" unless defined $section_ptr;
565 my $all_text="";
566 foreach my $data (@{$section_ptr->{'metadata'}}){
567 my $escaped_value = &_escape_text($data->[1]);
568 if ($data->[0]=~ /^dc/) {
569 $data->[0] =~ tr/[A-Z]/[a-z]/;
570 $data->[0] =~ /^dc\.(.*)/;
571 my $dc_element = $1;
572 #$all_text .= ' <dcvalue element="'. $data->[0].'" qualifier="#####">'. $escaped_value. "</dcvalue>\n";
573 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
574 }
575 }
576 if ($all_text eq "") {
577 $all_text .= " There is no Dublin Core metatdata in this document\n";
578 }
579 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
580
581 print $handle $all_text;
582}
583
584# look up the reference to the a particular section
585sub _lookup_section {
586 my $self = shift (@_);
587 my ($section) = @_;
588
589 my ($num);
590 my $sectionref = $self;
591
592 while (defined $section && $section ne "") {
593 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
594 $num =~ s/^0+(\d)/$1/; # remove leading 0s
595 $section = "" unless defined $section;
596
597 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
598 $sectionref = $sectionref->{'subsections'}->{$num};
599 } else {
600 return undef;
601 }
602 }
603
604 return $sectionref;
605}
606
607# calculate OID by hashing the contents of the document
608sub _calc_OID {
609 my $self = shift (@_);
610 my ($filename) = @_;
611
612 my $osexe = &util::get_os_exe();
613
614 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
615 $ENV{'GSDLOS'},"hashfile$osexe");
616
617 my $result = "NULL";
618
619 if (-e "$hashfile_exe") {
620# $result = `\"$hashfile_exe\" \"$filename\"`;
621 $result = `hashfile$osexe \"$filename\"`;
622 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
623
624 } else {
625 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
626 }
627 return "HASH$result";
628}
629
630# methods dealing with OID, not groups of them.
631
632# if $OID is not provided one is calculated
633sub set_OID {
634 my $self = shift (@_);
635 my ($OID) = @_;
636
637 my $use_hash_oid = 0;
638 # if an OID wasn't provided claculate one
639 if (!defined $OID) {
640 $OID = "NULL";
641 if ($self->{'OIDtype'} eq "hash") {
642 $use_hash_oid = 1;
643 } elsif ($self->{'OIDtype'} eq "incremental") {
644 $OID = "D" . $OIDcount;
645 $OIDcount ++;
646
647 } elsif ($self->{'OIDtype'} eq "dirname") {
648 $OID = 'J';
649 my $filename = $self->get_source_filename();
650 if (defined($filename)) { # && -e $filename) {
651 $OID = &File::Basename::dirname($filename);
652 if (defined $OID) {
653 $OID = 'J'.&File::Basename::basename($OID);
654 $OID =~ s/\.//; #remove any periods
655 } else {
656 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
657 $use_hash_oid = 1;
658 }
659 } else {
660 print STDERR "Failed to find filename, generating hash id\n";
661 $use_hash_oid = 1;
662 }
663
664 } elsif ($self->{'OIDtype'} eq "assigned") {
665 my $identifier = $self->get_metadata_element ($self->get_top_section(), "dc.Identifier");
666 if (defined $identifier && $identifier ne "") {
667 $OID = "D" . $self->get_metadata_element ($self->get_top_section(), "dc.Identifier");
668 $OID =~ s/\.//; #remove any periods
669 } else {
670 # need a hash id
671 print STDERR "no dc.Identifier found, generating hash id\n";
672 $use_hash_oid = 1;
673 }
674
675 } else {
676 $use_hash_oid = 1;
677 }
678
679 if ($use_hash_oid) {
680
681 # "hash" OID - feed file to hashfile.exe
682 #my $filename = $self->get_source_filename();
683 # we want to use the converted file for hashing if available
684 # cos its quicker
685 my $filename = $self->get_filename_for_hashing();
686
687 # -z: don't want to hash on the file if it is zero size
688 if (defined($filename) && -e $filename && !-z $filename) {
689 $OID = $self->_calc_OID ($filename);
690 } else {
691 $filename = &util::get_tmp_filename();
692 if (!open (OUTFILE, ">$filename")) {
693 print STDERR "doc::set_OID could not write to $filename\n";
694 } else {
695 $self->output_section('OUTFILE', $self->get_top_section(),
696 undef, 1);
697 close (OUTFILE);
698 }
699 $OID = $self->_calc_OID ($filename);
700 &util::rm ($filename);
701 }
702 }
703 }
704 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
705}
706
707# this uses hashdoc (embedded c thingy) which is faster but still
708# needs a little work to be suffiently stable
709sub ___set_OID {
710 my $self = shift (@_);
711 my ($OID) = @_;
712
713 # if an OID wasn't provided then calculate hash value based on document
714 if (!defined $OID)
715 {
716 my $hash_text = $self->buffer_section_gml($self->get_top_section(),
717 undef, 1);
718 my $hash_len = length($hash_text);
719
720 $OID = &hashdoc::buffer($hash_text,$hash_len);
721 }
722
723 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
724}
725
726# returns the OID for this document
727sub get_OID {
728 my $self = shift (@_);
729 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
730 return $OID if (defined $OID);
731 return "NULL";
732}
733
734sub delete_OID {
735 my $self = shift (@_);
736
737 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
738}
739
740
741# methods for manipulating section names
742
743# returns the name of the top-most section (the top
744# level of the document
745sub get_top_section {
746 my $self = shift (@_);
747
748 return "";
749}
750
751# returns a section
752sub get_parent_section {
753 my $self = shift (@_);
754 my ($section) = @_;
755
756 $section =~ s/(^|\.)\d+$//;
757
758 return $section;
759}
760
761# returns the first child section (or the end child
762# if there isn't any)
763sub get_begin_child {
764 my $self = shift (@_);
765 my ($section) = @_;
766
767 my $section_ptr = $self->_lookup_section($section);
768 return "" unless defined $section_ptr;
769
770 if (defined $section_ptr->{'subsection_order'}->[0]) {
771 return "$section.$section_ptr->{'subsection_order'}->[0]";
772 }
773
774 return $self->get_end_child ($section);
775}
776
777# returns the next child of a parent section
778sub get_next_child {
779 my $self = shift (@_);
780 my ($section) = @_;
781
782 my $parent_section = $self->get_parent_section($section);
783 my $parent_section_ptr = $self->_lookup_section($parent_section);
784 return undef unless defined $parent_section_ptr;
785
786 my ($section_num) = $section =~ /(\d+)$/;
787 return undef unless defined $section_num;
788
789 my $i = 0;
790 my $section_order = $parent_section_ptr->{'subsection_order'};
791 while ($i < scalar(@$section_order)) {
792 last if $section_order->[$i] eq $section_num;
793 $i++;
794 }
795
796 $i++; # the next child
797 if ($i < scalar(@$section_order)) {
798 return $section_order->[$i] if $parent_section eq "";
799 return "$parent_section.$section_order->[$i]";
800 }
801
802 # no more sections in this level
803 return undef;
804}
805
806# returns a reference to a list of children
807sub get_children {
808 my $self = shift (@_);
809 my ($section) = @_;
810
811 my $section_ptr = $self->_lookup_section($section);
812 return [] unless defined $section_ptr;
813
814 my @children = @{$section_ptr->{'subsection_order'}};
815
816 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
817 return \@children;
818}
819
820# returns the child section one past the last one (which
821# is coded as "0")
822sub get_end_child {
823 my $self = shift (@_);
824 my ($section) = @_;
825
826 return $section . ".0" unless $section eq "";
827 return "0";
828}
829
830# returns the next section in book order
831sub get_next_section {
832 my $self = shift (@_);
833 my ($section) = @_;
834
835 return undef unless defined $section;
836
837 my $section_ptr = $self->_lookup_section($section);
838 return undef unless defined $section_ptr;
839
840 # first try to find first child
841 if (defined $section_ptr->{'subsection_order'}->[0]) {
842 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
843 return "$section.$section_ptr->{'subsection_order'}->[0]";
844 }
845
846 do {
847 # try to find sibling
848 my $next_child = $self->get_next_child ($section);
849 return $next_child if (defined $next_child);
850
851 # move up one level
852 $section = $self->get_parent_section ($section);
853 } while $section =~ /\d/;
854
855 return undef;
856}
857
858sub is_leaf_section {
859 my $self = shift (@_);
860 my ($section) = @_;
861
862 my $section_ptr = $self->_lookup_section($section);
863 return 1 unless defined $section_ptr;
864
865 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
866}
867
868# methods for dealing with sections
869
870# returns the name of the inserted section
871sub insert_section {
872 my $self = shift (@_);
873 my ($before_section) = @_;
874
875 # get the child to insert before and its parent section
876 my $parent_section = "";
877 my $before_child = "0";
878 my @before_section = split (/\./, $before_section);
879 if (scalar(@before_section) > 0) {
880 $before_child = pop (@before_section);
881 $parent_section = join (".", @before_section);
882 }
883
884 my $parent_section_ptr = $self->_lookup_section($parent_section);
885 if (!defined $parent_section_ptr) {
886 print STDERR "doc::insert_section couldn't find parent section " .
887 "$parent_section\n";
888 return;
889 }
890
891 # get the next section number
892 my $section_num = $parent_section_ptr->{'next_subsection'}++;
893
894 my $i = 0;
895 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
896 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
897 $i++;
898 }
899
900 # insert the section number into the order list
901 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
902
903 # add this section to the parent section
904 my $section_ptr = {'subsection_order'=>[],
905 'next_subsection'=>1,
906 'subsections'=>{},
907 'metadata'=>[],
908 'text'=>""};
909 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
910
911 # work out the full section number
912 my $section = $parent_section;
913 $section .= "." unless $section eq "";
914 $section .= $section_num;
915
916 return $section;
917}
918
919# creates a pre-named section
920sub create_named_section {
921 my $self = shift (@_);
922 my ($mastersection) = @_;
923
924 my ($num);
925 my $section = $mastersection;
926 my $sectionref = $self;
927
928 while ($section ne "") {
929 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
930 $num =~ s/^0+(\d)/$1/; # remove leading 0s
931 $section = "" unless defined $section;
932
933 if (defined $num) {
934 if (!defined $sectionref->{'subsections'}->{$num}) {
935 push (@{$sectionref->{'subsection_order'}}, $num);
936 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
937 'next_subsection'=>1,
938 'subsections'=>{},
939 'metadata'=>[],
940 'text'=>""};
941 if ($num >= $sectionref->{'next_subsection'}) {
942 $sectionref->{'next_subsection'} = $num + 1;
943 }
944 }
945 $sectionref = $sectionref->{'subsections'}->{$num};
946
947 } else {
948 print STDERR "doc::create_named_section couldn't create section ";
949 print STDERR "$mastersection\n";
950 last;
951 }
952 }
953}
954
955# returns a reference to a list of subsections
956sub list_subsections {
957 my $self = shift (@_);
958 my ($section) = @_;
959
960 my $section_ptr = $self->_lookup_section ($section);
961 if (!defined $section_ptr) {
962 print STDERR "doc::list_subsections couldn't find section $section\n";
963 return [];
964 }
965
966 return [@{$section_ptr->{'subsection_order'}}];
967}
968
969sub delete_section {
970 my $self = shift (@_);
971 my ($section) = @_;
972
973# my $section_ptr = {'subsection_order'=>[],
974# 'next_subsection'=>1,
975# 'subsections'=>{},
976# 'metadata'=>[],
977# 'text'=>""};
978
979 # if this is the top section reset everything
980 if ($section eq "") {
981 $self->{'subsection_order'} = [];
982 $self->{'subsections'} = {};
983 $self->{'metadata'} = [];
984 $self->{'text'} = "";
985 return;
986 }
987
988 # find the parent of the section to delete
989 my $parent_section = "";
990 my $child = "0";
991 my @section = split (/\./, $section);
992 if (scalar(@section) > 0) {
993 $child = pop (@section);
994 $parent_section = join (".", @section);
995 }
996
997 my $parent_section_ptr = $self->_lookup_section($parent_section);
998 if (!defined $parent_section_ptr) {
999 print STDERR "doc::delete_section couldn't find parent section " .
1000 "$parent_section\n";
1001 return;
1002 }
1003
1004 # remove this section from the subsection_order list
1005 my $i = 0;
1006 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
1007 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
1008 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
1009 last;
1010 }
1011 $i++;
1012 }
1013
1014 # remove this section from the subsection hash
1015 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
1016 undef $parent_section_ptr->{'subsections'}->{$child};
1017 }
1018}
1019
1020#--
1021# methods for dealing with metadata
1022
1023# set_metadata_element and get_metadata_element are for metadata
1024# which should only have one value. add_meta_data and get_metadata
1025# are for metadata which can have more than one value.
1026
1027# returns the first metadata value which matches field
1028
1029# This version of get metadata element works much like the one above,
1030# except it allows for the namespace portion of a metadata element to
1031# be ignored, thus if you are searching for dc.Title, the first piece
1032# of matching metadata ending with the name Title (once any namespace
1033# is removed) would be returned.
1034# 28-11-2003 John Thompson
1035sub get_metadata_element {
1036 my $self = shift (@_);
1037 my ($section, $field, $ignore_namespace) = @_;
1038 my ($data);
1039
1040 $ignore_namespace = 0 unless defined $ignore_namespace;
1041
1042 my $section_ptr = $self->_lookup_section($section);
1043 if (!defined $section_ptr) {
1044 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
1045 return;
1046 }
1047
1048 # Remove the any namespace if we are being told to ignore them
1049 if($ignore_namespace) {
1050 $field =~ s/^\w*\.//;
1051 }
1052
1053 foreach $data (@{$section_ptr->{'metadata'}}) {
1054
1055 my $data_name = $data->[0];
1056 # Remove the any namespace if we are being told to ignore them
1057 if($ignore_namespace) {
1058 $data_name =~ s/^\w*\.//;
1059 }
1060
1061 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
1062 }
1063
1064 return undef; # was not found
1065}
1066
1067# returns a list of the form [value1, value2, ...]
1068sub get_metadata {
1069 my $self = shift (@_);
1070 my ($section, $field, $ignore_namespace) = @_;
1071 my ($data);
1072
1073 $ignore_namespace = 0 unless defined $ignore_namespace;
1074
1075 my $section_ptr = $self->_lookup_section($section);
1076 if (!defined $section_ptr) {
1077 print STDERR "doc::get_metadata couldn't find section ",
1078 $section, "\n";
1079 return;
1080 }
1081
1082 # Remove the any namespace if we are being told to ignore them
1083 if($ignore_namespace) {
1084 $field =~ s/^\w*\.//;
1085 }
1086
1087 my @metadata = ();
1088 foreach $data (@{$section_ptr->{'metadata'}}) {
1089
1090 my $data_name = $data->[0];
1091 # Remove the any namespace if we are being told to ignore them
1092 if($ignore_namespace) {
1093 $data_name =~ s/^\w*\.//;
1094 }
1095
1096 push (@metadata, $data->[1]) if ($data_name eq $field);
1097 }
1098
1099 return \@metadata;
1100}
1101
1102# returns a list of the form [[field,value],[field,value],...]
1103sub get_all_metadata {
1104 my $self = shift (@_);
1105 my ($section) = @_;
1106
1107 my $section_ptr = $self->_lookup_section($section);
1108 if (!defined $section_ptr) {
1109 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
1110 return;
1111 }
1112
1113 return $section_ptr->{'metadata'};
1114}
1115
1116# $value is optional
1117sub delete_metadata {
1118 my $self = shift (@_);
1119 my ($section, $field, $value) = @_;
1120
1121 my $section_ptr = $self->_lookup_section($section);
1122 if (!defined $section_ptr) {
1123 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
1124 return;
1125 }
1126
1127 my $i = 0;
1128 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1129 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1130 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1131 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1132 } else {
1133 $i++;
1134 }
1135 }
1136}
1137
1138sub delete_all_metadata {
1139 my $self = shift (@_);
1140 my ($section) = @_;
1141
1142 my $section_ptr = $self->_lookup_section($section);
1143 if (!defined $section_ptr) {
1144 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1145 return;
1146 }
1147
1148 $section_ptr->{'metadata'} = [];
1149}
1150
1151sub set_metadata_element {
1152 my $self = shift (@_);
1153 my ($section, $field, $value) = @_;
1154
1155 $self->set_utf8_metadata_element ($section, $field,
1156 &unicode::ascii2utf8(\$value));
1157}
1158
1159# set_utf8_metadata_element assumes the text has already been
1160# converted to the UTF-8 encoding.
1161sub set_utf8_metadata_element {
1162 my $self = shift (@_);
1163 my ($section, $field, $value) = @_;
1164
1165 $self->delete_metadata ($section, $field);
1166 $self->add_utf8_metadata ($section, $field, $value);
1167}
1168
1169
1170# add_metadata assumes the text is in (extended) ascii form. For
1171# text which has already been converted to the UTF-8 format use
1172# add_utf8_metadata.
1173sub add_metadata {
1174 my $self = shift (@_);
1175 my ($section, $field, $value) = @_;
1176
1177 $self->add_utf8_metadata ($section, $field,
1178 &unicode::ascii2utf8(\$value));
1179}
1180
1181sub add_utf8_metadata {
1182 my $self = shift (@_);
1183 my ($section, $field, $value) = @_;
1184
1185 my $section_ptr = $self->_lookup_section($section);
1186 if (!defined $section_ptr) {
1187 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1188 return;
1189 }
1190 if (!defined $value) {
1191 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1192 return;
1193 }
1194 if (!defined $field) {
1195 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1196 return;
1197 }
1198
1199 #print STDERR "###$field=$value\n";
1200 # double check that the value is utf-8
1201 if (unicode::ensure_utf8(\$value)) {
1202 print STDERR "doc::add_utf8_metadata: warning: '$field' wasn't utf8\n";
1203 }
1204
1205 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1206}
1207
1208
1209# methods for dealing with text
1210
1211# returns the text for a section
1212sub get_text {
1213 my $self = shift (@_);
1214 my ($section) = @_;
1215
1216 my $section_ptr = $self->_lookup_section($section);
1217 if (!defined $section_ptr) {
1218 print STDERR "doc::get_text couldn't find section " .
1219 "$section\n";
1220 return "";
1221 }
1222
1223 return $section_ptr->{'text'};
1224}
1225
1226# returns the (utf-8 encoded) length of the text for a section
1227sub get_text_length {
1228 my $self = shift (@_);
1229 my ($section) = @_;
1230
1231 my $section_ptr = $self->_lookup_section($section);
1232 if (!defined $section_ptr) {
1233 print STDERR "doc::get_text_length couldn't find section " .
1234 "$section\n";
1235 return 0;
1236 }
1237
1238 return length ($section_ptr->{'text'});
1239}
1240
1241sub delete_text {
1242 my $self = shift (@_);
1243 my ($section) = @_;
1244
1245 my $section_ptr = $self->_lookup_section($section);
1246 if (!defined $section_ptr) {
1247 print STDERR "doc::delete_text couldn't find section " .
1248 "$section\n";
1249 return;
1250 }
1251
1252 $section_ptr->{'text'} = "";
1253}
1254
1255# add_text assumes the text is in (extended) ascii form. For
1256# text which has been already converted to the UTF-8 format
1257# use add_utf8_text.
1258sub add_text {
1259 my $self = shift (@_);
1260 my ($section, $text) = @_;
1261
1262 # convert the text to UTF-8 encoded unicode characters
1263 # and add the text
1264 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1265}
1266
1267
1268# add_utf8_text assumes the text to be added has already
1269# been converted to the UTF-8 encoding. For ascii text use
1270# add_text
1271sub add_utf8_text {
1272 my $self = shift (@_);
1273 my ($section, $text) = @_;
1274
1275 my $section_ptr = $self->_lookup_section($section);
1276 if (!defined $section_ptr) {
1277 print STDERR "doc::add_utf8_text couldn't find section " .
1278 "$section\n";
1279 return;
1280 }
1281
1282 $section_ptr->{'text'} .= $text;
1283}
1284
1285
1286# methods for dealing with associated files
1287
1288# a file is associated with a document, NOT a section.
1289# if section is defined it is noted in the data structure
1290# only so that files associated from a particular section
1291# may be removed later (using delete_section_assoc_files)
1292sub associate_file {
1293 my $self = shift (@_);
1294 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1295 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1296
1297 # remove all associated files with the same name
1298 $self->delete_assoc_file ($assoc_filename);
1299
1300 push (@{$self->{'associated_files'}},
1301 [$real_filename, $assoc_filename, $mime_type, $section]);
1302}
1303
1304# returns a list of associated files in the form
1305# [[real_filename, assoc_filename, mimetype], ...]
1306sub get_assoc_files {
1307 my $self = shift (@_);
1308
1309 return $self->{'associated_files'};
1310}
1311
1312sub delete_section_assoc_files {
1313 my $self = shift (@_);
1314 my ($section) = @_;
1315
1316 my $i=0;
1317 while ($i < scalar (@{$self->{'associated_files'}})) {
1318 if (defined $self->{'associated_files'}->[$i]->[3] &&
1319 $self->{'associated_files'}->[$i]->[3] eq $section) {
1320 splice (@{$self->{'associated_files'}}, $i, 1);
1321 } else {
1322 $i++;
1323 }
1324 }
1325}
1326
1327sub delete_assoc_file {
1328 my $self = shift (@_);
1329 my ($assoc_filename) = @_;
1330
1331 my $i=0;
1332 while ($i < scalar (@{$self->{'associated_files'}})) {
1333 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1334 splice (@{$self->{'associated_files'}}, $i, 1);
1335 } else {
1336 $i++;
1337 }
1338 }
1339}
1340
1341sub reset_nextsection_ptr {
1342 my $self = shift (@_);
1343 my ($section) = @_;
1344
1345 my $section_ptr = $self->_lookup_section($section);
1346 $section_ptr->{'next_subsection'} = 1;
1347}
1348
13491;
Note: See TracBrowser for help on using the repository browser.