source: trunk/gsdl/perllib/doc.pm@ 9838

Last change on this file since 9838 was 9838, checked in by davidb, 19 years ago

General improvements to saving files in DSpace format. Main additino is to
use extracted metadata values for dc.* values that are not present. For example
if there is no dc.Title then ex.Title is used instead.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 38.3 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use unicode;
37use util;
38use ghtml;
39use File::stat;
40##use hashdoc;
41
42# the document type may be indexed_doc, nonindexed_doc, or
43# classification
44
45my $OIDcount = 0;
46
47sub new {
48 my $class = shift (@_);
49 my ($source_filename, $doc_type) = @_;
50
51 my $self = bless {'associated_files'=>[],
52 'subsection_order'=>[],
53 'next_subsection'=>1,
54 'subsections'=>{},
55 'metadata'=>[],
56 'text'=>"",
57 'OIDtype'=>"hash"}, $class;
58
59 #GRB: Added code to set lastmodified for OAI purposes
60 if ((defined $doc_type) && (defined $source_filename) && (-e $source_filename)) {
61 my $file_stat = stat($source_filename);
62 my $mtime = $file_stat->mtime;
63 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
64 }
65 #GRB: end inserted code
66
67 if (defined $source_filename) {
68 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
69
70 if (defined $collect_dir) {
71 my $dirsep = &util::get_dirsep();
72 if ($collect_dir !~ m/$dirsep$/) {
73 $collect_dir .= $dirsep;
74 }
75
76 $collect_dir =~ s/\\/\\\\/g; # escape DOS style file separator
77
78 # if from within GSDLCOLLECTDIR, then remove directory prefix
79 # so source_filename is realative to it. This is done to aid
80 # portability, i.e. the collection can be moved to somewhere
81 # else on the file system and the archives directory will still
82 # work. This is needed, for example in the applet version of
83 # GLI where GSDLHOME/collect on the server will be different to
84 # the collect directory of the remove user. Of course,
85 # GSDLCOLLECTDIR subsequently needs to be put back on to turn
86 # it back into a full pathname.
87
88 if ($source_filename =~ /^$collect_dir(.*)$/) {
89 $source_filename = $1;
90 }
91 }
92
93 $self->set_source_filename ($source_filename);
94 }
95
96 $self->set_doc_type ($doc_type) if defined $doc_type;
97
98 return $self;
99}
100
101# clone the $self object
102sub duplicate {
103 my $self = shift (@_);
104
105 my $newobj = {};
106
107 foreach my $k (keys %$self) {
108 $newobj->{$k} = &clone ($self->{$k});
109 }
110
111 bless $newobj, ref($self);
112 return $newobj;
113}
114
115sub clone {
116 my ($from) = @_;
117 my $type = ref ($from);
118
119 if ($type eq "HASH") {
120 my $to = {};
121 foreach my $key (keys %$from) {
122 $to->{$key} = &clone ($from->{$key});
123 }
124 return $to;
125 } elsif ($type eq "ARRAY") {
126 my $to = [];
127 foreach my $v (@$from) {
128 push (@$to, &clone ($v));
129 }
130 return $to;
131 } else {
132 return $from;
133 }
134}
135
136sub set_OIDtype {
137 my $self = shift (@_);
138 my ($type) = @_;
139
140 if ($type =~ /^(hash|incremental|dirname|assigned)$/) {
141 $self->{'OIDtype'} = $type;
142 } else {
143 $self->{'OIDtype'} = "hash";
144 }
145}
146
147sub set_source_filename {
148 my $self = shift (@_);
149 my ($source_filename) = @_;
150
151 $self->set_metadata_element ($self->get_top_section(),
152 "gsdlsourcefilename",
153 $source_filename);
154}
155
156sub set_converted_filename {
157 my $self = shift (@_);
158 my ($converted_filename) = @_;
159
160 $self->set_metadata_element ($self->get_top_section(),
161 "gsdlconvertedfilename",
162 $converted_filename);
163}
164
165
166# returns the source_filename as it was provided
167sub get_source_filename {
168 my $self = shift (@_);
169
170 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
171}
172
173# returns converted filename if available else returns source filename
174sub get_filename_for_hashing {
175 my $self = shift (@_);
176
177 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
178
179 if (!defined $filename) {
180 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
181 }
182 return $filename;
183}
184
185sub set_doc_type {
186 my $self = shift (@_);
187 my ($doc_type) = @_;
188
189 $self->set_metadata_element ($self->get_top_section(),
190 "gsdldoctype",
191 $doc_type);
192}
193
194# returns the source_filename as it was provided
195# the default of "indexed_doc" is used if no document
196# type was provided
197sub get_doc_type {
198 my $self = shift (@_);
199
200 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
201 return $doc_type if (defined $doc_type);
202 return "indexed_doc";
203}
204
205sub _escape_text {
206 my ($text) = @_;
207
208 # special characters in the gml encoding
209 $text =~ s/&/&/g; # this has to be first...
210 $text =~ s/</&lt;/g;
211 $text =~ s/>/&gt;/g;
212 $text =~ s/\"/&quot;/g;
213
214 return $text;
215}
216
217sub buffer_section_xml {
218 my $self = shift (@_);
219 my ($section) = @_;
220
221 my $section_ptr = $self->_lookup_section ($section);
222 return "" unless defined $section_ptr;
223
224 my $all_text = "<Section>\n";
225 $all_text .= " <Description>\n";
226
227 # output metadata
228 foreach my $data (@{$section_ptr->{'metadata'}}) {
229 my $escaped_value = &_escape_text($data->[1]);
230 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
231 }
232
233 $all_text .= " </Description>\n";
234
235 # output the text
236 $all_text .= " <Content>";
237 $all_text .= &_escape_text($section_ptr->{'text'});
238 $all_text .= "</Content>\n";
239
240 # output all the subsections
241 foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
242 $all_text .= $self->buffer_section_xml("$section.$subsection");
243 }
244
245 $all_text .= "</Section>\n";
246
247 # make sure no nasty control characters have snuck through
248 # (XML::Parser will barf on anything it doesn't consider to be
249 # valid UTF-8 text, including things like \c@, \cC etc.)
250 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
251
252 return $all_text;
253}
254
255sub buffer_txt_section_xml {
256 my $self = shift(@_);
257 my ($section) = @_;
258
259 my $section_ptr = $self->_lookup_section ($section);
260
261 return "" unless defined $section_ptr;
262
263 my $all_text = "<Section>\n";
264
265 ##output the text
266 #$all_text .= " <Content>";
267 $all_text .= &_escape_text($section_ptr->{'text'});
268 #$all_text .= " </Content>\n";
269
270
271 #output all the subsections
272 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
273 $all_text .= $self->buffer_txt_section_xml("$section.$subsection");
274 }
275
276 $all_text .= "</Section>\n";
277
278
279 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
280 return $all_text;
281}
282
283sub buffer_mets_fileSection_section_xml() {
284 my $self = shift(@_);
285 my ($section) = @_;
286
287 #$section="" unless defined $section;
288
289
290 my $section_ptr=$self->_lookup_section($section);
291 return "" unless defined $section_ptr;
292
293
294 #**output fileSection by sections
295 my $section_num ="1". $section;
296
297
298 #my $filePath = $doc_Dir . '/doctxt.xml';
299
300 my $filePath = 'doctxt.xml';
301
302 #**output the fileSection details
303 my $all_text = ' <mets:fileGrp ID="FILEGROUP_PRELUDE' . $section_num . '">'. "\n";
304 $all_text .= ' <mets:file MIMETYPE="text/xml" ID="FILE'.$section_num. '">'. "\n";
305 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$filePath.'#xpointer(/Section[';
306
307 my $xpath = "1".$section;
308
309 $xpath =~ s/\./]\/Section[/g;
310
311 $all_text .= $xpath;
312
313 $all_text .= ']/text())" />' . "\n";
314 $all_text .= " </mets:file>\n";
315 $all_text .= " </mets:fileGrp>\n";
316
317
318 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
319 $all_text .= $self->buffer_mets_fileSection_section_xml("$section.$subsection");
320 }
321
322 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
323
324 return $all_text;
325}
326
327sub buffer_mets_fileWhole_section_xml(){
328 my $self = shift(@_);
329 my ($section) = @_;
330
331 my $section_ptr = $self-> _lookup_section($section);
332 return "" unless defined $section_ptr;
333
334 my $all_text="" unless defined $all_txt;
335
336 my ($dirPath)="" unless defined $dirPath;
337 my $fileID=0;
338
339 #** output the fileSection for the whole section
340 #*** get the sourcefile and associative file
341
342 foreach my $data (@{$section_ptr->{'metadata'}}){
343 my $escaped_value = &_escape_text($data->[1]);
344 if ($data->[0] eq "gsdlsourcefilename") {
345 ($dirPath) = $escaped_value =~ m/^(.*)[\/\\][^\/\\]*$/;
346
347 $all_text .= ' <mets:fileGrp ID="default">'."\n";
348 ++$fileID;
349 $all_text .= ' <mets:file MIMETYPE="text/xml" ID="default.'.$fileID.'">'. "\n";
350
351 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$data->[1].'" />'."\n";
352
353 $all_text .= " </mets:file>\n";
354 }
355
356 if ($data->[0] eq "gsdlassocfile"){
357
358 $escaped_value =~ m/^(.*?):(.*):(.*)$/;
359 my $assoc_file = $1;
360 my $mime_type = $2;
361 my $assoc_dir = $3;
362
363 my $assfilePath = $dirPath . '/'. $assoc_file;
364 ++$fileID;
365
366 $all_text .= ' <mets:file MIMETYPE="'.$mime_type.'" ID="default.'.$fileID. '">'. "\n";
367 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$assfilePath.'" />'."\n";
368
369 $all_text .= " </mets:file>\n";
370 }
371 }
372 $all_text .= " </mets:fileGrp>\n";
373
374
375 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
376
377 return $all_text;
378}
379
380sub buffer_mets_StruMapSection_section_xml(){
381 my $self = shift(@_);
382 my ($section, $order_numref) = @_;
383
384 $section="" unless defined $section;
385
386
387 my $section_ptr=$self->_lookup_section($section);
388 return "" unless defined $section_ptr;
389
390
391 #***output fileSection by Sections
392 my $section_num ="1". $section;
393 my $dmd_num = $section_num;
394
395 ##**output the dmdSection details
396 #if ($section_num eq "1") {
397 # $dmd_num = "0";
398 #}
399
400 #**output the StruMap details
401
402 my $all_text = ' <mets:div ID="DS'. $section_num .'" TYPE="Section" ORDER="'.$$order_numref++.'" ORDERLABEL="'. $section_num .'" LABEL="';
403
404 $all_text .= $section_num . '" DMDID="DM'.$dmd_num.'">'. "\n";
405
406 $all_text .= ' <mets:fptr FILEID="FILEGROUP_PRELUDE'.$section_num.'" />'. "\n";
407
408
409 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
410 $all_text .= $self->buffer_mets_StruMapSection_section_xml("$section.$subsection", $order_numref);
411 }
412
413 $all_text .= " </mets:div>\n";
414
415 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
416
417 return $all_text;
418}
419
420
421sub buffer_mets_StruMapWhole_section_xml(){
422 my $self = shift(@_);
423 my ($section) = @_;
424
425 my $section_ptr = $self-> _lookup_section($section);
426 return "" unless defined $section_ptr;
427
428 my $all_text="" unless defined $all_txt;
429 my $fileID=0;
430 my $order_num = 0;
431
432 $all_text .= '<mets:structMap ID="All" TYPE="Whole Document" LABEL="All">'."\n";
433 $all_text .= ' <mets:div ID="DSAll" TYPE="Document" ORDER="'.$order_num.'" ORDERLABEL="All" LABEL="Whole Documemt" DMDID="DM1">' . "\n";
434
435
436 #** output the StruMapSection for the whole section
437 #*** get the sourcefile and associative file
438
439 foreach my $data (@{$section_ptr->{'metadata'}}){
440 my $escaped_value = &_escape_text($data->[1]);
441
442 if ($data->[0] eq "gsdlsourcefilename") {
443 ++$fileID;
444 $all_text .= ' <mets:fptr FILEID="default.'.$fileID.'" />'."\n";
445 }
446
447 if ($data->[0] eq "gsdlassocfile"){
448 ++$fileID;
449 $all_text .= ' <mets:fptr FILEID="default.'.$fileID. '" />'. "\n";
450 }
451 }
452 $all_text .= " </mets:div>\n";
453
454 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
455
456 return $all_text;
457}
458
459
460sub buffer_mets_dmdSection_section_xml(){
461 my $self = shift(@_);
462 my ($section) = @_;
463
464 $section="" unless defined $section;
465
466 my $section_ptr=$self->_lookup_section($section);
467 return "" unless defined $section_ptr;
468
469 #***convert section number
470 my $section_num ="1". $section;
471 my $dmd_num = $section_num;
472
473 # #**output the dmdSection details
474 # if ($section_num eq "1") {
475 # $dmd_num = "0";
476 # }
477 my $all_text = '<mets:dmdSec ID="DM'.$dmd_num.'" GROUPID="'.$section_num.'">'. "\n";
478 $all_text .= ' <mets:mdWrap MDTYPE="OTHER" OTHERMDTYPE="gsdl3" ID="gsdl'.$section_num.'">'."\n";
479 $all_text .= " <mets:xmlData>\n";
480 foreach my $data (@{$section_ptr->{'metadata'}}){
481 my $escaped_value = &_escape_text($data->[1]);
482 $all_text .= ' <gsdl3:Metadata name="'. $data->[0].'">'. $escaped_value. "</gsdl3:Metadata>\n";
483 if ($data->[0] eq "dc.Title") {
484 $all_text .= ' <gsdl3:Metadata name="Title">'. $escaped_value."</gsdl3:Metadata>\n";
485 }
486 }
487
488 $all_text .= " </mets:xmlData>\n";
489 $all_text .= " </mets:mdWrap>\n";
490 $all_text .= "</mets:dmdSec>\n";
491
492 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
493 $all_text .= $self->buffer_mets_dmdSection_section_xml("$section.$subsection");
494 }
495
496 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
497
498 return $all_text;
499}
500
501sub output_section {
502 my $self = shift (@_);
503 my ($handle, $section) = @_;
504
505 print $handle $self->buffer_section_xml($section);
506}
507
508#*** print out DSpace dublin_core metadata section
509sub output_dspace_section {
510 my $self = shift (@_);
511 my ($handle, $section) = @_;
512
513 my $section_ptr = $self->_lookup_section ($section);
514 return "" unless defined $section_ptr;
515
516 my $all_text = "<Section>\n";
517 $all_text .= " <Description>\n";
518
519 # output metadata
520 foreach my $data (@{$section_ptr->{'metadata'}}) {
521 my $escaped_value = &_escape_text($data->[1]);
522 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
523 }
524
525 $all_text .= " </Description>\n";
526 $all_text .= "</Section>\n";
527
528 # make sure no nasty control characters have snuck through
529 # (XML::Parser will barf on anything it doesn't consider to be
530 # valid UTF-8 text, including things like \c@, \cC etc.)
531 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
532
533 return $all_text;
534}
535
536#*** print out doctxt.xml file
537sub output_txt_section {
538 my $self = shift (@_);
539 my ($handle, $section) = @_;
540
541 print $handle $self->buffer_txt_section_xml($section);
542}
543
544#*** print out docmets.xml file
545sub output_mets_section {
546 my $self = shift(@_);
547 my ($handle, $section) = @_;
548
549 #***print out the dmdSection
550 print $handle $self->buffer_mets_dmdSection_section_xml($section);
551
552 #***print out the fileSection by sections
553 print $handle "<mets:fileSec>\n";
554 print $handle $self->buffer_mets_fileSection_section_xml($section);
555
556 #***print out the whole fileSection
557 print $handle $self->buffer_mets_fileWhole_section_xml($section);
558 print $handle "</mets:fileSec>\n";
559
560 #***print out the StruMapSection by sections
561 print $handle '<mets:structMap ID="Section" TYPE="Section" LABEL="Section">' . "\n";
562 my $order_num=0;
563 print $handle $self->buffer_mets_StruMapSection_section_xml($section, \$order_num);
564 print $handle "</mets:structMap>\n";
565 print $handle $self->buffer_mets_StruMapWhole_section_xml($section);
566 print $handle "</mets:structMap>\n";
567}
568
569my $dc_set = { Title => 1,
570 Creator => 1,
571 Subject => 1,
572 Description => 1,
573 Publisher => 1,
574 Contributors => 1,
575 Date => 1,
576 Type => 1,
577 Format => 1,
578 Identifier => 1,
579 Source => 1,
580 Language => 1,
581 Relation => 1,
582 Coverage => 1,
583 Rights => 1};
584
585
586
587#*** print out dublin_core.xml file
588sub output_dc_section {
589 my $self = shift(@_);
590 my ($handle, $section, $doc_Dir) = @_;
591
592 #***print out the dublin_core
593 $section="" unless defined $section;
594
595 my $section_ptr=$self->_lookup_section($section);
596 return "" unless defined $section_ptr;
597
598 my $explicit_dc = {};
599 my $explicit_ex = {};
600
601 my $all_text="";
602 foreach my $data (@{$section_ptr->{'metadata'}}){
603 my $escaped_value = &_escape_text($data->[1]);
604 if ($data->[0]=~ m/^dc\./) {
605 $data->[0] =~ tr/[A-Z]/[a-z]/;
606
607 $data->[0] =~ m/^dc\.(.*)/;
608 my $dc_element = $1;
609
610 if (!defined $explicit_dc->{$dc_element}) {
611 $explicit_dc->{$dc_element} = [];
612 }
613 push(@{$explicit_dc->{$dc_element}},$escaped_value);
614
615 #$all_text .= ' <dcvalue element="'. $data->[0].'" qualifier="#####">'. $escaped_value. "</dcvalue>\n";
616 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
617 }
618 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) {
619 $data->[0] =~ m/^(ex\.)?(.*)/;
620 my $ex_element = $2;
621 my $lc_ex_element = lc($ex_element);
622
623 if (defined $dc_set->{$ex_element}) {
624 if (!defined $explicit_ex->{$lc_ex_element}) {
625 $explicit_ex->{$lc_ex_element} = [];
626 }
627 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
628 }
629 }
630 }
631
632 # go through dc_set and for any element *not* defined in explicit_dc
633 # that do exist in explicit_ex, add it in as metadata
634 foreach my $k ( keys %$dc_set ) {
635 my $lc_k = lc($k);
636
637 if (!defined $explicit_dc->{$lc_k}) {
638 if (defined $explicit_ex->{$lc_k}) {
639
640 foreach my $v (@{$explicit_ex->{$lc_k}}) {
641 my $dc_element = $lc_k;
642 my $escaped_value = $v;
643
644 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
645
646 }
647 }
648 }
649 }
650
651 if ($all_text eq "") {
652 $all_text .= " There is no Dublin Core metatdata in this document\n";
653 }
654 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
655
656 print $handle $all_text;
657}
658
659# look up the reference to the a particular section
660sub _lookup_section {
661 my $self = shift (@_);
662 my ($section) = @_;
663
664 my ($num);
665 my $sectionref = $self;
666
667 while (defined $section && $section ne "") {
668 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
669 $num =~ s/^0+(\d)/$1/; # remove leading 0s
670 $section = "" unless defined $section;
671
672 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
673 $sectionref = $sectionref->{'subsections'}->{$num};
674 } else {
675 return undef;
676 }
677 }
678
679 return $sectionref;
680}
681
682# calculate OID by hashing the contents of the document
683sub _calc_OID {
684 my $self = shift (@_);
685 my ($filename) = @_;
686
687 my $osexe = &util::get_os_exe();
688
689 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
690 $ENV{'GSDLOS'},"hashfile$osexe");
691
692 my $result = "NULL";
693
694 if (-e "$hashfile_exe") {
695# $result = `\"$hashfile_exe\" \"$filename\"`;
696 $result = `hashfile$osexe \"$filename\"`;
697 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
698
699 } else {
700 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
701 }
702 return "HASH$result";
703}
704
705# methods dealing with OID, not groups of them.
706
707# if $OID is not provided one is calculated
708sub set_OID {
709 my $self = shift (@_);
710 my ($OID) = @_;
711
712 my $use_hash_oid = 0;
713 # if an OID wasn't provided claculate one
714 if (!defined $OID) {
715 $OID = "NULL";
716 if ($self->{'OIDtype'} eq "hash") {
717 $use_hash_oid = 1;
718 } elsif ($self->{'OIDtype'} eq "incremental") {
719 $OID = "D" . $OIDcount;
720 $OIDcount ++;
721
722 } elsif ($self->{'OIDtype'} eq "dirname") {
723 $OID = 'J';
724 my $filename = $self->get_source_filename();
725 if (defined($filename)) { # && -e $filename) {
726 $OID = &File::Basename::dirname($filename);
727 if (defined $OID) {
728 $OID = 'J'.&File::Basename::basename($OID);
729 $OID =~ s/\.//; #remove any periods
730 } else {
731 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
732 $use_hash_oid = 1;
733 }
734 } else {
735 print STDERR "Failed to find filename, generating hash id\n";
736 $use_hash_oid = 1;
737 }
738
739 } elsif ($self->{'OIDtype'} eq "assigned") {
740 my $identifier = $self->get_metadata_element ($self->get_top_section(), "dc.Identifier");
741 if (defined $identifier && $identifier ne "") {
742 $OID = "D" . $self->get_metadata_element ($self->get_top_section(), "dc.Identifier");
743 $OID =~ s/\.//; #remove any periods
744 } else {
745 # need a hash id
746 print STDERR "no dc.Identifier found, generating hash id\n";
747 $use_hash_oid = 1;
748 }
749
750 } else {
751 $use_hash_oid = 1;
752 }
753
754 if ($use_hash_oid) {
755
756 # "hash" OID - feed file to hashfile.exe
757 #my $filename = $self->get_source_filename();
758 # we want to use the converted file for hashing if available
759 # cos its quicker
760 my $filename = $self->get_filename_for_hashing();
761
762 # -z: don't want to hash on the file if it is zero size
763 if (defined($filename) && -e $filename && !-z $filename) {
764 $OID = $self->_calc_OID ($filename);
765 } else {
766 $filename = &util::get_tmp_filename();
767 if (!open (OUTFILE, ">$filename")) {
768 print STDERR "doc::set_OID could not write to $filename\n";
769 } else {
770 $self->output_section('OUTFILE', $self->get_top_section(),
771 undef, 1);
772 close (OUTFILE);
773 }
774 $OID = $self->_calc_OID ($filename);
775 &util::rm ($filename);
776 }
777 }
778 }
779 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
780}
781
782# this uses hashdoc (embedded c thingy) which is faster but still
783# needs a little work to be suffiently stable
784sub ___set_OID {
785 my $self = shift (@_);
786 my ($OID) = @_;
787
788 # if an OID wasn't provided then calculate hash value based on document
789 if (!defined $OID)
790 {
791 my $hash_text = $self->buffer_section_gml($self->get_top_section(),
792 undef, 1);
793 my $hash_len = length($hash_text);
794
795 $OID = &hashdoc::buffer($hash_text,$hash_len);
796 }
797
798 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
799}
800
801# returns the OID for this document
802sub get_OID {
803 my $self = shift (@_);
804 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
805 return $OID if (defined $OID);
806 return "NULL";
807}
808
809sub delete_OID {
810 my $self = shift (@_);
811
812 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
813}
814
815
816# methods for manipulating section names
817
818# returns the name of the top-most section (the top
819# level of the document
820sub get_top_section {
821 my $self = shift (@_);
822
823 return "";
824}
825
826# returns a section
827sub get_parent_section {
828 my $self = shift (@_);
829 my ($section) = @_;
830
831 $section =~ s/(^|\.)\d+$//;
832
833 return $section;
834}
835
836# returns the first child section (or the end child
837# if there isn't any)
838sub get_begin_child {
839 my $self = shift (@_);
840 my ($section) = @_;
841
842 my $section_ptr = $self->_lookup_section($section);
843 return "" unless defined $section_ptr;
844
845 if (defined $section_ptr->{'subsection_order'}->[0]) {
846 return "$section.$section_ptr->{'subsection_order'}->[0]";
847 }
848
849 return $self->get_end_child ($section);
850}
851
852# returns the next child of a parent section
853sub get_next_child {
854 my $self = shift (@_);
855 my ($section) = @_;
856
857 my $parent_section = $self->get_parent_section($section);
858 my $parent_section_ptr = $self->_lookup_section($parent_section);
859 return undef unless defined $parent_section_ptr;
860
861 my ($section_num) = $section =~ /(\d+)$/;
862 return undef unless defined $section_num;
863
864 my $i = 0;
865 my $section_order = $parent_section_ptr->{'subsection_order'};
866 while ($i < scalar(@$section_order)) {
867 last if $section_order->[$i] eq $section_num;
868 $i++;
869 }
870
871 $i++; # the next child
872 if ($i < scalar(@$section_order)) {
873 return $section_order->[$i] if $parent_section eq "";
874 return "$parent_section.$section_order->[$i]";
875 }
876
877 # no more sections in this level
878 return undef;
879}
880
881# returns a reference to a list of children
882sub get_children {
883 my $self = shift (@_);
884 my ($section) = @_;
885
886 my $section_ptr = $self->_lookup_section($section);
887 return [] unless defined $section_ptr;
888
889 my @children = @{$section_ptr->{'subsection_order'}};
890
891 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
892 return \@children;
893}
894
895# returns the child section one past the last one (which
896# is coded as "0")
897sub get_end_child {
898 my $self = shift (@_);
899 my ($section) = @_;
900
901 return $section . ".0" unless $section eq "";
902 return "0";
903}
904
905# returns the next section in book order
906sub get_next_section {
907 my $self = shift (@_);
908 my ($section) = @_;
909
910 return undef unless defined $section;
911
912 my $section_ptr = $self->_lookup_section($section);
913 return undef unless defined $section_ptr;
914
915 # first try to find first child
916 if (defined $section_ptr->{'subsection_order'}->[0]) {
917 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
918 return "$section.$section_ptr->{'subsection_order'}->[0]";
919 }
920
921 do {
922 # try to find sibling
923 my $next_child = $self->get_next_child ($section);
924 return $next_child if (defined $next_child);
925
926 # move up one level
927 $section = $self->get_parent_section ($section);
928 } while $section =~ /\d/;
929
930 return undef;
931}
932
933sub is_leaf_section {
934 my $self = shift (@_);
935 my ($section) = @_;
936
937 my $section_ptr = $self->_lookup_section($section);
938 return 1 unless defined $section_ptr;
939
940 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
941}
942
943# methods for dealing with sections
944
945# returns the name of the inserted section
946sub insert_section {
947 my $self = shift (@_);
948 my ($before_section) = @_;
949
950 # get the child to insert before and its parent section
951 my $parent_section = "";
952 my $before_child = "0";
953 my @before_section = split (/\./, $before_section);
954 if (scalar(@before_section) > 0) {
955 $before_child = pop (@before_section);
956 $parent_section = join (".", @before_section);
957 }
958
959 my $parent_section_ptr = $self->_lookup_section($parent_section);
960 if (!defined $parent_section_ptr) {
961 print STDERR "doc::insert_section couldn't find parent section " .
962 "$parent_section\n";
963 return;
964 }
965
966 # get the next section number
967 my $section_num = $parent_section_ptr->{'next_subsection'}++;
968
969 my $i = 0;
970 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
971 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
972 $i++;
973 }
974
975 # insert the section number into the order list
976 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
977
978 # add this section to the parent section
979 my $section_ptr = {'subsection_order'=>[],
980 'next_subsection'=>1,
981 'subsections'=>{},
982 'metadata'=>[],
983 'text'=>""};
984 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
985
986 # work out the full section number
987 my $section = $parent_section;
988 $section .= "." unless $section eq "";
989 $section .= $section_num;
990
991 return $section;
992}
993
994# creates a pre-named section
995sub create_named_section {
996 my $self = shift (@_);
997 my ($mastersection) = @_;
998
999 my ($num);
1000 my $section = $mastersection;
1001 my $sectionref = $self;
1002
1003 while ($section ne "") {
1004 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
1005 $num =~ s/^0+(\d)/$1/; # remove leading 0s
1006 $section = "" unless defined $section;
1007
1008 if (defined $num) {
1009 if (!defined $sectionref->{'subsections'}->{$num}) {
1010 push (@{$sectionref->{'subsection_order'}}, $num);
1011 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
1012 'next_subsection'=>1,
1013 'subsections'=>{},
1014 'metadata'=>[],
1015 'text'=>""};
1016 if ($num >= $sectionref->{'next_subsection'}) {
1017 $sectionref->{'next_subsection'} = $num + 1;
1018 }
1019 }
1020 $sectionref = $sectionref->{'subsections'}->{$num};
1021
1022 } else {
1023 print STDERR "doc::create_named_section couldn't create section ";
1024 print STDERR "$mastersection\n";
1025 last;
1026 }
1027 }
1028}
1029
1030# returns a reference to a list of subsections
1031sub list_subsections {
1032 my $self = shift (@_);
1033 my ($section) = @_;
1034
1035 my $section_ptr = $self->_lookup_section ($section);
1036 if (!defined $section_ptr) {
1037 print STDERR "doc::list_subsections couldn't find section $section\n";
1038 return [];
1039 }
1040
1041 return [@{$section_ptr->{'subsection_order'}}];
1042}
1043
1044sub delete_section {
1045 my $self = shift (@_);
1046 my ($section) = @_;
1047
1048# my $section_ptr = {'subsection_order'=>[],
1049# 'next_subsection'=>1,
1050# 'subsections'=>{},
1051# 'metadata'=>[],
1052# 'text'=>""};
1053
1054 # if this is the top section reset everything
1055 if ($section eq "") {
1056 $self->{'subsection_order'} = [];
1057 $self->{'subsections'} = {};
1058 $self->{'metadata'} = [];
1059 $self->{'text'} = "";
1060 return;
1061 }
1062
1063 # find the parent of the section to delete
1064 my $parent_section = "";
1065 my $child = "0";
1066 my @section = split (/\./, $section);
1067 if (scalar(@section) > 0) {
1068 $child = pop (@section);
1069 $parent_section = join (".", @section);
1070 }
1071
1072 my $parent_section_ptr = $self->_lookup_section($parent_section);
1073 if (!defined $parent_section_ptr) {
1074 print STDERR "doc::delete_section couldn't find parent section " .
1075 "$parent_section\n";
1076 return;
1077 }
1078
1079 # remove this section from the subsection_order list
1080 my $i = 0;
1081 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
1082 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
1083 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
1084 last;
1085 }
1086 $i++;
1087 }
1088
1089 # remove this section from the subsection hash
1090 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
1091 undef $parent_section_ptr->{'subsections'}->{$child};
1092 }
1093}
1094
1095#--
1096# methods for dealing with metadata
1097
1098# set_metadata_element and get_metadata_element are for metadata
1099# which should only have one value. add_meta_data and get_metadata
1100# are for metadata which can have more than one value.
1101
1102# returns the first metadata value which matches field
1103
1104# This version of get metadata element works much like the one above,
1105# except it allows for the namespace portion of a metadata element to
1106# be ignored, thus if you are searching for dc.Title, the first piece
1107# of matching metadata ending with the name Title (once any namespace
1108# is removed) would be returned.
1109# 28-11-2003 John Thompson
1110sub get_metadata_element {
1111 my $self = shift (@_);
1112 my ($section, $field, $ignore_namespace) = @_;
1113 my ($data);
1114
1115 $ignore_namespace = 0 unless defined $ignore_namespace;
1116
1117 my $section_ptr = $self->_lookup_section($section);
1118 if (!defined $section_ptr) {
1119 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
1120 return;
1121 }
1122
1123 # Remove the any namespace if we are being told to ignore them
1124 if($ignore_namespace) {
1125 $field =~ s/^\w*\.//;
1126 }
1127
1128 foreach $data (@{$section_ptr->{'metadata'}}) {
1129
1130 my $data_name = $data->[0];
1131 # Remove the any namespace if we are being told to ignore them
1132 if($ignore_namespace) {
1133 $data_name =~ s/^\w*\.//;
1134 }
1135
1136 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
1137 }
1138
1139 return undef; # was not found
1140}
1141
1142# returns a list of the form [value1, value2, ...]
1143sub get_metadata {
1144 my $self = shift (@_);
1145 my ($section, $field, $ignore_namespace) = @_;
1146 my ($data);
1147
1148 $ignore_namespace = 0 unless defined $ignore_namespace;
1149
1150 my $section_ptr = $self->_lookup_section($section);
1151 if (!defined $section_ptr) {
1152 print STDERR "doc::get_metadata couldn't find section ",
1153 $section, "\n";
1154 return;
1155 }
1156
1157 # Remove the any namespace if we are being told to ignore them
1158 if($ignore_namespace) {
1159 $field =~ s/^\w*\.//;
1160 }
1161
1162 my @metadata = ();
1163 foreach $data (@{$section_ptr->{'metadata'}}) {
1164
1165 my $data_name = $data->[0];
1166 # Remove the any namespace if we are being told to ignore them
1167 if($ignore_namespace) {
1168 $data_name =~ s/^\w*\.//;
1169 }
1170
1171 push (@metadata, $data->[1]) if ($data_name eq $field);
1172 }
1173
1174 return \@metadata;
1175}
1176
1177# returns a list of the form [[field,value],[field,value],...]
1178sub get_all_metadata {
1179 my $self = shift (@_);
1180 my ($section) = @_;
1181
1182 my $section_ptr = $self->_lookup_section($section);
1183 if (!defined $section_ptr) {
1184 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
1185 return;
1186 }
1187
1188 return $section_ptr->{'metadata'};
1189}
1190
1191# $value is optional
1192sub delete_metadata {
1193 my $self = shift (@_);
1194 my ($section, $field, $value) = @_;
1195
1196 my $section_ptr = $self->_lookup_section($section);
1197 if (!defined $section_ptr) {
1198 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
1199 return;
1200 }
1201
1202 my $i = 0;
1203 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1204 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1205 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1206 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1207 } else {
1208 $i++;
1209 }
1210 }
1211}
1212
1213sub delete_all_metadata {
1214 my $self = shift (@_);
1215 my ($section) = @_;
1216
1217 my $section_ptr = $self->_lookup_section($section);
1218 if (!defined $section_ptr) {
1219 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1220 return;
1221 }
1222
1223 $section_ptr->{'metadata'} = [];
1224}
1225
1226sub set_metadata_element {
1227 my $self = shift (@_);
1228 my ($section, $field, $value) = @_;
1229
1230 $self->set_utf8_metadata_element ($section, $field,
1231 &unicode::ascii2utf8(\$value));
1232}
1233
1234# set_utf8_metadata_element assumes the text has already been
1235# converted to the UTF-8 encoding.
1236sub set_utf8_metadata_element {
1237 my $self = shift (@_);
1238 my ($section, $field, $value) = @_;
1239
1240 $self->delete_metadata ($section, $field);
1241 $self->add_utf8_metadata ($section, $field, $value);
1242}
1243
1244
1245# add_metadata assumes the text is in (extended) ascii form. For
1246# text which has already been converted to the UTF-8 format use
1247# add_utf8_metadata.
1248sub add_metadata {
1249 my $self = shift (@_);
1250 my ($section, $field, $value) = @_;
1251
1252 $self->add_utf8_metadata ($section, $field,
1253 &unicode::ascii2utf8(\$value));
1254}
1255
1256sub add_utf8_metadata {
1257 my $self = shift (@_);
1258 my ($section, $field, $value) = @_;
1259
1260 my $section_ptr = $self->_lookup_section($section);
1261 if (!defined $section_ptr) {
1262 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1263 return;
1264 }
1265 if (!defined $value) {
1266 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1267 return;
1268 }
1269 if (!defined $field) {
1270 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1271 return;
1272 }
1273
1274 #print STDERR "###$field=$value\n";
1275 # double check that the value is utf-8
1276 if (unicode::ensure_utf8(\$value)) {
1277 print STDERR "doc::add_utf8_metadata: warning: '$field' wasn't utf8\n";
1278 }
1279
1280 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1281}
1282
1283
1284# methods for dealing with text
1285
1286# returns the text for a section
1287sub get_text {
1288 my $self = shift (@_);
1289 my ($section) = @_;
1290
1291 my $section_ptr = $self->_lookup_section($section);
1292 if (!defined $section_ptr) {
1293 print STDERR "doc::get_text couldn't find section " .
1294 "$section\n";
1295 return "";
1296 }
1297
1298 return $section_ptr->{'text'};
1299}
1300
1301# returns the (utf-8 encoded) length of the text for a section
1302sub get_text_length {
1303 my $self = shift (@_);
1304 my ($section) = @_;
1305
1306 my $section_ptr = $self->_lookup_section($section);
1307 if (!defined $section_ptr) {
1308 print STDERR "doc::get_text_length couldn't find section " .
1309 "$section\n";
1310 return 0;
1311 }
1312
1313 return length ($section_ptr->{'text'});
1314}
1315
1316sub delete_text {
1317 my $self = shift (@_);
1318 my ($section) = @_;
1319
1320 my $section_ptr = $self->_lookup_section($section);
1321 if (!defined $section_ptr) {
1322 print STDERR "doc::delete_text couldn't find section " .
1323 "$section\n";
1324 return;
1325 }
1326
1327 $section_ptr->{'text'} = "";
1328}
1329
1330# add_text assumes the text is in (extended) ascii form. For
1331# text which has been already converted to the UTF-8 format
1332# use add_utf8_text.
1333sub add_text {
1334 my $self = shift (@_);
1335 my ($section, $text) = @_;
1336
1337 # convert the text to UTF-8 encoded unicode characters
1338 # and add the text
1339 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1340}
1341
1342
1343# add_utf8_text assumes the text to be added has already
1344# been converted to the UTF-8 encoding. For ascii text use
1345# add_text
1346sub add_utf8_text {
1347 my $self = shift (@_);
1348 my ($section, $text) = @_;
1349
1350 my $section_ptr = $self->_lookup_section($section);
1351 if (!defined $section_ptr) {
1352 print STDERR "doc::add_utf8_text couldn't find section " .
1353 "$section\n";
1354 return;
1355 }
1356
1357 $section_ptr->{'text'} .= $text;
1358}
1359
1360
1361# methods for dealing with associated files
1362
1363# a file is associated with a document, NOT a section.
1364# if section is defined it is noted in the data structure
1365# only so that files associated from a particular section
1366# may be removed later (using delete_section_assoc_files)
1367sub associate_file {
1368 my $self = shift (@_);
1369 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1370 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1371
1372 # remove all associated files with the same name
1373 $self->delete_assoc_file ($assoc_filename);
1374
1375 push (@{$self->{'associated_files'}},
1376 [$real_filename, $assoc_filename, $mime_type, $section]);
1377}
1378
1379# returns a list of associated files in the form
1380# [[real_filename, assoc_filename, mimetype], ...]
1381sub get_assoc_files {
1382 my $self = shift (@_);
1383
1384 return $self->{'associated_files'};
1385}
1386
1387sub delete_section_assoc_files {
1388 my $self = shift (@_);
1389 my ($section) = @_;
1390
1391 my $i=0;
1392 while ($i < scalar (@{$self->{'associated_files'}})) {
1393 if (defined $self->{'associated_files'}->[$i]->[3] &&
1394 $self->{'associated_files'}->[$i]->[3] eq $section) {
1395 splice (@{$self->{'associated_files'}}, $i, 1);
1396 } else {
1397 $i++;
1398 }
1399 }
1400}
1401
1402sub delete_assoc_file {
1403 my $self = shift (@_);
1404 my ($assoc_filename) = @_;
1405
1406 my $i=0;
1407 while ($i < scalar (@{$self->{'associated_files'}})) {
1408 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1409 splice (@{$self->{'associated_files'}}, $i, 1);
1410 } else {
1411 $i++;
1412 }
1413 }
1414}
1415
1416sub reset_nextsection_ptr {
1417 my $self = shift (@_);
1418 my ($section) = @_;
1419
1420 my $section_ptr = $self->_lookup_section($section);
1421 $section_ptr->{'next_subsection'} = 1;
1422}
1423
14241;
Note: See TracBrowser for help on using the repository browser.