source: trunk/gsdl/perllib/doc.pm@ 8906

Last change on this file since 8906 was 8906, checked in by chi, 19 years ago

Modifications of dublin_core element value into the DSpace's format. For example,"dc.Title" will be saved as "title" in dublin_core.xml.

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 36.2 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use unicode;
37use util;
38use ghtml;
39use File::stat;
40##use hashdoc;
41
42# the document type may be indexed_doc, nonindexed_doc, or
43# classification
44
45my $OIDcount = 0;
46
47sub new {
48 my $class = shift (@_);
49 my ($source_filename, $doc_type) = @_;
50
51 my $self = bless {'associated_files'=>[],
52 'subsection_order'=>[],
53 'next_subsection'=>1,
54 'subsections'=>{},
55 'metadata'=>[],
56 'text'=>"",
57 'OIDtype'=>"hash"}, $class;
58
59 #GRB: Added code to set lastmodified for OAI purposes
60 if ((defined $doc_type) && (defined $source_filename) && (-e $source_filename)) {
61 my $file_stat = stat($source_filename);
62 my $mtime = $file_stat->mtime;
63 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
64 }
65 #GRB: end inserted code
66
67 if (defined $source_filename) {
68 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
69
70 if (defined $collect_dir) {
71 my $dirsep = &util::get_dirsep();
72 if ($collect_dir !~ m/$dirsep$/) {
73 $collect_dir .= $dirsep;
74 }
75
76 $collect_dir =~ s/\\/\\\\/g; # escape DOS style file separator
77
78 if ($source_filename =~ /^$collect_dir(.*)$/) {
79 $source_filename = $1;
80 }
81 }
82
83 $self->set_source_filename ($source_filename);
84 }
85
86 $self->set_doc_type ($doc_type) if defined $doc_type;
87
88 return $self;
89}
90
91# clone the $self object
92sub duplicate {
93 my $self = shift (@_);
94
95 my $newobj = {};
96
97 foreach my $k (keys %$self) {
98 $newobj->{$k} = &clone ($self->{$k});
99 }
100
101 bless $newobj, ref($self);
102 return $newobj;
103}
104
105sub clone {
106 my ($from) = @_;
107 my $type = ref ($from);
108
109 if ($type eq "HASH") {
110 my $to = {};
111 foreach my $key (keys %$from) {
112 $to->{$key} = &clone ($from->{$key});
113 }
114 return $to;
115 } elsif ($type eq "ARRAY") {
116 my $to = [];
117 foreach my $v (@$from) {
118 push (@$to, &clone ($v));
119 }
120 return $to;
121 } else {
122 return $from;
123 }
124}
125
126sub set_OIDtype {
127 my $self = shift (@_);
128 my ($type) = @_;
129
130 if ($type =~ /^(hash|incremental|dirname|assigned)$/) {
131 $self->{'OIDtype'} = $type;
132 } else {
133 $self->{'OIDtype'} = "hash";
134 }
135}
136
137sub set_source_filename {
138 my $self = shift (@_);
139 my ($source_filename) = @_;
140
141 $self->set_metadata_element ($self->get_top_section(),
142 "gsdlsourcefilename",
143 $source_filename);
144}
145
146sub set_converted_filename {
147 my $self = shift (@_);
148 my ($converted_filename) = @_;
149
150 $self->set_metadata_element ($self->get_top_section(),
151 "gsdlconvertedfilename",
152 $converted_filename);
153}
154
155
156# returns the source_filename as it was provided
157sub get_source_filename {
158 my $self = shift (@_);
159
160 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
161}
162
163# returns converted filename if available else returns source filename
164sub get_filename_for_hashing {
165 my $self = shift (@_);
166
167 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
168
169 if (!defined $filename) {
170 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
171 }
172 return $filename;
173}
174
175sub set_doc_type {
176 my $self = shift (@_);
177 my ($doc_type) = @_;
178
179 $self->set_metadata_element ($self->get_top_section(),
180 "gsdldoctype",
181 $doc_type);
182}
183
184# returns the source_filename as it was provided
185# the default of "indexed_doc" is used if no document
186# type was provided
187sub get_doc_type {
188 my $self = shift (@_);
189
190 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
191 return $doc_type if (defined $doc_type);
192 return "indexed_doc";
193}
194
195sub _escape_text {
196 my ($text) = @_;
197
198 # special characters in the gml encoding
199 $text =~ s/&/&/g; # this has to be first...
200 $text =~ s/</&lt;/g;
201 $text =~ s/>/&gt;/g;
202 $text =~ s/\"/&quot;/g;
203
204 return $text;
205}
206
207sub buffer_section_xml {
208 my $self = shift (@_);
209 my ($section) = @_;
210
211 my $section_ptr = $self->_lookup_section ($section);
212 return "" unless defined $section_ptr;
213
214 my $all_text = "<Section>\n";
215 $all_text .= " <Description>\n";
216
217 # output metadata
218 foreach my $data (@{$section_ptr->{'metadata'}}) {
219 my $escaped_value = &_escape_text($data->[1]);
220 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
221 }
222
223 $all_text .= " </Description>\n";
224
225 # output the text
226 $all_text .= " <Content>";
227 $all_text .= &_escape_text($section_ptr->{'text'});
228 $all_text .= "</Content>\n";
229
230 # output all the subsections
231 foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
232 $all_text .= $self->buffer_section_xml("$section.$subsection");
233 }
234
235 $all_text .= "</Section>\n";
236
237 # make sure no nasty control characters have snuck through
238 # (XML::Parser will barf on anything it doesn't consider to be
239 # valid UTF-8 text, including things like \c@, \cC etc.)
240 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
241
242 return $all_text;
243}
244
245sub buffer_txt_section_xml {
246 my $self = shift(@_);
247 my ($section) = @_;
248
249 my $section_ptr = $self->_lookup_section ($section);
250
251 return "" unless defined $section_ptr;
252
253 my $all_text = "<Section>\n";
254
255 ##output the text
256 #$all_text .= " <Content>";
257 $all_text .= &_escape_text($section_ptr->{'text'});
258 #$all_text .= " </Content>\n";
259
260
261 #output all the subsections
262 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
263 $all_text .= $self->buffer_txt_section_xml("$section.$subsection");
264 }
265
266 $all_text .= "</Section>\n";
267
268
269 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
270 return $all_text;
271}
272
273sub buffer_mets_fileSection_section_xml() {
274 my $self = shift(@_);
275 my ($section) = @_;
276
277 #$section="" unless defined $section;
278
279
280 my $section_ptr=$self->_lookup_section($section);
281 return "" unless defined $section_ptr;
282
283
284 #**output fileSection by sections
285 my $section_num ="1". $section;
286
287
288 #my $filePath = $doc_Dir . '/doctxt.xml';
289
290 my $filePath = 'doctxt.xml';
291
292 #**output the fileSection details
293 my $all_text = ' <mets:fileGrp ID="FILEGROUP_PRELUDE' . $section_num . '">'. "\n";
294 $all_text .= ' <mets:file MIMETYPE="text/xml" ID="FILE'.$section_num. '">'. "\n";
295 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$filePath.'#xpointer(/Section[';
296
297 my $xpath = "1".$section;
298
299 $xpath =~ s/\./]\/Section[/g;
300
301 $all_text .= $xpath;
302
303 $all_text .= ']/text())" />' . "\n";
304 $all_text .= " </mets:file>\n";
305 $all_text .= " </mets:fileGrp>\n";
306
307
308 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
309 $all_text .= $self->buffer_mets_fileSection_section_xml("$section.$subsection");
310 }
311
312 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
313
314 return $all_text;
315}
316
317sub buffer_mets_fileWhole_section_xml(){
318 my $self = shift(@_);
319 my ($section) = @_;
320
321 my $section_ptr = $self-> _lookup_section($section);
322 return "" unless defined $section_ptr;
323
324 my $all_text="" unless defined $all_txt;
325
326 my ($dirPath)="" unless defined $dirPath;
327 my $fileID=0;
328
329 #** output the fileSection for the whole section
330 #*** get the sourcefile and associative file
331
332 foreach my $data (@{$section_ptr->{'metadata'}}){
333 my $escaped_value = &_escape_text($data->[1]);
334 if ($data->[0] eq "gsdlsourcefilename") {
335 ($dirPath) = $escaped_value =~ m/^(.*)[\/\\][^\/\\]*$/;
336
337 $all_text .= ' <mets:fileGrp ID="default">'."\n";
338 ++$fileID;
339 $all_text .= ' <mets:file MIMETYPE="text/xml" ID="default.'.$fileID.'">'. "\n";
340
341 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$data->[1].'" />'."\n";
342
343 $all_text .= " </mets:file>\n";
344 }
345
346 if ($data->[0] eq "gsdlassocfile"){
347 $escaped_value =~ m/^(.*?):(.*):$/;
348
349 my $assfilePath = $dirPath . '/'. $1;
350 ++$fileID;
351
352 $all_text .= ' <mets:file MIMETYPE="'.$2.'" ID="default.'.$fileID. '">'. "\n";
353 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$assfilePath.'" />'."\n";
354
355 $all_text .= " </mets:file>\n";
356 }
357 }
358 $all_text .= " </mets:fileGrp>\n";
359
360
361 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
362
363 return $all_text;
364}
365
366sub buffer_mets_StruMapSection_section_xml(){
367 my $self = shift(@_);
368 my ($section, $order_numref) = @_;
369
370 $section="" unless defined $section;
371
372
373 my $section_ptr=$self->_lookup_section($section);
374 return "" unless defined $section_ptr;
375
376
377 #***output fileSection by Sections
378 my $section_num ="1". $section;
379 my $dmd_num = $section_num;
380
381 ##**output the dmdSection details
382 #if ($section_num eq "1") {
383 # $dmd_num = "0";
384 #}
385
386 #**output the StruMap details
387
388 my $all_text = ' <mets:div ID="DS'. $section_num .'" TYPE="Section" ORDER="'.$$order_numref++.'" ORDERLABEL="'. $section_num .'" LABEL="';
389
390 $all_text .= $section_num . '" DMDID="DM'.$dmd_num.'">'. "\n";
391
392 $all_text .= ' <mets:fptr FILEID="FILEGROUP_PRELUDE'.$section_num.'" />'. "\n";
393
394
395 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
396 $all_text .= $self->buffer_mets_StruMapSection_section_xml("$section.$subsection", $order_numref);
397 }
398
399 $all_text .= " </mets:div>\n";
400
401 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
402
403 return $all_text;
404}
405
406
407sub buffer_mets_StruMapWhole_section_xml(){
408 my $self = shift(@_);
409 my ($section) = @_;
410
411 my $section_ptr = $self-> _lookup_section($section);
412 return "" unless defined $section_ptr;
413
414 my $all_text="" unless defined $all_txt;
415 my $fileID=0;
416 my $order_num = 0;
417
418 $all_text .= '<mets:structMap ID="All" TYPE="Whole Document" LABEL="All">'."\n";
419 $all_text .= ' <mets:div ID="DSAll" TYPE="Document" ORDER="'.$order_num.'" ORDERLABEL="All" LABEL="Whole Documemt" DMDID="DM1">' . "\n";
420
421
422 #** output the StruMapSection for the whole section
423 #*** get the sourcefile and associative file
424
425 foreach my $data (@{$section_ptr->{'metadata'}}){
426 my $escaped_value = &_escape_text($data->[1]);
427
428 if ($data->[0] eq "gsdlsourcefilename") {
429 ++$fileID;
430 $all_text .= ' <mets:fptr FILEID="default.'.$fileID.'" />'."\n";
431 }
432
433 if ($data->[0] eq "gsdlassocfile"){
434 ++$fileID;
435 $all_text .= ' <mets:fptr FILEID="default.'.$fileID. '" />'. "\n";
436 }
437 }
438 $all_text .= " </mets:div>\n";
439
440 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
441
442 return $all_text;
443}
444
445
446sub buffer_mets_dmdSection_section_xml(){
447 my $self = shift(@_);
448 my ($section) = @_;
449
450 $section="" unless defined $section;
451
452 my $section_ptr=$self->_lookup_section($section);
453 return "" unless defined $section_ptr;
454
455 #***convert section number
456 my $section_num ="1". $section;
457 my $dmd_num = $section_num;
458
459 # #**output the dmdSection details
460 # if ($section_num eq "1") {
461 # $dmd_num = "0";
462 # }
463 my $all_text = '<mets:dmdSec ID="DM'.$dmd_num.'" GROUPID="'.$section_num.'">'. "\n";
464 $all_text .= ' <mets:mdWrap MDTYPE="OTHER" OTHERMDTYPE="gsdl3" ID="gsdl'.$section_num.'">'."\n";
465 $all_text .= " <mets:xmlData>\n";
466 foreach my $data (@{$section_ptr->{'metadata'}}){
467 my $escaped_value = &_escape_text($data->[1]);
468 $all_text .= ' <gsdl3:Metadata name="'. $data->[0].'">'. $escaped_value. "</gsdl3:Metadata>\n";
469 if ($data->[0] eq "dc.Title") {
470 $all_text .= ' <gsdl3:Metadata name="Title">'. $escaped_value."</gsdl3:Metadata>\n";
471 }
472 }
473
474 $all_text .= " </mets:xmlData>\n";
475 $all_text .= " </mets:mdWrap>\n";
476 $all_text .= "</mets:dmdSec>\n";
477
478 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
479 $all_text .= $self->buffer_mets_dmdSection_section_xml("$section.$subsection");
480 }
481
482 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
483
484 return $all_text;
485}
486
487sub output_section {
488 my $self = shift (@_);
489 my ($handle, $section) = @_;
490
491 print $handle $self->buffer_section_xml($section);
492}
493
494#*** print out DSpace dublin_core metadata section
495sub output_dspace_section {
496 my $self = shift (@_);
497 my ($handle, $section) = @_;
498
499 my $section_ptr = $self->_lookup_section ($section);
500 return "" unless defined $section_ptr;
501
502 my $all_text = "<Section>\n";
503 $all_text .= " <Description>\n";
504
505 # output metadata
506 foreach my $data (@{$section_ptr->{'metadata'}}) {
507 my $escaped_value = &_escape_text($data->[1]);
508 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
509 }
510
511 $all_text .= " </Description>\n";
512 $all_text .= "</Section>\n";
513
514 # make sure no nasty control characters have snuck through
515 # (XML::Parser will barf on anything it doesn't consider to be
516 # valid UTF-8 text, including things like \c@, \cC etc.)
517 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
518
519 return $all_text;
520}
521
522#*** print out doctxt.xml file
523sub output_txt_section {
524 my $self = shift (@_);
525 my ($handle, $section) = @_;
526
527 print $handle $self->buffer_txt_section_xml($section);
528}
529
530#*** print out docmets.xml file
531sub output_mets_section {
532 my $self = shift(@_);
533 my ($handle, $section) = @_;
534
535 #***print out the dmdSection
536 print $handle $self->buffer_mets_dmdSection_section_xml($section);
537
538 #***print out the fileSection by sections
539 print $handle "<mets:fileSec>\n";
540 print $handle $self->buffer_mets_fileSection_section_xml($section);
541
542 #***print out the whole fileSection
543 print $handle $self->buffer_mets_fileWhole_section_xml($section);
544 print $handle "</mets:fileSec>\n";
545
546 #***print out the StruMapSection by sections
547 print $handle '<mets:structMap ID="Section" TYPE="Section" LABEL="Section">' . "\n";
548 my $order_num=0;
549 print $handle $self->buffer_mets_StruMapSection_section_xml($section, \$order_num);
550 print $handle "</mets:structMap>\n";
551 print $handle $self->buffer_mets_StruMapWhole_section_xml($section);
552 print $handle "</mets:structMap>\n";
553}
554
555#*** print out dublin_core.xml file
556sub output_dc_section {
557 my $self = shift(@_);
558 my ($handle, $section, $doc_Dir) = @_;
559
560 #***print out the dublin_core
561 $section="" unless defined $section;
562
563 my $section_ptr=$self->_lookup_section($section);
564 return "" unless defined $section_ptr;
565 my $all_text="";
566 foreach my $data (@{$section_ptr->{'metadata'}}){
567 my $escaped_value = &_escape_text($data->[1]);
568 if ($data->[0]=~ /^dc/) {
569 $data->[0] =~ tr/[A-Z]/[a-z]/;
570 $data->[0] =~ /^dc\.(.*)/;
571 my $dc_element = $1;
572 #$all_text .= ' <dcvalue element="'. $data->[0].'" qualifier="#####">'. $escaped_value. "</dcvalue>\n";
573 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
574 }
575 }
576 if ($all_text eq "") {
577 $all_text .= " There is no Dublin Core metatdata in this document\n";
578 }
579 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
580
581 print $handle $all_text;
582}
583
584# look up the reference to the a particular section
585sub _lookup_section {
586 my $self = shift (@_);
587 my ($section) = @_;
588
589 my ($num);
590 my $sectionref = $self;
591
592 while (defined $section && $section ne "") {
593 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
594 $num =~ s/^0+(\d)/$1/; # remove leading 0s
595 $section = "" unless defined $section;
596
597 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
598 $sectionref = $sectionref->{'subsections'}->{$num};
599 } else {
600 return undef;
601 }
602 }
603
604 return $sectionref;
605}
606
607# calculate OID by hashing the contents of the document
608sub _calc_OID {
609 my $self = shift (@_);
610 my ($filename) = @_;
611
612 my $osexe = &util::get_os_exe();
613
614 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
615 $ENV{'GSDLOS'},"hashfile$osexe");
616
617 my $result = "NULL";
618
619 if (-e "$hashfile_exe") {
620# $result = `\"$hashfile_exe\" \"$filename\"`;
621 $result = `hashfile$osexe \"$filename\"`;
622 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
623
624 } else {
625 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
626 }
627 return "HASH$result";
628}
629
630# methods dealing with OID, not groups of them.
631
632# if $OID is not provided one is calculated
633sub set_OID {
634 my $self = shift (@_);
635 my ($OID) = @_;
636
637 my $use_hash_oid = 0;
638 # if an OID wasn't provided claculate one
639 if (!defined $OID) {
640 $OID = "NULL";
641 if ($self->{'OIDtype'} eq "hash") {
642 $use_hash_oid = 1;
643 } elsif ($self->{'OIDtype'} eq "incremental") {
644 $OID = "D" . $OIDcount;
645 $OIDcount ++;
646
647 } elsif ($self->{'OIDtype'} eq "dirname") {
648 $OID = 'J';
649 my $filename = $self->get_source_filename();
650 if (defined($filename)) { # && -e $filename) {
651 $OID = &File::Basename::dirname($filename);
652 if (defined $OID) {
653 $OID = 'J'.&File::Basename::basename($OID);
654 $OID =~ s/\.//; #remove any periods
655 } else {
656 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
657 $use_hash_oid = 1;
658 }
659 } else {
660 print STDERR "Failed to find filename, generating hash id\n";
661 $use_hash_oid = 1;
662 }
663
664 } elsif ($self->{'OIDtype'} eq "assigned") {
665 my $identifier = $self->get_metadata_element ($self->get_top_section(), "dc.Identifier");
666 if (defined $identifier && $identifier ne "") {
667 $OID = "D" . $self->get_metadata_element ($self->get_top_section(), "dc.Identifier");
668 $OID =~ s/\.//; #remove any periods
669 } else {
670 # need a hash id
671 print STDERR "no dc.Identifier found, generating hash id\n";
672 $use_hash_oid = 1;
673 }
674
675 } else {
676 $use_hash_oid = 1;
677 }
678
679 if ($use_hash_oid) {
680
681 # "hash" OID - feed file to hashfile.exe
682 #my $filename = $self->get_source_filename();
683 # we want to use the converted file for hashing if available
684 # cos its quicker
685 my $filename = $self->get_filename_for_hashing();
686
687 if (defined($filename) && -e $filename) {
688 $OID = $self->_calc_OID ($filename);
689 } else {
690 $filename = &util::get_tmp_filename();
691 if (!open (OUTFILE, ">$filename")) {
692 print STDERR "doc::set_OID could not write to $filename\n";
693 } else {
694 $self->output_section('OUTFILE', $self->get_top_section(),
695 undef, 1);
696 close (OUTFILE);
697 }
698 $OID = $self->_calc_OID ($filename);
699 &util::rm ($filename);
700 }
701 }
702 }
703 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
704}
705
706# this uses hashdoc (embedded c thingy) which is faster but still
707# needs a little work to be suffiently stable
708sub ___set_OID {
709 my $self = shift (@_);
710 my ($OID) = @_;
711
712 # if an OID wasn't provided then calculate hash value based on document
713 if (!defined $OID)
714 {
715 my $hash_text = $self->buffer_section_gml($self->get_top_section(),
716 undef, 1);
717 my $hash_len = length($hash_text);
718
719 $OID = &hashdoc::buffer($hash_text,$hash_len);
720 }
721
722 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
723}
724
725# returns the OID for this document
726sub get_OID {
727 my $self = shift (@_);
728 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
729 return $OID if (defined $OID);
730 return "NULL";
731}
732
733sub delete_OID {
734 my $self = shift (@_);
735
736 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
737}
738
739
740# methods for manipulating section names
741
742# returns the name of the top-most section (the top
743# level of the document
744sub get_top_section {
745 my $self = shift (@_);
746
747 return "";
748}
749
750# returns a section
751sub get_parent_section {
752 my $self = shift (@_);
753 my ($section) = @_;
754
755 $section =~ s/(^|\.)\d+$//;
756
757 return $section;
758}
759
760# returns the first child section (or the end child
761# if there isn't any)
762sub get_begin_child {
763 my $self = shift (@_);
764 my ($section) = @_;
765
766 my $section_ptr = $self->_lookup_section($section);
767 return "" unless defined $section_ptr;
768
769 if (defined $section_ptr->{'subsection_order'}->[0]) {
770 return "$section.$section_ptr->{'subsection_order'}->[0]";
771 }
772
773 return $self->get_end_child ($section);
774}
775
776# returns the next child of a parent section
777sub get_next_child {
778 my $self = shift (@_);
779 my ($section) = @_;
780
781 my $parent_section = $self->get_parent_section($section);
782 my $parent_section_ptr = $self->_lookup_section($parent_section);
783 return undef unless defined $parent_section_ptr;
784
785 my ($section_num) = $section =~ /(\d+)$/;
786 return undef unless defined $section_num;
787
788 my $i = 0;
789 my $section_order = $parent_section_ptr->{'subsection_order'};
790 while ($i < scalar(@$section_order)) {
791 last if $section_order->[$i] eq $section_num;
792 $i++;
793 }
794
795 $i++; # the next child
796 if ($i < scalar(@$section_order)) {
797 return $section_order->[$i] if $parent_section eq "";
798 return "$parent_section.$section_order->[$i]";
799 }
800
801 # no more sections in this level
802 return undef;
803}
804
805# returns a reference to a list of children
806sub get_children {
807 my $self = shift (@_);
808 my ($section) = @_;
809
810 my $section_ptr = $self->_lookup_section($section);
811 return [] unless defined $section_ptr;
812
813 my @children = @{$section_ptr->{'subsection_order'}};
814
815 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
816 return \@children;
817}
818
819# returns the child section one past the last one (which
820# is coded as "0")
821sub get_end_child {
822 my $self = shift (@_);
823 my ($section) = @_;
824
825 return $section . ".0" unless $section eq "";
826 return "0";
827}
828
829# returns the next section in book order
830sub get_next_section {
831 my $self = shift (@_);
832 my ($section) = @_;
833
834 return undef unless defined $section;
835
836 my $section_ptr = $self->_lookup_section($section);
837 return undef unless defined $section_ptr;
838
839 # first try to find first child
840 if (defined $section_ptr->{'subsection_order'}->[0]) {
841 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
842 return "$section.$section_ptr->{'subsection_order'}->[0]";
843 }
844
845 do {
846 # try to find sibling
847 my $next_child = $self->get_next_child ($section);
848 return $next_child if (defined $next_child);
849
850 # move up one level
851 $section = $self->get_parent_section ($section);
852 } while $section =~ /\d/;
853
854 return undef;
855}
856
857sub is_leaf_section {
858 my $self = shift (@_);
859 my ($section) = @_;
860
861 my $section_ptr = $self->_lookup_section($section);
862 return 1 unless defined $section_ptr;
863
864 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
865}
866
867# methods for dealing with sections
868
869# returns the name of the inserted section
870sub insert_section {
871 my $self = shift (@_);
872 my ($before_section) = @_;
873
874 # get the child to insert before and its parent section
875 my $parent_section = "";
876 my $before_child = "0";
877 my @before_section = split (/\./, $before_section);
878 if (scalar(@before_section) > 0) {
879 $before_child = pop (@before_section);
880 $parent_section = join (".", @before_section);
881 }
882
883 my $parent_section_ptr = $self->_lookup_section($parent_section);
884 if (!defined $parent_section_ptr) {
885 print STDERR "doc::insert_section couldn't find parent section " .
886 "$parent_section\n";
887 return;
888 }
889
890 # get the next section number
891 my $section_num = $parent_section_ptr->{'next_subsection'}++;
892
893 my $i = 0;
894 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
895 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
896 $i++;
897 }
898
899 # insert the section number into the order list
900 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
901
902 # add this section to the parent section
903 my $section_ptr = {'subsection_order'=>[],
904 'next_subsection'=>1,
905 'subsections'=>{},
906 'metadata'=>[],
907 'text'=>""};
908 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
909
910 # work out the full section number
911 my $section = $parent_section;
912 $section .= "." unless $section eq "";
913 $section .= $section_num;
914
915 return $section;
916}
917
918# creates a pre-named section
919sub create_named_section {
920 my $self = shift (@_);
921 my ($mastersection) = @_;
922
923 my ($num);
924 my $section = $mastersection;
925 my $sectionref = $self;
926
927 while ($section ne "") {
928 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
929 $num =~ s/^0+(\d)/$1/; # remove leading 0s
930 $section = "" unless defined $section;
931
932 if (defined $num) {
933 if (!defined $sectionref->{'subsections'}->{$num}) {
934 push (@{$sectionref->{'subsection_order'}}, $num);
935 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
936 'next_subsection'=>1,
937 'subsections'=>{},
938 'metadata'=>[],
939 'text'=>""};
940 if ($num >= $sectionref->{'next_subsection'}) {
941 $sectionref->{'next_subsection'} = $num + 1;
942 }
943 }
944 $sectionref = $sectionref->{'subsections'}->{$num};
945
946 } else {
947 print STDERR "doc::create_named_section couldn't create section ";
948 print STDERR "$mastersection\n";
949 last;
950 }
951 }
952}
953
954# returns a reference to a list of subsections
955sub list_subsections {
956 my $self = shift (@_);
957 my ($section) = @_;
958
959 my $section_ptr = $self->_lookup_section ($section);
960 if (!defined $section_ptr) {
961 print STDERR "doc::list_subsections couldn't find section $section\n";
962 return [];
963 }
964
965 return [@{$section_ptr->{'subsection_order'}}];
966}
967
968sub delete_section {
969 my $self = shift (@_);
970 my ($section) = @_;
971
972# my $section_ptr = {'subsection_order'=>[],
973# 'next_subsection'=>1,
974# 'subsections'=>{},
975# 'metadata'=>[],
976# 'text'=>""};
977
978 # if this is the top section reset everything
979 if ($section eq "") {
980 $self->{'subsection_order'} = [];
981 $self->{'subsections'} = {};
982 $self->{'metadata'} = [];
983 $self->{'text'} = "";
984 return;
985 }
986
987 # find the parent of the section to delete
988 my $parent_section = "";
989 my $child = "0";
990 my @section = split (/\./, $section);
991 if (scalar(@section) > 0) {
992 $child = pop (@section);
993 $parent_section = join (".", @section);
994 }
995
996 my $parent_section_ptr = $self->_lookup_section($parent_section);
997 if (!defined $parent_section_ptr) {
998 print STDERR "doc::delete_section couldn't find parent section " .
999 "$parent_section\n";
1000 return;
1001 }
1002
1003 # remove this section from the subsection_order list
1004 my $i = 0;
1005 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
1006 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
1007 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
1008 last;
1009 }
1010 $i++;
1011 }
1012
1013 # remove this section from the subsection hash
1014 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
1015 undef $parent_section_ptr->{'subsections'}->{$child};
1016 }
1017}
1018
1019#--
1020# methods for dealing with metadata
1021
1022# set_metadata_element and get_metadata_element are for metadata
1023# which should only have one value. add_meta_data and get_metadata
1024# are for metadata which can have more than one value.
1025
1026# returns the first metadata value which matches field
1027
1028# This version of get metadata element works much like the one above,
1029# except it allows for the namespace portion of a metadata element to
1030# be ignored, thus if you are searching for dc.Title, the first piece
1031# of matching metadata ending with the name Title (once any namespace
1032# is removed) would be returned.
1033# 28-11-2003 John Thompson
1034sub get_metadata_element {
1035 my $self = shift (@_);
1036 my ($section, $field, $ignore_namespace) = @_;
1037 my ($data);
1038
1039 $ignore_namespace = 0 unless defined $ignore_namespace;
1040
1041 my $section_ptr = $self->_lookup_section($section);
1042 if (!defined $section_ptr) {
1043 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
1044 return;
1045 }
1046
1047 # Remove the any namespace if we are being told to ignore them
1048 if($ignore_namespace) {
1049 $field =~ s/^\w*\.//;
1050 }
1051
1052 foreach $data (@{$section_ptr->{'metadata'}}) {
1053
1054 my $data_name = $data->[0];
1055 # Remove the any namespace if we are being told to ignore them
1056 if($ignore_namespace) {
1057 $data_name =~ s/^\w*\.//;
1058 }
1059
1060 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
1061 }
1062
1063 return undef; # was not found
1064}
1065
1066# returns a list of the form [value1, value2, ...]
1067sub get_metadata {
1068 my $self = shift (@_);
1069 my ($section, $field, $ignore_namespace) = @_;
1070 my ($data);
1071
1072 $ignore_namespace = 0 unless defined $ignore_namespace;
1073
1074 my $section_ptr = $self->_lookup_section($section);
1075 if (!defined $section_ptr) {
1076 print STDERR "doc::get_metadata couldn't find section ",
1077 $section, "\n";
1078 return;
1079 }
1080
1081 # Remove the any namespace if we are being told to ignore them
1082 if($ignore_namespace) {
1083 $field =~ s/^\w*\.//;
1084 }
1085
1086 my @metadata = ();
1087 foreach $data (@{$section_ptr->{'metadata'}}) {
1088
1089 my $data_name = $data->[0];
1090 # Remove the any namespace if we are being told to ignore them
1091 if($ignore_namespace) {
1092 $data_name =~ s/^\w*\.//;
1093 }
1094
1095 push (@metadata, $data->[1]) if ($data_name eq $field);
1096 }
1097
1098 return \@metadata;
1099}
1100
1101# returns a list of the form [[field,value],[field,value],...]
1102sub get_all_metadata {
1103 my $self = shift (@_);
1104 my ($section) = @_;
1105
1106 my $section_ptr = $self->_lookup_section($section);
1107 if (!defined $section_ptr) {
1108 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
1109 return;
1110 }
1111
1112 return $section_ptr->{'metadata'};
1113}
1114
1115# $value is optional
1116sub delete_metadata {
1117 my $self = shift (@_);
1118 my ($section, $field, $value) = @_;
1119
1120 my $section_ptr = $self->_lookup_section($section);
1121 if (!defined $section_ptr) {
1122 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
1123 return;
1124 }
1125
1126 my $i = 0;
1127 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1128 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1129 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1130 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1131 } else {
1132 $i++;
1133 }
1134 }
1135}
1136
1137sub delete_all_metadata {
1138 my $self = shift (@_);
1139 my ($section) = @_;
1140
1141 my $section_ptr = $self->_lookup_section($section);
1142 if (!defined $section_ptr) {
1143 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1144 return;
1145 }
1146
1147 $section_ptr->{'metadata'} = [];
1148}
1149
1150sub set_metadata_element {
1151 my $self = shift (@_);
1152 my ($section, $field, $value) = @_;
1153
1154 $self->set_utf8_metadata_element ($section, $field,
1155 &unicode::ascii2utf8(\$value));
1156}
1157
1158# set_utf8_metadata_element assumes the text has already been
1159# converted to the UTF-8 encoding.
1160sub set_utf8_metadata_element {
1161 my $self = shift (@_);
1162 my ($section, $field, $value) = @_;
1163
1164 $self->delete_metadata ($section, $field);
1165 $self->add_utf8_metadata ($section, $field, $value);
1166}
1167
1168
1169# add_metadata assumes the text is in (extended) ascii form. For
1170# text which has already been converted to the UTF-8 format use
1171# add_utf8_metadata.
1172sub add_metadata {
1173 my $self = shift (@_);
1174 my ($section, $field, $value) = @_;
1175
1176 $self->add_utf8_metadata ($section, $field,
1177 &unicode::ascii2utf8(\$value));
1178}
1179
1180sub add_utf8_metadata {
1181 my $self = shift (@_);
1182 my ($section, $field, $value) = @_;
1183
1184 my $section_ptr = $self->_lookup_section($section);
1185 if (!defined $section_ptr) {
1186 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1187 return;
1188 }
1189 if (!defined $value) {
1190 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1191 return;
1192 }
1193 if (!defined $field) {
1194 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1195 return;
1196 }
1197
1198 #print STDERR "###$field=$value\n";
1199 # double check that the value is utf-8
1200 if (unicode::ensure_utf8(\$value)) {
1201 print STDERR "doc::add_utf8_metadata: warning: '$field' wasn't utf8\n";
1202 }
1203
1204 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1205}
1206
1207
1208# methods for dealing with text
1209
1210# returns the text for a section
1211sub get_text {
1212 my $self = shift (@_);
1213 my ($section) = @_;
1214
1215 my $section_ptr = $self->_lookup_section($section);
1216 if (!defined $section_ptr) {
1217 print STDERR "doc::get_text couldn't find section " .
1218 "$section\n";
1219 return "";
1220 }
1221
1222 return $section_ptr->{'text'};
1223}
1224
1225# returns the (utf-8 encoded) length of the text for a section
1226sub get_text_length {
1227 my $self = shift (@_);
1228 my ($section) = @_;
1229
1230 my $section_ptr = $self->_lookup_section($section);
1231 if (!defined $section_ptr) {
1232 print STDERR "doc::get_text_length couldn't find section " .
1233 "$section\n";
1234 return 0;
1235 }
1236
1237 return length ($section_ptr->{'text'});
1238}
1239
1240sub delete_text {
1241 my $self = shift (@_);
1242 my ($section) = @_;
1243
1244 my $section_ptr = $self->_lookup_section($section);
1245 if (!defined $section_ptr) {
1246 print STDERR "doc::delete_text couldn't find section " .
1247 "$section\n";
1248 return;
1249 }
1250
1251 $section_ptr->{'text'} = "";
1252}
1253
1254# add_text assumes the text is in (extended) ascii form. For
1255# text which has been already converted to the UTF-8 format
1256# use add_utf8_text.
1257sub add_text {
1258 my $self = shift (@_);
1259 my ($section, $text) = @_;
1260
1261 # convert the text to UTF-8 encoded unicode characters
1262 # and add the text
1263 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1264}
1265
1266
1267# add_utf8_text assumes the text to be added has already
1268# been converted to the UTF-8 encoding. For ascii text use
1269# add_text
1270sub add_utf8_text {
1271 my $self = shift (@_);
1272 my ($section, $text) = @_;
1273
1274 my $section_ptr = $self->_lookup_section($section);
1275 if (!defined $section_ptr) {
1276 print STDERR "doc::add_utf8_text couldn't find section " .
1277 "$section\n";
1278 return;
1279 }
1280
1281 $section_ptr->{'text'} .= $text;
1282}
1283
1284
1285# methods for dealing with associated files
1286
1287# a file is associated with a document, NOT a section.
1288# if section is defined it is noted in the data structure
1289# only so that files associated from a particular section
1290# may be removed later (using delete_section_assoc_files)
1291sub associate_file {
1292 my $self = shift (@_);
1293 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1294 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1295
1296 # remove all associated files with the same name
1297 $self->delete_assoc_file ($assoc_filename);
1298
1299 push (@{$self->{'associated_files'}},
1300 [$real_filename, $assoc_filename, $mime_type, $section]);
1301}
1302
1303# returns a list of associated files in the form
1304# [[real_filename, assoc_filename, mimetype], ...]
1305sub get_assoc_files {
1306 my $self = shift (@_);
1307
1308 return $self->{'associated_files'};
1309}
1310
1311sub delete_section_assoc_files {
1312 my $self = shift (@_);
1313 my ($section) = @_;
1314
1315 my $i=0;
1316 while ($i < scalar (@{$self->{'associated_files'}})) {
1317 if (defined $self->{'associated_files'}->[$i]->[3] &&
1318 $self->{'associated_files'}->[$i]->[3] eq $section) {
1319 splice (@{$self->{'associated_files'}}, $i, 1);
1320 } else {
1321 $i++;
1322 }
1323 }
1324}
1325
1326sub delete_assoc_file {
1327 my $self = shift (@_);
1328 my ($assoc_filename) = @_;
1329
1330 my $i=0;
1331 while ($i < scalar (@{$self->{'associated_files'}})) {
1332 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1333 splice (@{$self->{'associated_files'}}, $i, 1);
1334 } else {
1335 $i++;
1336 }
1337 }
1338}
1339
1340sub reset_nextsection_ptr {
1341 my $self = shift (@_);
1342 my ($section) = @_;
1343
1344 my $section_ptr = $self->_lookup_section($section);
1345 $section_ptr->{'next_subsection'} = 1;
1346}
1347
13481;
Note: See TracBrowser for help on using the repository browser.