source: trunk/gsdl/perllib/doc.pm@ 12401

Last change on this file since 12401 was 12327, checked in by shaoqun, 18 years ago

added methods used by plugout

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 44.0 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use unicode;
37use util;
38use ghtml;
39use File::stat;
40##use hashdoc;
41
42# the document type may be indexed_doc, nonindexed_doc, or
43# classification
44
45my $OIDcount = 0;
46
47sub new {
48 my $class = shift (@_);
49 my ($source_filename, $doc_type) = @_;
50
51 my $self = bless {'associated_files'=>[],
52 'subsection_order'=>[],
53 'next_subsection'=>1,
54 'subsections'=>{},
55 'metadata'=>[],
56 'text'=>"",
57 'OIDtype'=>"hash"}, $class;
58
59 # used to set lastmodified here, but this can screw up the HASH ids, so
60 # the docsave processor now calls set_lastmodified
61
62 if (defined $source_filename) {
63 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
64
65 if (defined $collect_dir) {
66 my $dirsep = &util::get_dirsep();
67 if ($collect_dir !~ m/$dirsep$/) {
68 $collect_dir .= $dirsep;
69 }
70
71 $collect_dir =~ s/\\/\\\\/g; # escape DOS style file separator
72
73 # if from within GSDLCOLLECTDIR, then remove directory prefix
74 # so source_filename is realative to it. This is done to aid
75 # portability, i.e. the collection can be moved to somewhere
76 # else on the file system and the archives directory will still
77 # work. This is needed, for example in the applet version of
78 # GLI where GSDLHOME/collect on the server will be different to
79 # the collect directory of the remove user. Of course,
80 # GSDLCOLLECTDIR subsequently needs to be put back on to turn
81 # it back into a full pathname.
82
83 if ($source_filename =~ /^$collect_dir(.*)$/) {
84 $source_filename = $1;
85 }
86 }
87
88 $self->set_source_filename ($source_filename);
89 }
90
91 $self->set_doc_type ($doc_type) if defined $doc_type;
92
93 return $self;
94}
95# set lastmodified for OAI purposes, added by GRB, moved by kjdon
96sub set_lastmodified {
97 my $self = shift (@_);
98
99 my $source_filename = $self->get_source_filename();
100 if ((defined $self->get_metadata_element ($self->get_top_section(), "gsdldoctype")) &&
101 (defined $source_filename) && (-e $source_filename)) {
102 my $file_stat = stat($source_filename);
103 my $mtime = $file_stat->mtime;
104 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
105 }
106}
107
108# clone the $self object
109sub duplicate {
110 my $self = shift (@_);
111
112 my $newobj = {};
113
114 foreach my $k (keys %$self) {
115 $newobj->{$k} = &clone ($self->{$k});
116 }
117
118 bless $newobj, ref($self);
119 return $newobj;
120}
121
122sub clone {
123 my ($from) = @_;
124 my $type = ref ($from);
125
126 if ($type eq "HASH") {
127 my $to = {};
128 foreach my $key (keys %$from) {
129 $to->{$key} = &clone ($from->{$key});
130 }
131 return $to;
132 } elsif ($type eq "ARRAY") {
133 my $to = [];
134 foreach my $v (@$from) {
135 push (@$to, &clone ($v));
136 }
137 return $to;
138 } else {
139 return $from;
140 }
141}
142
143sub set_OIDtype {
144 my $self = shift (@_);
145 my ($type, $metadata) = @_;
146
147 if (defined $type && $type =~ /^(hash|incremental|dirname|assigned)$/) {
148 $self->{'OIDtype'} = $type;
149 } else {
150 $self->{'OIDtype'} = "hash";
151 }
152 if ($type =~ /^assigned$/) {
153 if (defined $metadata) {
154 $self->{'OIDmetadata'} = $metadata;
155 } else {
156 $self->{'OIDmetadata'} = "dc.Identifier";
157 }
158 }
159}
160
161sub set_source_filename {
162 my $self = shift (@_);
163 my ($source_filename) = @_;
164
165 $self->set_metadata_element ($self->get_top_section(),
166 "gsdlsourcefilename",
167 $source_filename);
168}
169
170sub set_converted_filename {
171 my $self = shift (@_);
172 my ($converted_filename) = @_;
173
174 $self->set_metadata_element ($self->get_top_section(),
175 "gsdlconvertedfilename",
176 $converted_filename);
177}
178
179
180# returns the source_filename as it was provided
181sub get_source_filename {
182 my $self = shift (@_);
183
184 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
185}
186
187# returns converted filename if available else returns source filename
188sub get_filename_for_hashing {
189 my $self = shift (@_);
190
191 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
192
193 if (!defined $filename) {
194 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
195 # if NULPlug processed file, then don't give a filename
196 if (defined $plugin_name && $plugin_name eq "NULPlug") {
197 $filename = undef;
198 } else {
199 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
200 }
201 }
202 return $filename;
203}
204
205sub set_doc_type {
206 my $self = shift (@_);
207 my ($doc_type) = @_;
208
209 $self->set_metadata_element ($self->get_top_section(),
210 "gsdldoctype",
211 $doc_type);
212}
213
214# returns the gsdldoctype as it was provided
215# the default of "indexed_doc" is used if no document
216# type was provided
217sub get_doc_type {
218 my $self = shift (@_);
219
220 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
221 return $doc_type if (defined $doc_type);
222 return "indexed_doc";
223}
224
225sub _escape_text {
226 my ($text) = @_;
227 # special characters in the gml encoding
228 $text =~ s/&/&/g; # this has to be first...
229 $text =~ s/</&lt;/g;
230 $text =~ s/>/&gt;/g;
231 $text =~ s/\"/&quot;/g;
232
233 return $text;
234}
235
236sub buffer_section_xml {
237 my $self = shift (@_);
238 my ($section) = @_;
239
240 my $section_ptr = $self->_lookup_section ($section);
241 return "" unless defined $section_ptr;
242
243 my $all_text = "<Section>\n";
244 $all_text .= " <Description>\n";
245
246 # output metadata
247 foreach my $data (@{$section_ptr->{'metadata'}}) {
248 my $escaped_value = &_escape_text($data->[1]);
249 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
250 }
251
252 $all_text .= " </Description>\n";
253
254 # output the text
255 $all_text .= " <Content>";
256 $all_text .= &_escape_text($section_ptr->{'text'});
257 $all_text .= "</Content>\n";
258
259 # output all the subsections
260 foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
261 $all_text .= $self->buffer_section_xml("$section.$subsection");
262 }
263
264 $all_text .= "</Section>\n";
265
266 # make sure no nasty control characters have snuck through
267 # (XML::Parser will barf on anything it doesn't consider to be
268 # valid UTF-8 text, including things like \c@, \cC etc.)
269 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
270
271 return $all_text;
272}
273
274sub buffer_txt_section_xml {
275 my $self = shift(@_);
276 my ($section) = @_;
277
278 my $section_ptr = $self->_lookup_section ($section);
279
280 return "" unless defined $section_ptr;
281
282 my $all_text = "<Section>\n";
283
284 ##output the text
285 #$all_text .= " <Content>";
286 $all_text .= &_escape_text($section_ptr->{'text'});
287 #$all_text .= " </Content>\n";
288
289
290 #output all the subsections
291 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
292 $all_text .= $self->buffer_txt_section_xml("$section.$subsection");
293 }
294
295 $all_text .= "</Section>\n";
296
297
298 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
299 return $all_text;
300}
301
302sub buffer_mets_fileSection_section_xml() {
303 my $self = shift(@_);
304 my ($section,$version) = @_;
305
306 #$section="" unless defined $section;
307
308
309 my $section_ptr=$self->_lookup_section($section);
310 return "" unless defined $section_ptr;
311
312
313 # output fileSection by sections
314 my $section_num ="1". $section;
315
316 my $filePath = 'doctxt.xml';
317
318 my $opt_owner_id = "";
319 if ($version eq "fedora") {
320 $opt_owner_id = "OWNERID=\"M\"";
321 }
322
323 # output the fileSection details
324 my $all_text = ' <mets:fileGrp ID="FILEGROUP_PRELUDE' . $section_num . '">'. "\n";
325 $all_text .= " <mets:file MIMETYPE=\"text/xml\" ID=\"FILE$section_num\" $opt_owner_id >\n";
326 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$filePath.'#xpointer(/Section[';
327
328 my $xpath = "1".$section;
329 $xpath =~ s/\./]\/Section[/g;
330
331 $all_text .= $xpath;
332
333 $all_text .= ']/text())" xlink:title="Hierarchical Document Structure"/>' . "\n";
334 $all_text .= " </mets:file>\n";
335 $all_text .= " </mets:fileGrp>\n";
336
337
338 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
339 $all_text .= $self->buffer_mets_fileSection_section_xml("$section.$subsection",$version);
340 }
341
342 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
343
344 return $all_text;
345}
346
347sub buffer_mets_fileWhole_section_xml(){
348 my $self = shift(@_);
349 my ($section,$version,$working_dir) = @_;
350
351 my $section_ptr = $self-> _lookup_section($section);
352 return "" unless defined $section_ptr;
353
354 my $all_text="" unless defined $all_txt;
355
356 my $fileID=0;
357
358 # Output the fileSection for the whole section
359 # => get the sourcefile and associative file
360
361 my $id_root = "";
362 my $opt_owner_id = "";
363
364 if ($version eq "fedora") {
365 $opt_owner_id = "OWNERID=\"M\"";
366 }
367 else {
368 $id_root = "default";
369 }
370
371 if ($version ne "fedora") {
372 $all_text .= " <mets:fileGrp ID=\"$id_root\">\n";
373 }
374
375 foreach my $data (@{$section_ptr->{'metadata'}}){
376 my $escaped_value = &_escape_text($data->[1]);
377
378 if (($data->[0] eq "gsdlsourcefilename") && ($version ne "fedora")) {
379 my ($dirPath) = $escaped_value =~ m/^(.*)[\/\\][^\/\\]*$/;
380
381 ++$fileID;
382 $all_text .= " <mets:file MIMETYPE=\"text/xml\" ID=\"$id_root.$fileID\" $opt_owner_id >\n";
383
384 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$data->[1].'" />'."\n";
385
386 $all_text .= " </mets:file>\n";
387 }
388
389 if ($data->[0] eq "gsdlassocfile"){
390
391 $escaped_value =~ m/^(.*?):(.*):(.*)$/;
392 my $assoc_file = $1;
393 my $mime_type = $2;
394 my $assoc_dir = $3;
395
396 if ($version eq "fedora") {
397 $id_root = $assoc_file;
398 $id_root =~ s/\//_/g;
399 $all_text .= " <mets:fileGrp ID=\"$id_root\">\n";
400 }
401
402 my $assfilePath = ($assoc_dir eq "") ? $assoc_file : "$assoc_dir/$assoc_file";
403 ++$fileID;
404
405 my $mime_attr = "MIMETYPE=\"$mime_type\"";
406 my $xlink_title = "xlink:title=\"$assoc_file\"";
407
408 my $id_attr;
409 my $xlink_href;
410
411 if ($version eq "fedora") {
412 $id_attr = "ID=\"$id_root.0\"";
413
414 my $fedora_prefix = $ENV{'FEDORA_PREFIX'};
415 if (!defined $fedora_prefix) {
416 $xlink_href = "xlink:href=\"$assfilePath\"";
417 }
418 else
419 {
420 my $gsdlhome = $ENV{'GSDLHOME'};
421 my $gsdl_href = "$working_dir/$assfilePath";
422
423 $gsdl_href =~ s/^$gsdlhome(\/)?//;
424 $gsdl_href = "/gsdl/$gsdl_href";
425
426 my $fserver = $ENV{'FEDORA_HOSTNAME'};
427 my $fport = $ENV{'FEDORA_SERVER_PORT'};
428
429 my $fdomain = "http://$fserver:$fport";
430 $xlink_href = "xlink:href=\"$fdomain$gsdl_href\"";
431 }
432
433 my $top_section = $self->get_top_section();
434 my $id = $self->get_metadata_element($top_section,"Identifier");
435 }
436 else {
437 $id_attr = "ID=\"$id_root.$fileID\"";
438 $xlink_href = "xlink:href=\"$assfilePath\"";
439 }
440
441 $all_text .= " <mets:file $mime_attr $id_attr $opt_owner_id >\n";
442 $all_text .= " <mets:FLocat LOCTYPE=\"URL\" $xlink_href $xlink_title />\n";
443
444 $all_text .= " </mets:file>\n";
445
446 if ($version eq "fedora") {
447 $all_text .= " </mets:fileGrp>\n";
448 }
449
450 }
451 }
452
453 if ($version ne "fedora") {
454 $all_text .= " </mets:fileGrp>\n";
455 }
456
457 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
458
459 return $all_text;
460}
461
462sub buffer_mets_StructMapSection_section_xml(){
463 my $self = shift(@_);
464 my ($section, $order_numref) = @_;
465
466 $section="" unless defined $section;
467
468
469 my $section_ptr=$self->_lookup_section($section);
470 return "" unless defined $section_ptr;
471
472
473 # output fileSection by Sections
474 my $section_num ="1". $section;
475 my $dmd_num = $section_num;
476
477 ##**output the dmdSection details
478 #if ($section_num eq "1") {
479 # $dmd_num = "0";
480 #}
481
482 #**output the StructMap details
483
484 my $dmdid_attr = "DM$dmd_num";
485
486 my $all_text = " <mets:div ID=\"DS$section_num\" TYPE=\"Section\" \n";
487 $all_text .= ' ORDER="'.$$order_numref++.'" ORDERLABEL="'. $section_num .'" '."\n";
488 $all_text .= " LABEL=\"$section_num\" DMDID=\"$dmdid_attr\">\n";
489
490 $all_text .= ' <mets:fptr FILEID="FILEGROUP_PRELUDE'.$section_num.'" />'. "\n";
491
492
493 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
494 $all_text .= $self->buffer_mets_StructMapSection_section_xml("$section.$subsection", $order_numref);
495 }
496
497 $all_text .= " </mets:div>\n";
498
499 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
500
501 return $all_text;
502}
503
504
505sub buffer_mets_StructMapWhole_section_xml(){
506 my $self = shift(@_);
507 my ($section) = @_;
508
509 my $section_ptr = $self-> _lookup_section($section);
510 return "" unless defined $section_ptr;
511
512 my $all_text="" unless defined $all_txt;
513 my $fileID=0;
514 my $order_num = 0;
515
516 $all_text .= ' <mets:div ID="DSAll" TYPE="Document" ORDER="'.$order_num.'" ORDERLABEL="All" LABEL="Whole Documemt" DMDID="DM1">' . "\n";
517
518 #** output the StructMapSection for the whole section
519 # get the sourcefile and associative file
520
521 foreach my $data (@{$section_ptr->{'metadata'}}){
522 my $escaped_value = &_escape_text($data->[1]);
523
524 if ($data->[0] eq "gsdlsourcefilename") {
525 ++$fileID;
526 $all_text .= ' <mets:fptr FILEID="default.'.$fileID.'" />'."\n";
527 }
528
529 if ($data->[0] eq "gsdlassocfile"){
530 ++$fileID;
531 $all_text .= ' <mets:fptr FILEID="default.'.$fileID. '" />'. "\n";
532 }
533 }
534 $all_text .= " </mets:div>\n";
535
536 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
537
538 return $all_text;
539}
540
541
542sub buffer_mets_dmdSection_section_xml(){
543 my $self = shift(@_);
544 my ($section,$version) = @_;
545
546 $section="" unless defined $section;
547
548 my $section_ptr=$self->_lookup_section($section);
549 return "" unless defined $section_ptr;
550
551 # convert section number
552 my $section_num ="1". $section;
553 my $dmd_num = $section_num;
554
555 # #**output the dmdSection details
556 # if ($section_num eq "1") {
557 # $dmd_num = "0";
558 # }
559
560
561 my $all_text = "";
562
563 my $label_attr = "";
564 if ($version eq "fedora") {
565 $all_text .= "<mets:amdSec ID=\"DC\" >\n";
566 $all_text .= " <mets:techMD ID=\"DC.0\">\n"; # .0 fedora version number?
567
568 $label_attr = "LABEL=\"Dublin Core Metadata\"";
569 }
570 else {
571 # TODO::
572 #print STDERR "***** Check that GROUPID in dmdSec is valid!!!\n";
573 #print STDERR "***** Check to see if <techMD> required\n";
574 # if it isn't allowed, go back and set $mdTag = dmdSec/amdSec
575
576 $all_text .= "<mets:dmdSec ID=\"DM$dmd_num\" GROUPID=\"$section_num\">\n";
577 }
578
579 $all_text .= " <mets:mdWrap $label_attr MDTYPE=\"OTHER\" OTHERMDTYPE=\"gsdl3\" ID=\"gsdl$section_num\">\n";
580 $all_text .= " <mets:xmlData>\n";
581
582 if ($version eq "fedora") {
583 my $dc_namespace = "";
584 $dc_namespace .= "xmlns:dc=\"http://purl.org/dc/elements/1.1/\"";
585 $dc_namespace .= " xmlns:oai_dc=\"http://www.openarchives.org/OAI/2.0/oai_dc/\">\n";
586
587 $all_text .= " <oai_dc:dc $dc_namespace>\n";
588
589 $all_text .= $self->buffer_dc_section($section,"oai_dc");
590 $all_text .= " </oai_dc:dc>\n";
591 }
592 else {
593 foreach my $data (@{$section_ptr->{'metadata'}}){
594 my $escaped_value = &_escape_text($data->[1]);
595 $all_text .= ' <gsdl3:Metadata name="'. $data->[0].'">'. $escaped_value. "</gsdl3:Metadata>\n";
596 if ($data->[0] eq "dc.Title") {
597 $all_text .= ' <gsdl3:Metadata name="Title">'. $escaped_value."</gsdl3:Metadata>\n";
598 }
599 }
600 }
601
602 $all_text .= " </mets:xmlData>\n";
603 $all_text .= " </mets:mdWrap>\n";
604
605 if ($version eq "fedora") {
606 $all_text .= " </mets:techMD>\n";
607 $all_text .= "</mets:amdSec>\n";
608 }
609 else {
610 $all_text .= "</mets:dmdSec>\n";
611 }
612
613
614 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
615 $all_text .= $self->buffer_mets_dmdSection_section_xml("$section.$subsection",$version);
616 }
617
618 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
619
620 return $all_text;
621}
622
623sub output_section {
624 my $self = shift (@_);
625 my ($handle, $section) = @_;
626
627 print $handle $self->buffer_section_xml($section);
628}
629
630# print out DSpace dublin_core metadata section
631sub output_dspace_section {
632 my $self = shift (@_);
633 my ($handle, $section) = @_;
634
635 my $section_ptr = $self->_lookup_section ($section);
636 return "" unless defined $section_ptr;
637
638 my $all_text = "<Section>\n";
639 $all_text .= " <Description>\n";
640
641 # output metadata
642 foreach my $data (@{$section_ptr->{'metadata'}}) {
643 my $escaped_value = &_escape_text($data->[1]);
644 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
645 }
646
647 $all_text .= " </Description>\n";
648 $all_text .= "</Section>\n";
649
650 # make sure no nasty control characters have snuck through
651 # (XML::Parser will barf on anything it doesn't consider to be
652 # valid UTF-8 text, including things like \c@, \cC etc.)
653 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
654
655 return $all_text;
656}
657
658# print out doctxt.xml file
659sub output_txt_section {
660 my $self = shift (@_);
661 my ($handle, $section) = @_;
662
663 print $handle $self->buffer_txt_section_xml($section);
664}
665
666# print out docmets.xml file
667sub output_mets_section {
668 my $self = shift(@_);
669 my ($handle, $section, $version, $working_dir) = @_;
670
671 # print out the dmdSection
672 print $handle $self->buffer_mets_dmdSection_section_xml($section, $version);
673
674 print $handle "<mets:fileSec>\n";
675 if ($version eq "fedora") {
676 print $handle " <mets:fileGrp ID=\"DATASTREAMS\">\n";
677 }
678
679 # print out the fileSection by sections
680 print $handle $self->buffer_mets_fileSection_section_xml($section,$version);
681
682 # print out the whole fileSection
683 print $handle $self->buffer_mets_fileWhole_section_xml($section,$version,$working_dir);
684
685 if ($version eq "fedora") {
686 print $handle " </mets:fileGrp>\n";
687 }
688 print $handle "</mets:fileSec>\n";
689
690 # print out the StructMapSection by sections
691
692 my $struct_type;
693 if ($version eq "fedora") {
694 $struct_type = "fedora:dsBindingMap";
695 }
696 else {
697 $struct_type = "Section";
698 }
699
700 if ($version ne "fedora") {
701 print $handle "<mets:structMap ID=\"Section\" TYPE=\"$struct_type\" LABEL=\"Section\">\n";
702 my $order_num=0;
703 print $handle $self->buffer_mets_StructMapSection_section_xml($section, \$order_num);
704 print $handle "</mets:structMap>\n";
705
706 print $handle '<mets:structMap ID="All" TYPE="Whole Document" LABEL="All">'."\n";
707 print $handle $self->buffer_mets_StructMapWhole_section_xml($section);
708 print $handle "</mets:structMap>\n";
709 }
710
711}
712
713my $dc_set = { Title => 1,
714 Creator => 1,
715 Subject => 1,
716 Description => 1,
717 Publisher => 1,
718 Contributor => 1,
719 Date => 1,
720 Type => 1,
721 Format => 1,
722 Identifier => 1,
723 Source => 1,
724 Language => 1,
725 Relation => 1,
726 Coverage => 1,
727 Rights => 1};
728
729
730
731
732# Build up dublin_core metadata. Priority given to dc.* over ex.*
733
734sub new_buffer_dc_section {
735
736 my $self = shift(@_);
737 my ($section, $version) = @_;
738
739 # build up string of dublin core metadata
740 $section="" unless defined $section;
741
742 my $section_ptr=$self->_lookup_section($section);
743 return "" unless defined $section_ptr;
744 foreach my $data (@{$section_ptr->{'metadata'}}){
745 my $escaped_value = &_escape_text($data->[1]);
746 my $dc_element = $data->[0];
747
748 my @array = split('\.',$dc_element);
749 my ($type,$name);
750
751 if(defined $array[1])
752 {
753 $type = $array[0];
754 $name = $array[1];
755 }
756 else
757 {
758 $type = "ex";
759 $name = $array[0];
760 }
761
762 $all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
763 }
764 return $all_text;
765}
766
767
768sub buffer_dc_section {
769 my $self = shift(@_);
770 my ($section, $version) = @_;
771
772 # build up string of dublin core metadata
773 $section="" unless defined $section;
774
775 my $section_ptr=$self->_lookup_section($section);
776 return "" unless defined $section_ptr;
777
778
779 my $explicit_dc = {};
780 my $explicit_ex = {};
781
782 my $all_text="";
783 foreach my $data (@{$section_ptr->{'metadata'}}){
784 foreach my $temp (@$data)
785 {
786 print "($temp) ";
787 }
788 print "\n";
789 my $escaped_value = &_escape_text($data->[1]);
790 if ($data->[0]=~ m/^dc\./) {
791 $data->[0] =~ tr/[A-Z]/[a-z]/;
792
793 $data->[0] =~ m/^dc\.(.*)/;
794 my $dc_element = $1;
795
796 if (!defined $explicit_dc->{$dc_element}) {
797 $explicit_dc->{$dc_element} = [];
798 }
799 push(@{$explicit_dc->{$dc_element}},$escaped_value);
800
801 #$all_text .= ' <dcvalue element="'. $data->[0].'" qualifier="#####">'. $escaped_value. "</dcvalue>\n";
802 if (defined $version && ($version eq "oai_dc")) {
803 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
804 }
805 else {
806 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
807 }
808
809 }
810 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) {
811 $data->[0] =~ m/^(ex\.)?(.*)/;
812 my $ex_element = $2;
813 my $lc_ex_element = lc($ex_element);
814
815 if (defined $dc_set->{$ex_element}) {
816 if (!defined $explicit_ex->{$lc_ex_element}) {
817 $explicit_ex->{$lc_ex_element} = [];
818 }
819 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
820 }
821 }
822 }
823
824 # go through dc_set and for any element *not* defined in explicit_dc
825 # that do exist in explicit_ex, add it in as metadata
826 foreach my $k ( keys %$dc_set ) {
827 my $lc_k = lc($k);
828
829 if (!defined $explicit_dc->{$lc_k}) {
830 if (defined $explicit_ex->{$lc_k}) {
831
832 foreach my $v (@{$explicit_ex->{$lc_k}}) {
833 my $dc_element = $lc_k;
834 my $escaped_value = $v;
835
836 if (defined $version && ($version eq "oai_dc")) {
837 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
838 }
839 else {
840 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
841 }
842
843 }
844 }
845 }
846 }
847
848 if ($all_text eq "") {
849 $all_text .= " There is no Dublin Core metatdata in this document\n";
850 }
851 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
852
853 return $all_text;
854}
855
856
857# Print out dublin_core metadata
858sub output_dc_section {
859 my $self = shift(@_);
860 my ($handle, $section, $version) = @_;
861
862 #my $all_text = $self->buffer_dc_section($section,$version);
863 my $all_text = $self->new_buffer_dc_section($section,$version);
864
865 print $handle $all_text;
866}
867
868
869# look up the reference to the a particular section
870sub _lookup_section {
871 my $self = shift (@_);
872 my ($section) = @_;
873
874 my ($num);
875 my $sectionref = $self;
876
877 while (defined $section && $section ne "") {
878
879 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
880
881 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
882
883 $section = "" unless defined $section;
884
885
886 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
887 $sectionref = $sectionref->{'subsections'}->{$num};
888 } else {
889 return undef;
890 }
891 }
892
893 return $sectionref;
894}
895
896# calculate OID by hashing the contents of the document
897sub _calc_OID {
898 my $self = shift (@_);
899 my ($filename) = @_;
900
901 my $osexe = &util::get_os_exe();
902
903 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
904 $ENV{'GSDLOS'},"hashfile$osexe");
905
906 my $result = "NULL";
907
908 if (-e "$hashfile_exe") {
909# $result = `\"$hashfile_exe\" \"$filename\"`;
910 $result = `hashfile$osexe \"$filename\"`;
911 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
912
913 } else {
914 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
915 }
916 return "HASH$result";
917}
918
919# methods dealing with OID, not groups of them.
920
921# if $OID is not provided one is calculated
922sub set_OID {
923 my $self = shift (@_);
924 my ($OID) = @_;
925
926 my $use_hash_oid = 0;
927 # if an OID wasn't provided claculate one
928 if (!defined $OID) {
929 $OID = "NULL";
930 if ($self->{'OIDtype'} eq "hash") {
931 $use_hash_oid = 1;
932 } elsif ($self->{'OIDtype'} eq "incremental") {
933 $OID = "D" . $OIDcount;
934 $OIDcount ++;
935
936 } elsif ($self->{'OIDtype'} eq "dirname") {
937 $OID = 'J';
938 my $filename = $self->get_source_filename();
939 if (defined($filename)) { # && -e $filename) {
940 $OID = &File::Basename::dirname($filename);
941 if (defined $OID) {
942 $OID = 'J'.&File::Basename::basename($OID);
943 $OID =~ s/\.//; #remove any periods
944 } else {
945 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
946 $use_hash_oid = 1;
947 }
948 } else {
949 print STDERR "Failed to find filename, generating hash id\n";
950 $use_hash_oid = 1;
951 }
952
953 } elsif ($self->{'OIDtype'} eq "assigned") {
954 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
955 if (defined $identifier && $identifier ne "") {
956 $OID = "D" . $identifier;
957 $OID =~ s/\.//; #remove any periods
958 } else {
959 # need a hash id
960 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
961 $use_hash_oid = 1;
962 }
963
964 } else {
965 $use_hash_oid = 1;
966 }
967
968 if ($use_hash_oid) {
969
970 # "hash" OID - feed file to hashfile.exe
971 #my $filename = $self->get_source_filename();
972 # we want to use the converted file for hashing if available
973 # cos its quicker
974 my $filename = $self->get_filename_for_hashing();
975 # -z: don't want to hash on the file if it is zero size
976 if (defined($filename) && -e $filename && !-z $filename) {
977 $OID = $self->_calc_OID ($filename);
978 } else {
979 $filename = &util::get_tmp_filename();
980 if (!open (OUTFILE, ">$filename")) {
981 print STDERR "doc::set_OID could not write to $filename\n";
982 } else {
983 $self->output_section('OUTFILE', $self->get_top_section(),
984 undef, 1);
985 close (OUTFILE);
986 }
987 $OID = $self->_calc_OID ($filename);
988 &util::rm ($filename);
989 }
990 }
991 }
992 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
993}
994
995# this uses hashdoc (embedded c thingy) which is faster but still
996# needs a little work to be suffiently stable
997sub ___set_OID {
998 my $self = shift (@_);
999 my ($OID) = @_;
1000
1001 # if an OID wasn't provided then calculate hash value based on document
1002 if (!defined $OID)
1003 {
1004 my $hash_text = $self->buffer_section_gml($self->get_top_section(),
1005 undef, 1);
1006 my $hash_len = length($hash_text);
1007
1008 $OID = &hashdoc::buffer($hash_text,$hash_len);
1009 }
1010
1011 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
1012}
1013
1014# returns the OID for this document
1015sub get_OID {
1016 my $self = shift (@_);
1017 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
1018 return $OID if (defined $OID);
1019 return "NULL";
1020}
1021
1022sub delete_OID {
1023 my $self = shift (@_);
1024
1025 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
1026}
1027
1028
1029# methods for manipulating section names
1030
1031# returns the name of the top-most section (the top
1032# level of the document
1033sub get_top_section {
1034 my $self = shift (@_);
1035
1036 return "";
1037}
1038
1039# returns a section
1040sub get_parent_section {
1041 my $self = shift (@_);
1042 my ($section) = @_;
1043
1044 $section =~ s/(^|\.)\d+$//;
1045
1046 return $section;
1047}
1048
1049# returns the first child section (or the end child
1050# if there isn't any)
1051sub get_begin_child {
1052 my $self = shift (@_);
1053 my ($section) = @_;
1054
1055 my $section_ptr = $self->_lookup_section($section);
1056 return "" unless defined $section_ptr;
1057
1058 if (defined $section_ptr->{'subsection_order'}->[0]) {
1059 return "$section.$section_ptr->{'subsection_order'}->[0]";
1060 }
1061
1062 return $self->get_end_child ($section);
1063}
1064
1065# returns the next child of a parent section
1066sub get_next_child {
1067 my $self = shift (@_);
1068 my ($section) = @_;
1069
1070 my $parent_section = $self->get_parent_section($section);
1071 my $parent_section_ptr = $self->_lookup_section($parent_section);
1072 return undef unless defined $parent_section_ptr;
1073
1074 my ($section_num) = $section =~ /(\d+)$/;
1075 return undef unless defined $section_num;
1076
1077 my $i = 0;
1078 my $section_order = $parent_section_ptr->{'subsection_order'};
1079 while ($i < scalar(@$section_order)) {
1080 last if $section_order->[$i] eq $section_num;
1081 $i++;
1082 }
1083
1084 $i++; # the next child
1085 if ($i < scalar(@$section_order)) {
1086 return $section_order->[$i] if $parent_section eq "";
1087 return "$parent_section.$section_order->[$i]";
1088 }
1089
1090 # no more sections in this level
1091 return undef;
1092}
1093
1094# returns a reference to a list of children
1095sub get_children {
1096 my $self = shift (@_);
1097 my ($section) = @_;
1098
1099 my $section_ptr = $self->_lookup_section($section);
1100 return [] unless defined $section_ptr;
1101
1102 my @children = @{$section_ptr->{'subsection_order'}};
1103
1104 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
1105 return \@children;
1106}
1107
1108# returns the child section one past the last one (which
1109# is coded as "0")
1110sub get_end_child {
1111 my $self = shift (@_);
1112 my ($section) = @_;
1113
1114 return $section . ".0" unless $section eq "";
1115 return "0";
1116}
1117
1118# returns the next section in book order
1119sub get_next_section {
1120 my $self = shift (@_);
1121 my ($section) = @_;
1122
1123 return undef unless defined $section;
1124
1125 my $section_ptr = $self->_lookup_section($section);
1126 return undef unless defined $section_ptr;
1127
1128 # first try to find first child
1129 if (defined $section_ptr->{'subsection_order'}->[0]) {
1130 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
1131 return "$section.$section_ptr->{'subsection_order'}->[0]";
1132 }
1133
1134 do {
1135 # try to find sibling
1136 my $next_child = $self->get_next_child ($section);
1137 return $next_child if (defined $next_child);
1138
1139 # move up one level
1140 $section = $self->get_parent_section ($section);
1141 } while $section =~ /\d/;
1142
1143 return undef;
1144}
1145
1146sub is_leaf_section {
1147 my $self = shift (@_);
1148 my ($section) = @_;
1149
1150 my $section_ptr = $self->_lookup_section($section);
1151 return 1 unless defined $section_ptr;
1152
1153 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
1154}
1155
1156# methods for dealing with sections
1157
1158# returns the name of the inserted section
1159sub insert_section {
1160 my $self = shift (@_);
1161 my ($before_section) = @_;
1162
1163 # get the child to insert before and its parent section
1164 my $parent_section = "";
1165 my $before_child = "0";
1166 my @before_section = split (/\./, $before_section);
1167 if (scalar(@before_section) > 0) {
1168 $before_child = pop (@before_section);
1169 $parent_section = join (".", @before_section);
1170 }
1171
1172 my $parent_section_ptr = $self->_lookup_section($parent_section);
1173 if (!defined $parent_section_ptr) {
1174 print STDERR "doc::insert_section couldn't find parent section " .
1175 "$parent_section\n";
1176 return;
1177 }
1178
1179 # get the next section number
1180 my $section_num = $parent_section_ptr->{'next_subsection'}++;
1181
1182 my $i = 0;
1183 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
1184 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
1185 $i++;
1186 }
1187
1188 # insert the section number into the order list
1189 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
1190
1191 # add this section to the parent section
1192 my $section_ptr = {'subsection_order'=>[],
1193 'next_subsection'=>1,
1194 'subsections'=>{},
1195 'metadata'=>[],
1196 'text'=>""};
1197 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
1198
1199 # work out the full section number
1200 my $section = $parent_section;
1201 $section .= "." unless $section eq "";
1202 $section .= $section_num;
1203
1204 return $section;
1205}
1206
1207# creates a pre-named section
1208sub create_named_section {
1209 my $self = shift (@_);
1210 my ($mastersection) = @_;
1211
1212 my ($num);
1213 my $section = $mastersection;
1214 my $sectionref = $self;
1215
1216 while ($section ne "") {
1217 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
1218 $num =~ s/^0+(\d)/$1/; # remove leading 0s
1219 $section = "" unless defined $section;
1220
1221 if (defined $num) {
1222 if (!defined $sectionref->{'subsections'}->{$num}) {
1223 push (@{$sectionref->{'subsection_order'}}, $num);
1224 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
1225 'next_subsection'=>1,
1226 'subsections'=>{},
1227 'metadata'=>[],
1228 'text'=>""};
1229 if ($num >= $sectionref->{'next_subsection'}) {
1230 $sectionref->{'next_subsection'} = $num + 1;
1231 }
1232 }
1233 $sectionref = $sectionref->{'subsections'}->{$num};
1234
1235 } else {
1236 print STDERR "doc::create_named_section couldn't create section ";
1237 print STDERR "$mastersection\n";
1238 last;
1239 }
1240 }
1241}
1242
1243# returns a reference to a list of subsections
1244sub list_subsections {
1245 my $self = shift (@_);
1246 my ($section) = @_;
1247
1248 my $section_ptr = $self->_lookup_section ($section);
1249 if (!defined $section_ptr) {
1250 print STDERR "doc::list_subsections couldn't find section $section\n";
1251 return [];
1252 }
1253
1254 return [@{$section_ptr->{'subsection_order'}}];
1255}
1256
1257sub delete_section {
1258 my $self = shift (@_);
1259 my ($section) = @_;
1260
1261# my $section_ptr = {'subsection_order'=>[],
1262# 'next_subsection'=>1,
1263# 'subsections'=>{},
1264# 'metadata'=>[],
1265# 'text'=>""};
1266
1267 # if this is the top section reset everything
1268 if ($section eq "") {
1269 $self->{'subsection_order'} = [];
1270 $self->{'subsections'} = {};
1271 $self->{'metadata'} = [];
1272 $self->{'text'} = "";
1273 return;
1274 }
1275
1276 # find the parent of the section to delete
1277 my $parent_section = "";
1278 my $child = "0";
1279 my @section = split (/\./, $section);
1280 if (scalar(@section) > 0) {
1281 $child = pop (@section);
1282 $parent_section = join (".", @section);
1283 }
1284
1285 my $parent_section_ptr = $self->_lookup_section($parent_section);
1286 if (!defined $parent_section_ptr) {
1287 print STDERR "doc::delete_section couldn't find parent section " .
1288 "$parent_section\n";
1289 return;
1290 }
1291
1292 # remove this section from the subsection_order list
1293 my $i = 0;
1294 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
1295 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
1296 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
1297 last;
1298 }
1299 $i++;
1300 }
1301
1302 # remove this section from the subsection hash
1303 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
1304 undef $parent_section_ptr->{'subsections'}->{$child};
1305 }
1306}
1307
1308#--
1309# methods for dealing with metadata
1310
1311# set_metadata_element and get_metadata_element are for metadata
1312# which should only have one value. add_meta_data and get_metadata
1313# are for metadata which can have more than one value.
1314
1315# returns the first metadata value which matches field
1316
1317# This version of get metadata element works much like the one above,
1318# except it allows for the namespace portion of a metadata element to
1319# be ignored, thus if you are searching for dc.Title, the first piece
1320# of matching metadata ending with the name Title (once any namespace
1321# is removed) would be returned.
1322# 28-11-2003 John Thompson
1323sub get_metadata_element {
1324 my $self = shift (@_);
1325 my ($section, $field, $ignore_namespace) = @_;
1326 my ($data);
1327
1328 $ignore_namespace = 0 unless defined $ignore_namespace;
1329
1330 my $section_ptr = $self->_lookup_section($section);
1331 if (!defined $section_ptr) {
1332 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
1333 return;
1334 }
1335
1336 # Remove the any namespace if we are being told to ignore them
1337 if($ignore_namespace) {
1338 $field =~ s/^\w*\.//;
1339 }
1340
1341 foreach $data (@{$section_ptr->{'metadata'}}) {
1342
1343 my $data_name = $data->[0];
1344 # Remove the any namespace if we are being told to ignore them
1345 if($ignore_namespace) {
1346 $data_name =~ s/^\w*\.//;
1347 }
1348
1349 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
1350 }
1351
1352 return undef; # was not found
1353}
1354
1355# returns a list of the form [value1, value2, ...]
1356sub get_metadata {
1357 my $self = shift (@_);
1358 my ($section, $field, $ignore_namespace) = @_;
1359 my ($data);
1360
1361 $ignore_namespace = 0 unless defined $ignore_namespace;
1362
1363 my $section_ptr = $self->_lookup_section($section);
1364 if (!defined $section_ptr) {
1365 print STDERR "doc::get_metadata couldn't find section ",
1366 $section, "\n";
1367 return;
1368 }
1369
1370 # Remove the any namespace if we are being told to ignore them
1371 if($ignore_namespace) {
1372 $field =~ s/^\w*\.//;
1373 }
1374
1375 my @metadata = ();
1376 foreach $data (@{$section_ptr->{'metadata'}}) {
1377
1378 my $data_name = $data->[0];
1379 # Remove the any namespace if we are being told to ignore them
1380 if($ignore_namespace) {
1381 $data_name =~ s/^\w*\.//;
1382 }
1383
1384 push (@metadata, $data->[1]) if ($data_name eq $field);
1385 }
1386
1387 return \@metadata;
1388}
1389
1390# returns a list of the form [[field,value],[field,value],...]
1391sub get_all_metadata {
1392 my $self = shift (@_);
1393 my ($section) = @_;
1394
1395 my $section_ptr = $self->_lookup_section($section);
1396 if (!defined $section_ptr) {
1397 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
1398 return;
1399 }
1400
1401 return $section_ptr->{'metadata'};
1402}
1403
1404# returns a xml element of the form <MetadataList><Metadata name="metadata-name">metadata_value</Metadata>...</MetadataList>
1405sub get_top_metadata_list{
1406 my $self = shift (@_);
1407
1408 my @topmetadata =$self->get_all_metadata($self->get_top_section());
1409 my $metadatalist ='<MetadataList>';
1410
1411 foreach my $i (@topmetadata){
1412 foreach my $j (@$i){
1413 my %metaMap = @$j;
1414 foreach my $key (keys %metaMap){
1415 $metadatalist .='<Metadata name='."\"$key\"".'>'.$metaMap{$key}.'</Metadata>'."\n";
1416 }
1417 }
1418 }
1419
1420 $metadatalist .='</MetadataList>';
1421
1422 return $metadatalist;
1423}
1424
1425
1426# $value is optional
1427sub delete_metadata {
1428 my $self = shift (@_);
1429 my ($section, $field, $value) = @_;
1430
1431 my $section_ptr = $self->_lookup_section($section);
1432 if (!defined $section_ptr) {
1433 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
1434 return;
1435 }
1436
1437 my $i = 0;
1438 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1439 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1440 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1441 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1442 } else {
1443 $i++;
1444 }
1445 }
1446}
1447
1448sub delete_all_metadata {
1449 my $self = shift (@_);
1450 my ($section) = @_;
1451
1452 my $section_ptr = $self->_lookup_section($section);
1453 if (!defined $section_ptr) {
1454 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1455 return;
1456 }
1457
1458 $section_ptr->{'metadata'} = [];
1459}
1460
1461sub set_metadata_element {
1462 my $self = shift (@_);
1463 my ($section, $field, $value) = @_;
1464
1465 $self->set_utf8_metadata_element ($section, $field,
1466 &unicode::ascii2utf8(\$value));
1467}
1468
1469# set_utf8_metadata_element assumes the text has already been
1470# converted to the UTF-8 encoding.
1471sub set_utf8_metadata_element {
1472 my $self = shift (@_);
1473 my ($section, $field, $value) = @_;
1474
1475 $self->delete_metadata ($section, $field);
1476 $self->add_utf8_metadata ($section, $field, $value);
1477}
1478
1479
1480# add_metadata assumes the text is in (extended) ascii form. For
1481# text which has already been converted to the UTF-8 format use
1482# add_utf8_metadata.
1483sub add_metadata {
1484 my $self = shift (@_);
1485 my ($section, $field, $value) = @_;
1486
1487 $self->add_utf8_metadata ($section, $field,
1488 &unicode::ascii2utf8(\$value));
1489}
1490
1491sub add_utf8_metadata {
1492 my $self = shift (@_);
1493 my ($section, $field, $value) = @_;
1494
1495 my $section_ptr = $self->_lookup_section($section);
1496 if (!defined $section_ptr) {
1497 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1498 return;
1499 }
1500 if (!defined $value) {
1501 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1502 return;
1503 }
1504 if (!defined $field) {
1505 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1506 return;
1507 }
1508
1509 #print STDERR "###$field=$value\n";
1510 # double check that the value is utf-8
1511 if (unicode::ensure_utf8(\$value)) {
1512 print STDERR "doc::add_utf8_metadata: warning: '$field' wasn't utf8\n";
1513 }
1514
1515 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1516}
1517
1518
1519# methods for dealing with text
1520
1521# returns the text for a section
1522sub get_text {
1523 my $self = shift (@_);
1524 my ($section) = @_;
1525
1526 my $section_ptr = $self->_lookup_section($section);
1527 if (!defined $section_ptr) {
1528 print STDERR "doc::get_text couldn't find section " .
1529 "$section\n";
1530 return "";
1531 }
1532
1533 return $section_ptr->{'text'};
1534}
1535
1536# returns the (utf-8 encoded) length of the text for a section
1537sub get_text_length {
1538 my $self = shift (@_);
1539 my ($section) = @_;
1540
1541 my $section_ptr = $self->_lookup_section($section);
1542 if (!defined $section_ptr) {
1543 print STDERR "doc::get_text_length couldn't find section " .
1544 "$section\n";
1545 return 0;
1546 }
1547
1548 return length ($section_ptr->{'text'});
1549}
1550
1551sub delete_text {
1552 my $self = shift (@_);
1553 my ($section) = @_;
1554
1555 my $section_ptr = $self->_lookup_section($section);
1556 if (!defined $section_ptr) {
1557 print STDERR "doc::delete_text couldn't find section " .
1558 "$section\n";
1559 return;
1560 }
1561
1562 $section_ptr->{'text'} = "";
1563}
1564
1565# add_text assumes the text is in (extended) ascii form. For
1566# text which has been already converted to the UTF-8 format
1567# use add_utf8_text.
1568sub add_text {
1569 my $self = shift (@_);
1570 my ($section, $text) = @_;
1571
1572 # convert the text to UTF-8 encoded unicode characters
1573 # and add the text
1574 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1575}
1576
1577
1578# add_utf8_text assumes the text to be added has already
1579# been converted to the UTF-8 encoding. For ascii text use
1580# add_text
1581sub add_utf8_text {
1582 my $self = shift (@_);
1583 my ($section, $text) = @_;
1584
1585 my $section_ptr = $self->_lookup_section($section);
1586 if (!defined $section_ptr) {
1587 print STDERR "doc::add_utf8_text couldn't find section " .
1588 "$section\n";
1589 return;
1590 }
1591
1592 $section_ptr->{'text'} .= $text;
1593}
1594
1595
1596# methods for dealing with associated files
1597
1598# a file is associated with a document, NOT a section.
1599# if section is defined it is noted in the data structure
1600# only so that files associated from a particular section
1601# may be removed later (using delete_section_assoc_files)
1602sub associate_file {
1603 my $self = shift (@_);
1604 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1605 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1606
1607 # remove all associated files with the same name
1608 $self->delete_assoc_file ($assoc_filename);
1609
1610 push (@{$self->{'associated_files'}},
1611 [$real_filename, $assoc_filename, $mime_type, $section]);
1612}
1613
1614# returns a list of associated files in the form
1615# [[real_filename, assoc_filename, mimetype], ...]
1616sub get_assoc_files {
1617 my $self = shift (@_);
1618
1619 return $self->{'associated_files'};
1620}
1621
1622sub delete_section_assoc_files {
1623 my $self = shift (@_);
1624 my ($section) = @_;
1625
1626 my $i=0;
1627 while ($i < scalar (@{$self->{'associated_files'}})) {
1628 if (defined $self->{'associated_files'}->[$i]->[3] &&
1629 $self->{'associated_files'}->[$i]->[3] eq $section) {
1630 splice (@{$self->{'associated_files'}}, $i, 1);
1631 } else {
1632 $i++;
1633 }
1634 }
1635}
1636
1637sub delete_assoc_file {
1638 my $self = shift (@_);
1639 my ($assoc_filename) = @_;
1640
1641 my $i=0;
1642 while ($i < scalar (@{$self->{'associated_files'}})) {
1643 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1644 splice (@{$self->{'associated_files'}}, $i, 1);
1645 } else {
1646 $i++;
1647 }
1648 }
1649}
1650
1651sub reset_nextsection_ptr {
1652 my $self = shift (@_);
1653 my ($section) = @_;
1654
1655 my $section_ptr = $self->_lookup_section($section);
1656 $section_ptr->{'next_subsection'} = 1;
1657}
1658
16591;
Note: See TracBrowser for help on using the repository browser.