source: trunk/gsdl/perllib/doc.pm@ 12601

Last change on this file since 12601 was 12448, checked in by kjdon, 18 years ago

changed output_dc_section to use the old buffer_dc_section not the new new_buffer_dc_section which was added by Jeffrey but we don't know why

  • Property svn:executable set to *
  • Property svn:keywords set to Author Date Id Revision
File size: 44.2 KB
Line 
1###########################################################################
2#
3# doc.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistr te it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# base class to hold documents
27
28package doc;
29eval {require bytes};
30
31BEGIN {
32 die "GSDLHOME not set\n" unless defined $ENV{'GSDLHOME'};
33 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/dynamic/lib/site_perl/5.005/i686-linux");
34}
35
36use unicode;
37use util;
38use ghtml;
39use File::stat;
40##use hashdoc;
41
42# the document type may be indexed_doc, nonindexed_doc, or
43# classification
44
45my $OIDcount = 0;
46
47sub new {
48 my $class = shift (@_);
49 my ($source_filename, $doc_type) = @_;
50
51 my $self = bless {'associated_files'=>[],
52 'subsection_order'=>[],
53 'next_subsection'=>1,
54 'subsections'=>{},
55 'metadata'=>[],
56 'text'=>"",
57 'OIDtype'=>"hash"}, $class;
58
59 # used to set lastmodified here, but this can screw up the HASH ids, so
60 # the docsave processor now calls set_lastmodified
61
62 if (defined $source_filename) {
63 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
64
65 if (defined $collect_dir) {
66 my $dirsep = &util::get_dirsep();
67 if ($collect_dir !~ m/$dirsep$/) {
68 $collect_dir .= $dirsep;
69 }
70
71 $collect_dir =~ s/\\/\\\\/g; # escape DOS style file separator
72
73 # if from within GSDLCOLLECTDIR, then remove directory prefix
74 # so source_filename is realative to it. This is done to aid
75 # portability, i.e. the collection can be moved to somewhere
76 # else on the file system and the archives directory will still
77 # work. This is needed, for example in the applet version of
78 # GLI where GSDLHOME/collect on the server will be different to
79 # the collect directory of the remove user. Of course,
80 # GSDLCOLLECTDIR subsequently needs to be put back on to turn
81 # it back into a full pathname.
82
83 if ($source_filename =~ /^$collect_dir(.*)$/) {
84 $source_filename = $1;
85 }
86 }
87
88 $self->set_source_filename ($source_filename);
89 }
90
91 $self->set_doc_type ($doc_type) if defined $doc_type;
92
93 return $self;
94}
95# set lastmodified for OAI purposes, added by GRB, moved by kjdon
96sub set_lastmodified {
97 my $self = shift (@_);
98
99 my $source_filename = $self->get_source_filename();
100 if ((defined $self->get_metadata_element ($self->get_top_section(), "gsdldoctype")) &&
101 (defined $source_filename) && (-e $source_filename)) {
102 my $file_stat = stat($source_filename);
103 my $mtime = $file_stat->mtime;
104 $self->add_utf8_metadata($self->get_top_section(), "lastmodified", $file_stat->mtime);
105 }
106}
107
108# clone the $self object
109sub duplicate {
110 my $self = shift (@_);
111
112 my $newobj = {};
113
114 foreach my $k (keys %$self) {
115 $newobj->{$k} = &clone ($self->{$k});
116 }
117
118 bless $newobj, ref($self);
119 return $newobj;
120}
121
122sub clone {
123 my ($from) = @_;
124 my $type = ref ($from);
125
126 if ($type eq "HASH") {
127 my $to = {};
128 foreach my $key (keys %$from) {
129 $to->{$key} = &clone ($from->{$key});
130 }
131 return $to;
132 } elsif ($type eq "ARRAY") {
133 my $to = [];
134 foreach my $v (@$from) {
135 push (@$to, &clone ($v));
136 }
137 return $to;
138 } else {
139 return $from;
140 }
141}
142
143sub set_OIDtype {
144 my $self = shift (@_);
145 my ($type, $metadata) = @_;
146
147 if (defined $type && $type =~ /^(hash|incremental|dirname|assigned)$/) {
148 $self->{'OIDtype'} = $type;
149 } else {
150 $self->{'OIDtype'} = "hash";
151 }
152 if ($type =~ /^assigned$/) {
153 if (defined $metadata) {
154 $self->{'OIDmetadata'} = $metadata;
155 } else {
156 $self->{'OIDmetadata'} = "dc.Identifier";
157 }
158 }
159}
160
161sub set_source_filename {
162 my $self = shift (@_);
163 my ($source_filename) = @_;
164
165 $self->set_metadata_element ($self->get_top_section(),
166 "gsdlsourcefilename",
167 $source_filename);
168}
169
170sub set_converted_filename {
171 my $self = shift (@_);
172 my ($converted_filename) = @_;
173
174 $self->set_metadata_element ($self->get_top_section(),
175 "gsdlconvertedfilename",
176 $converted_filename);
177}
178
179
180# returns the source_filename as it was provided
181sub get_source_filename {
182 my $self = shift (@_);
183
184 return $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
185}
186
187# returns converted filename if available else returns source filename
188sub get_filename_for_hashing {
189 my $self = shift (@_);
190
191 my $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlconvertedfilename");
192
193 if (!defined $filename) {
194 my $plugin_name = $self->get_metadata_element ($self->get_top_section(), "Plugin");
195 # if NULPlug processed file, then don't give a filename
196 if (defined $plugin_name && $plugin_name eq "NULPlug") {
197 $filename = undef;
198 } else {
199 $filename = $self->get_metadata_element ($self->get_top_section(), "gsdlsourcefilename");
200 }
201 }
202 return $filename;
203}
204
205sub set_doc_type {
206 my $self = shift (@_);
207 my ($doc_type) = @_;
208
209 $self->set_metadata_element ($self->get_top_section(),
210 "gsdldoctype",
211 $doc_type);
212}
213
214# returns the gsdldoctype as it was provided
215# the default of "indexed_doc" is used if no document
216# type was provided
217sub get_doc_type {
218 my $self = shift (@_);
219
220 my $doc_type = $self->get_metadata_element ($self->get_top_section(), "gsdldoctype");
221 return $doc_type if (defined $doc_type);
222 return "indexed_doc";
223}
224
225sub _escape_text {
226 my ($text) = @_;
227 # special characters in the gml encoding
228 $text =~ s/&/&/g; # this has to be first...
229 $text =~ s/</&lt;/g;
230 $text =~ s/>/&gt;/g;
231 $text =~ s/\"/&quot;/g;
232
233 return $text;
234}
235
236sub buffer_section_xml {
237 my $self = shift (@_);
238 my ($section) = @_;
239
240 my $section_ptr = $self->_lookup_section ($section);
241 return "" unless defined $section_ptr;
242
243 my $all_text = "<Section>\n";
244 $all_text .= " <Description>\n";
245
246 # output metadata
247 foreach my $data (@{$section_ptr->{'metadata'}}) {
248 my $escaped_value = &_escape_text($data->[1]);
249 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
250 }
251
252 $all_text .= " </Description>\n";
253
254 # output the text
255 $all_text .= " <Content>";
256 $all_text .= &_escape_text($section_ptr->{'text'});
257 $all_text .= "</Content>\n";
258
259 # output all the subsections
260 foreach my $subsection (@{$section_ptr->{'subsection_order'}}) {
261 $all_text .= $self->buffer_section_xml("$section.$subsection");
262 }
263
264 $all_text .= "</Section>\n";
265
266 # make sure no nasty control characters have snuck through
267 # (XML::Parser will barf on anything it doesn't consider to be
268 # valid UTF-8 text, including things like \c@, \cC etc.)
269 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
270
271 return $all_text;
272}
273
274sub buffer_txt_section_xml {
275 my $self = shift(@_);
276 my ($section) = @_;
277
278 my $section_ptr = $self->_lookup_section ($section);
279
280 return "" unless defined $section_ptr;
281
282 my $all_text = "<Section>\n";
283
284 ##output the text
285 #$all_text .= " <Content>";
286 $all_text .= &_escape_text($section_ptr->{'text'});
287 #$all_text .= " </Content>\n";
288
289
290 #output all the subsections
291 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
292 $all_text .= $self->buffer_txt_section_xml("$section.$subsection");
293 }
294
295 $all_text .= "</Section>\n";
296
297
298 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
299 return $all_text;
300}
301
302sub buffer_mets_fileSection_section_xml() {
303 my $self = shift(@_);
304 my ($section,$version) = @_;
305
306 #$section="" unless defined $section;
307
308
309 my $section_ptr=$self->_lookup_section($section);
310 return "" unless defined $section_ptr;
311
312
313 # output fileSection by sections
314 my $section_num ="1". $section;
315
316 my $filePath = 'doctxt.xml';
317
318 my $opt_owner_id = "";
319 if ($version eq "fedora") {
320 $opt_owner_id = "OWNERID=\"M\"";
321 }
322
323 # output the fileSection details
324 my $all_text = ' <mets:fileGrp ID="FILEGROUP_PRELUDE' . $section_num . '">'. "\n";
325 $all_text .= " <mets:file MIMETYPE=\"text/xml\" ID=\"FILE$section_num\" $opt_owner_id >\n";
326 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$filePath.'#xpointer(/Section[';
327
328 my $xpath = "1".$section;
329 $xpath =~ s/\./]\/Section[/g;
330
331 $all_text .= $xpath;
332
333 $all_text .= ']/text())" xlink:title="Hierarchical Document Structure"/>' . "\n";
334 $all_text .= " </mets:file>\n";
335 $all_text .= " </mets:fileGrp>\n";
336
337
338 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
339 $all_text .= $self->buffer_mets_fileSection_section_xml("$section.$subsection",$version);
340 }
341
342 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
343
344 return $all_text;
345}
346
347sub buffer_mets_fileWhole_section_xml(){
348 my $self = shift(@_);
349 my ($section,$version,$working_dir) = @_;
350
351 my $section_ptr = $self-> _lookup_section($section);
352 return "" unless defined $section_ptr;
353
354 my $all_text="" unless defined $all_txt;
355
356 my $fileID=0;
357
358 # Output the fileSection for the whole section
359 # => get the sourcefile and associative file
360
361 my $id_root = "";
362 my $opt_owner_id = "";
363
364 if ($version eq "fedora") {
365 $opt_owner_id = "OWNERID=\"M\"";
366 }
367 else {
368 $id_root = "default";
369 }
370
371 if ($version ne "fedora") {
372 $all_text .= " <mets:fileGrp ID=\"$id_root\">\n";
373 }
374
375 foreach my $data (@{$section_ptr->{'metadata'}}){
376 my $escaped_value = &_escape_text($data->[1]);
377
378 if (($data->[0] eq "gsdlsourcefilename") && ($version ne "fedora")) {
379 my ($dirPath) = $escaped_value =~ m/^(.*)[\/\\][^\/\\]*$/;
380
381 ++$fileID;
382 $all_text .= " <mets:file MIMETYPE=\"text/xml\" ID=\"$id_root.$fileID\" $opt_owner_id >\n";
383
384 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="file:'.$data->[1].'" />'."\n";
385
386 $all_text .= " </mets:file>\n";
387 }
388
389 if ($data->[0] eq "gsdlassocfile"){
390
391 $escaped_value =~ m/^(.*?):(.*):(.*)$/;
392 my $assoc_file = $1;
393 my $mime_type = $2;
394 my $assoc_dir = $3;
395
396 if ($version eq "fedora") {
397 $id_root = $assoc_file;
398 $id_root =~ s/\//_/g;
399 $all_text .= " <mets:fileGrp ID=\"$id_root\">\n";
400 }
401
402 my $assfilePath = ($assoc_dir eq "") ? $assoc_file : "$assoc_dir/$assoc_file";
403 ++$fileID;
404
405 my $mime_attr = "MIMETYPE=\"$mime_type\"";
406 my $xlink_title = "xlink:title=\"$assoc_file\"";
407
408 my $id_attr;
409 my $xlink_href;
410
411 if ($version eq "fedora") {
412 $id_attr = "ID=\"$id_root.0\"";
413
414 my $fedora_prefix = $ENV{'FEDORA_PREFIX'};
415 if (!defined $fedora_prefix) {
416 $xlink_href = "xlink:href=\"$assfilePath\"";
417 }
418 else
419 {
420 my $gsdlhome = $ENV{'GSDLHOME'};
421 my $gsdl_href = "$working_dir/$assfilePath";
422
423 $gsdl_href =~ s/^$gsdlhome(\/)?//;
424 $gsdl_href = "/gsdl/$gsdl_href";
425
426 my $fserver = $ENV{'FEDORA_HOSTNAME'};
427 my $fport = $ENV{'FEDORA_SERVER_PORT'};
428
429 my $fdomain = "http://$fserver:$fport";
430 $xlink_href = "xlink:href=\"$fdomain$gsdl_href\"";
431 }
432
433 my $top_section = $self->get_top_section();
434 my $id = $self->get_metadata_element($top_section,"Identifier");
435 }
436 else {
437 $id_attr = "ID=\"$id_root.$fileID\"";
438 $xlink_href = "xlink:href=\"$assfilePath\"";
439 }
440
441 $all_text .= " <mets:file $mime_attr $id_attr $opt_owner_id >\n";
442 $all_text .= " <mets:FLocat LOCTYPE=\"URL\" $xlink_href $xlink_title />\n";
443
444 $all_text .= " </mets:file>\n";
445
446 if ($version eq "fedora") {
447 $all_text .= " </mets:fileGrp>\n";
448 }
449
450 }
451 }
452
453 if ($version ne "fedora") {
454 $all_text .= " </mets:fileGrp>\n";
455 }
456
457 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
458
459 return $all_text;
460}
461
462sub buffer_mets_StructMapSection_section_xml(){
463 my $self = shift(@_);
464 my ($section, $order_numref) = @_;
465
466 $section="" unless defined $section;
467
468
469 my $section_ptr=$self->_lookup_section($section);
470 return "" unless defined $section_ptr;
471
472
473 # output fileSection by Sections
474 my $section_num ="1". $section;
475 my $dmd_num = $section_num;
476
477 ##**output the dmdSection details
478 #if ($section_num eq "1") {
479 # $dmd_num = "0";
480 #}
481
482 #**output the StructMap details
483
484 my $dmdid_attr = "DM$dmd_num";
485
486 my $all_text = " <mets:div ID=\"DS$section_num\" TYPE=\"Section\" \n";
487 $all_text .= ' ORDER="'.$$order_numref++.'" ORDERLABEL="'. $section_num .'" '."\n";
488 $all_text .= " LABEL=\"$section_num\" DMDID=\"$dmdid_attr\">\n";
489
490 $all_text .= ' <mets:fptr FILEID="FILEGROUP_PRELUDE'.$section_num.'" />'. "\n";
491
492
493 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
494 $all_text .= $self->buffer_mets_StructMapSection_section_xml("$section.$subsection", $order_numref);
495 }
496
497 $all_text .= " </mets:div>\n";
498
499 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
500
501 return $all_text;
502}
503
504
505sub buffer_mets_StructMapWhole_section_xml(){
506 my $self = shift(@_);
507 my ($section) = @_;
508
509 my $section_ptr = $self-> _lookup_section($section);
510 return "" unless defined $section_ptr;
511
512 my $all_text="" unless defined $all_txt;
513 my $fileID=0;
514 my $order_num = 0;
515
516 $all_text .= ' <mets:div ID="DSAll" TYPE="Document" ORDER="'.$order_num.'" ORDERLABEL="All" LABEL="Whole Documemt" DMDID="DM1">' . "\n";
517
518 #** output the StructMapSection for the whole section
519 # get the sourcefile and associative file
520
521 foreach my $data (@{$section_ptr->{'metadata'}}){
522 my $escaped_value = &_escape_text($data->[1]);
523
524 if ($data->[0] eq "gsdlsourcefilename") {
525 ++$fileID;
526 $all_text .= ' <mets:fptr FILEID="default.'.$fileID.'" />'."\n";
527 }
528
529 if ($data->[0] eq "gsdlassocfile"){
530 ++$fileID;
531 $all_text .= ' <mets:fptr FILEID="default.'.$fileID. '" />'. "\n";
532 }
533 }
534 $all_text .= " </mets:div>\n";
535
536 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
537
538 return $all_text;
539}
540
541
542sub buffer_mets_dmdSection_section_xml(){
543 my $self = shift(@_);
544 my ($section,$version) = @_;
545
546 $section="" unless defined $section;
547
548 my $section_ptr=$self->_lookup_section($section);
549 return "" unless defined $section_ptr;
550
551 # convert section number
552 my $section_num ="1". $section;
553 my $dmd_num = $section_num;
554
555 # #**output the dmdSection details
556 # if ($section_num eq "1") {
557 # $dmd_num = "0";
558 # }
559
560
561 my $all_text = "";
562
563 my $label_attr = "";
564 if ($version eq "fedora") {
565 $all_text .= "<mets:amdSec ID=\"DC\" >\n";
566 $all_text .= " <mets:techMD ID=\"DC.0\">\n"; # .0 fedora version number?
567
568 $label_attr = "LABEL=\"Dublin Core Metadata\"";
569 }
570 else {
571 # TODO::
572 #print STDERR "***** Check that GROUPID in dmdSec is valid!!!\n";
573 #print STDERR "***** Check to see if <techMD> required\n";
574 # if it isn't allowed, go back and set $mdTag = dmdSec/amdSec
575
576 $all_text .= "<mets:dmdSec ID=\"DM$dmd_num\" GROUPID=\"$section_num\">\n";
577 }
578
579 $all_text .= " <mets:mdWrap $label_attr MDTYPE=\"OTHER\" OTHERMDTYPE=\"gsdl3\" ID=\"gsdl$section_num\">\n";
580 $all_text .= " <mets:xmlData>\n";
581
582 if ($version eq "fedora") {
583 my $dc_namespace = "";
584 $dc_namespace .= "xmlns:dc=\"http://purl.org/dc/elements/1.1/\"";
585 $dc_namespace .= " xmlns:oai_dc=\"http://www.openarchives.org/OAI/2.0/oai_dc/\">\n";
586
587 $all_text .= " <oai_dc:dc $dc_namespace>\n";
588
589 $all_text .= $self->buffer_dc_section($section,"oai_dc");
590 $all_text .= " </oai_dc:dc>\n";
591 }
592 else {
593 foreach my $data (@{$section_ptr->{'metadata'}}){
594 my $escaped_value = &_escape_text($data->[1]);
595 $all_text .= ' <gsdl3:Metadata name="'. $data->[0].'">'. $escaped_value. "</gsdl3:Metadata>\n";
596 if ($data->[0] eq "dc.Title") {
597 $all_text .= ' <gsdl3:Metadata name="Title">'. $escaped_value."</gsdl3:Metadata>\n";
598 }
599 }
600 }
601
602 $all_text .= " </mets:xmlData>\n";
603 $all_text .= " </mets:mdWrap>\n";
604
605 if ($version eq "fedora") {
606 $all_text .= " </mets:techMD>\n";
607 $all_text .= "</mets:amdSec>\n";
608 }
609 else {
610 $all_text .= "</mets:dmdSec>\n";
611 }
612
613
614 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
615 $all_text .= $self->buffer_mets_dmdSection_section_xml("$section.$subsection",$version);
616 }
617
618 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
619
620 return $all_text;
621}
622
623sub output_section {
624 my $self = shift (@_);
625 my ($handle, $section) = @_;
626
627 print $handle $self->buffer_section_xml($section);
628}
629
630# print out DSpace dublin_core metadata section
631sub output_dspace_section {
632 my $self = shift (@_);
633 my ($handle, $section) = @_;
634
635 my $section_ptr = $self->_lookup_section ($section);
636 return "" unless defined $section_ptr;
637
638 my $all_text = "<Section>\n";
639 $all_text .= " <Description>\n";
640
641 # output metadata
642 foreach my $data (@{$section_ptr->{'metadata'}}) {
643 my $escaped_value = &_escape_text($data->[1]);
644 $all_text .= ' <Metadata name="' . $data->[0] . '">' . $escaped_value . "</Metadata>\n";
645 }
646
647 $all_text .= " </Description>\n";
648 $all_text .= "</Section>\n";
649
650 # make sure no nasty control characters have snuck through
651 # (XML::Parser will barf on anything it doesn't consider to be
652 # valid UTF-8 text, including things like \c@, \cC etc.)
653 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
654
655 return $all_text;
656}
657
658# print out doctxt.xml file
659sub output_txt_section {
660 my $self = shift (@_);
661 my ($handle, $section) = @_;
662
663 print $handle $self->buffer_txt_section_xml($section);
664}
665
666# print out docmets.xml file
667sub output_mets_section {
668 my $self = shift(@_);
669 my ($handle, $section, $version, $working_dir) = @_;
670
671 # print out the dmdSection
672 print $handle $self->buffer_mets_dmdSection_section_xml($section, $version);
673
674 print $handle "<mets:fileSec>\n";
675 if ($version eq "fedora") {
676 print $handle " <mets:fileGrp ID=\"DATASTREAMS\">\n";
677 }
678
679 # print out the fileSection by sections
680 print $handle $self->buffer_mets_fileSection_section_xml($section,$version);
681
682 # print out the whole fileSection
683 print $handle $self->buffer_mets_fileWhole_section_xml($section,$version,$working_dir);
684
685 if ($version eq "fedora") {
686 print $handle " </mets:fileGrp>\n";
687 }
688 print $handle "</mets:fileSec>\n";
689
690 # print out the StructMapSection by sections
691
692 my $struct_type;
693 if ($version eq "fedora") {
694 $struct_type = "fedora:dsBindingMap";
695 }
696 else {
697 $struct_type = "Section";
698 }
699
700 if ($version ne "fedora") {
701 print $handle "<mets:structMap ID=\"Section\" TYPE=\"$struct_type\" LABEL=\"Section\">\n";
702 my $order_num=0;
703 print $handle $self->buffer_mets_StructMapSection_section_xml($section, \$order_num);
704 print $handle "</mets:structMap>\n";
705
706 print $handle '<mets:structMap ID="All" TYPE="Whole Document" LABEL="All">'."\n";
707 print $handle $self->buffer_mets_StructMapWhole_section_xml($section);
708 print $handle "</mets:structMap>\n";
709 }
710
711}
712
713my $dc_set = { Title => 1,
714 Creator => 1,
715 Subject => 1,
716 Description => 1,
717 Publisher => 1,
718 Contributor => 1,
719 Date => 1,
720 Type => 1,
721 Format => 1,
722 Identifier => 1,
723 Source => 1,
724 Language => 1,
725 Relation => 1,
726 Coverage => 1,
727 Rights => 1};
728
729
730
731
732# Build up dublin_core metadata. Priority given to dc.* over ex.*
733# This method was apparently added by Jeffrey and committed by Shaoqun.
734# But we don't know why it was added, so not using it anymore.
735sub new_buffer_dc_section {
736
737 my $self = shift(@_);
738 my ($section, $version) = @_;
739
740 # build up string of dublin core metadata
741 $section="" unless defined $section;
742
743 my $section_ptr=$self->_lookup_section($section);
744 return "" unless defined $section_ptr;
745 foreach my $data (@{$section_ptr->{'metadata'}}){
746 my $escaped_value = &_escape_text($data->[1]);
747 my $dc_element = $data->[0];
748
749 my @array = split('\.',$dc_element);
750 my ($type,$name);
751
752 if(defined $array[1])
753 {
754 $type = $array[0];
755 $name = $array[1];
756 }
757 else
758 {
759 $type = "ex";
760 $name = $array[0];
761 }
762
763 $all_text .= ' <Metadata Type="'. $type.'" Name="'.$name.'">'. $escaped_value. "</Metadata>\n";
764 }
765 return $all_text;
766}
767
768
769sub buffer_dc_section {
770 my $self = shift(@_);
771 my ($section, $version) = @_;
772
773 # build up string of dublin core metadata
774 $section="" unless defined $section;
775
776 my $section_ptr=$self->_lookup_section($section);
777 return "" unless defined $section_ptr;
778
779
780 my $explicit_dc = {};
781 my $explicit_ex = {};
782
783 my $all_text="";
784 foreach my $data (@{$section_ptr->{'metadata'}}){
785 foreach my $temp (@$data)
786 {
787 print "($temp) ";
788 }
789 print "\n";
790 my $escaped_value = &_escape_text($data->[1]);
791 if ($data->[0]=~ m/^dc\./) {
792 $data->[0] =~ tr/[A-Z]/[a-z]/;
793
794 $data->[0] =~ m/^dc\.(.*)/;
795 my $dc_element = $1;
796
797 if (!defined $explicit_dc->{$dc_element}) {
798 $explicit_dc->{$dc_element} = [];
799 }
800 push(@{$explicit_dc->{$dc_element}},$escaped_value);
801
802 #$all_text .= ' <dcvalue element="'. $data->[0].'" qualifier="#####">'. $escaped_value. "</dcvalue>\n";
803 if (defined $version && ($version eq "oai_dc")) {
804 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
805 }
806 else {
807 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
808 }
809
810 }
811 elsif (($data->[0] =~ m/^ex\./) || ($data->[0] !~ m/\./)) {
812 $data->[0] =~ m/^(ex\.)?(.*)/;
813 my $ex_element = $2;
814 my $lc_ex_element = lc($ex_element);
815
816 if (defined $dc_set->{$ex_element}) {
817 if (!defined $explicit_ex->{$lc_ex_element}) {
818 $explicit_ex->{$lc_ex_element} = [];
819 }
820 push(@{$explicit_ex->{$lc_ex_element}},$escaped_value);
821 }
822 }
823 }
824
825 # go through dc_set and for any element *not* defined in explicit_dc
826 # that do exist in explicit_ex, add it in as metadata
827 foreach my $k ( keys %$dc_set ) {
828 my $lc_k = lc($k);
829
830 if (!defined $explicit_dc->{$lc_k}) {
831 if (defined $explicit_ex->{$lc_k}) {
832
833 foreach my $v (@{$explicit_ex->{$lc_k}}) {
834 my $dc_element = $lc_k;
835 my $escaped_value = $v;
836
837 if (defined $version && ($version eq "oai_dc")) {
838 $all_text .= " <dc:$dc_element>$escaped_value</dc:$dc_element>\n";
839 }
840 else {
841 $all_text .= ' <dcvalue element="'. $dc_element.'">'. $escaped_value. "</dcvalue>\n";
842 }
843
844 }
845 }
846 }
847 }
848
849 if ($all_text eq "") {
850 $all_text .= " There is no Dublin Core metatdata in this document\n";
851 }
852 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
853
854 return $all_text;
855}
856
857
858# Print out dublin_core metadata
859# changed back to using old method, don't know why Jeffrey changed this
860sub output_dc_section {
861 my $self = shift(@_);
862 my ($handle, $section, $version) = @_;
863
864 my $all_text = $self->buffer_dc_section($section,$version);
865 #my $all_text = $self->new_buffer_dc_section($section,$version);
866
867 print $handle $all_text;
868}
869
870
871# look up the reference to the a particular section
872sub _lookup_section {
873 my $self = shift (@_);
874 my ($section) = @_;
875
876 my ($num);
877 my $sectionref = $self;
878
879 while (defined $section && $section ne "") {
880
881 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
882
883 $num =~ s/^0+(\d)/$1/ if defined $num ; # remove leading 0s
884
885 $section = "" unless defined $section;
886
887
888 if (defined $num && defined $sectionref->{'subsections'}->{$num}) {
889 $sectionref = $sectionref->{'subsections'}->{$num};
890 } else {
891 return undef;
892 }
893 }
894
895 return $sectionref;
896}
897
898# calculate OID by hashing the contents of the document
899sub _calc_OID {
900 my $self = shift (@_);
901 my ($filename) = @_;
902
903 my $osexe = &util::get_os_exe();
904
905 my $hashfile_exe = &util::filename_cat($ENV{'GSDLHOME'},"bin",
906 $ENV{'GSDLOS'},"hashfile$osexe");
907
908 my $result = "NULL";
909
910 if (-e "$hashfile_exe") {
911# $result = `\"$hashfile_exe\" \"$filename\"`;
912 $result = `hashfile$osexe \"$filename\"`;
913 ($result) = $result =~ /:\s*([0-9a-f]+)/i;
914
915 } else {
916 print STDERR "doc::_calc_OID $hashfile_exe could not be found\n";
917 }
918 return "HASH$result";
919}
920
921# methods dealing with OID, not groups of them.
922
923# if $OID is not provided one is calculated
924sub set_OID {
925 my $self = shift (@_);
926 my ($OID) = @_;
927
928 my $use_hash_oid = 0;
929 # if an OID wasn't provided claculate one
930 if (!defined $OID) {
931 $OID = "NULL";
932 if ($self->{'OIDtype'} eq "hash") {
933 $use_hash_oid = 1;
934 } elsif ($self->{'OIDtype'} eq "incremental") {
935 $OID = "D" . $OIDcount;
936 $OIDcount ++;
937
938 } elsif ($self->{'OIDtype'} eq "dirname") {
939 $OID = 'J';
940 my $filename = $self->get_source_filename();
941 if (defined($filename)) { # && -e $filename) {
942 $OID = &File::Basename::dirname($filename);
943 if (defined $OID) {
944 $OID = 'J'.&File::Basename::basename($OID);
945 $OID =~ s/\.//; #remove any periods
946 } else {
947 print STDERR "Failed to find base for filename ($filename)...generating hash id\n";
948 $use_hash_oid = 1;
949 }
950 } else {
951 print STDERR "Failed to find filename, generating hash id\n";
952 $use_hash_oid = 1;
953 }
954
955 } elsif ($self->{'OIDtype'} eq "assigned") {
956 my $identifier = $self->get_metadata_element ($self->get_top_section(), $self->{'OIDmetadata'});
957 if (defined $identifier && $identifier ne "") {
958 $OID = "D" . $identifier;
959 $OID =~ s/\.//; #remove any periods
960 } else {
961 # need a hash id
962 print STDERR "no $self->{'OIDmetadata'} metadata found, generating hash id\n";
963 $use_hash_oid = 1;
964 }
965
966 } else {
967 $use_hash_oid = 1;
968 }
969
970 if ($use_hash_oid) {
971
972 # "hash" OID - feed file to hashfile.exe
973 #my $filename = $self->get_source_filename();
974 # we want to use the converted file for hashing if available
975 # cos its quicker
976 my $filename = $self->get_filename_for_hashing();
977 # -z: don't want to hash on the file if it is zero size
978 if (defined($filename) && -e $filename && !-z $filename) {
979 $OID = $self->_calc_OID ($filename);
980 } else {
981 $filename = &util::get_tmp_filename();
982 if (!open (OUTFILE, ">$filename")) {
983 print STDERR "doc::set_OID could not write to $filename\n";
984 } else {
985 $self->output_section('OUTFILE', $self->get_top_section(),
986 undef, 1);
987 close (OUTFILE);
988 }
989 $OID = $self->_calc_OID ($filename);
990 &util::rm ($filename);
991 }
992 }
993 }
994 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
995}
996
997# this uses hashdoc (embedded c thingy) which is faster but still
998# needs a little work to be suffiently stable
999sub ___set_OID {
1000 my $self = shift (@_);
1001 my ($OID) = @_;
1002
1003 # if an OID wasn't provided then calculate hash value based on document
1004 if (!defined $OID)
1005 {
1006 my $hash_text = $self->buffer_section_gml($self->get_top_section(),
1007 undef, 1);
1008 my $hash_len = length($hash_text);
1009
1010 $OID = &hashdoc::buffer($hash_text,$hash_len);
1011 }
1012
1013 $self->set_metadata_element ($self->get_top_section(), "Identifier", $OID);
1014}
1015
1016# returns the OID for this document
1017sub get_OID {
1018 my $self = shift (@_);
1019 my $OID = $self->get_metadata_element ($self->get_top_section(), "Identifier");
1020 return $OID if (defined $OID);
1021 return "NULL";
1022}
1023
1024sub delete_OID {
1025 my $self = shift (@_);
1026
1027 $self->set_metadata_element ($self->get_top_section(), "Identifier", "NULL");
1028}
1029
1030
1031# methods for manipulating section names
1032
1033# returns the name of the top-most section (the top
1034# level of the document
1035sub get_top_section {
1036 my $self = shift (@_);
1037
1038 return "";
1039}
1040
1041# returns a section
1042sub get_parent_section {
1043 my $self = shift (@_);
1044 my ($section) = @_;
1045
1046 $section =~ s/(^|\.)\d+$//;
1047
1048 return $section;
1049}
1050
1051# returns the first child section (or the end child
1052# if there isn't any)
1053sub get_begin_child {
1054 my $self = shift (@_);
1055 my ($section) = @_;
1056
1057 my $section_ptr = $self->_lookup_section($section);
1058 return "" unless defined $section_ptr;
1059
1060 if (defined $section_ptr->{'subsection_order'}->[0]) {
1061 return "$section.$section_ptr->{'subsection_order'}->[0]";
1062 }
1063
1064 return $self->get_end_child ($section);
1065}
1066
1067# returns the next child of a parent section
1068sub get_next_child {
1069 my $self = shift (@_);
1070 my ($section) = @_;
1071
1072 my $parent_section = $self->get_parent_section($section);
1073 my $parent_section_ptr = $self->_lookup_section($parent_section);
1074 return undef unless defined $parent_section_ptr;
1075
1076 my ($section_num) = $section =~ /(\d+)$/;
1077 return undef unless defined $section_num;
1078
1079 my $i = 0;
1080 my $section_order = $parent_section_ptr->{'subsection_order'};
1081 while ($i < scalar(@$section_order)) {
1082 last if $section_order->[$i] eq $section_num;
1083 $i++;
1084 }
1085
1086 $i++; # the next child
1087 if ($i < scalar(@$section_order)) {
1088 return $section_order->[$i] if $parent_section eq "";
1089 return "$parent_section.$section_order->[$i]";
1090 }
1091
1092 # no more sections in this level
1093 return undef;
1094}
1095
1096# returns a reference to a list of children
1097sub get_children {
1098 my $self = shift (@_);
1099 my ($section) = @_;
1100
1101 my $section_ptr = $self->_lookup_section($section);
1102 return [] unless defined $section_ptr;
1103
1104 my @children = @{$section_ptr->{'subsection_order'}};
1105
1106 map {$_ = "$section.$_"; $_ =~ s/^\.+//;} @children;
1107 return \@children;
1108}
1109
1110# returns the child section one past the last one (which
1111# is coded as "0")
1112sub get_end_child {
1113 my $self = shift (@_);
1114 my ($section) = @_;
1115
1116 return $section . ".0" unless $section eq "";
1117 return "0";
1118}
1119
1120# returns the next section in book order
1121sub get_next_section {
1122 my $self = shift (@_);
1123 my ($section) = @_;
1124
1125 return undef unless defined $section;
1126
1127 my $section_ptr = $self->_lookup_section($section);
1128 return undef unless defined $section_ptr;
1129
1130 # first try to find first child
1131 if (defined $section_ptr->{'subsection_order'}->[0]) {
1132 return $section_ptr->{'subsection_order'}->[0] if ($section eq "");
1133 return "$section.$section_ptr->{'subsection_order'}->[0]";
1134 }
1135
1136 do {
1137 # try to find sibling
1138 my $next_child = $self->get_next_child ($section);
1139 return $next_child if (defined $next_child);
1140
1141 # move up one level
1142 $section = $self->get_parent_section ($section);
1143 } while $section =~ /\d/;
1144
1145 return undef;
1146}
1147
1148sub is_leaf_section {
1149 my $self = shift (@_);
1150 my ($section) = @_;
1151
1152 my $section_ptr = $self->_lookup_section($section);
1153 return 1 unless defined $section_ptr;
1154
1155 return (scalar (@{$section_ptr->{'subsection_order'}}) == 0);
1156}
1157
1158# methods for dealing with sections
1159
1160# returns the name of the inserted section
1161sub insert_section {
1162 my $self = shift (@_);
1163 my ($before_section) = @_;
1164
1165 # get the child to insert before and its parent section
1166 my $parent_section = "";
1167 my $before_child = "0";
1168 my @before_section = split (/\./, $before_section);
1169 if (scalar(@before_section) > 0) {
1170 $before_child = pop (@before_section);
1171 $parent_section = join (".", @before_section);
1172 }
1173
1174 my $parent_section_ptr = $self->_lookup_section($parent_section);
1175 if (!defined $parent_section_ptr) {
1176 print STDERR "doc::insert_section couldn't find parent section " .
1177 "$parent_section\n";
1178 return;
1179 }
1180
1181 # get the next section number
1182 my $section_num = $parent_section_ptr->{'next_subsection'}++;
1183
1184 my $i = 0;
1185 while ($i < scalar(@{$parent_section_ptr->{'subsection_order'}}) &&
1186 $parent_section_ptr->{'subsection_order'}->[$i] ne $before_child) {
1187 $i++;
1188 }
1189
1190 # insert the section number into the order list
1191 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 0, $section_num);
1192
1193 # add this section to the parent section
1194 my $section_ptr = {'subsection_order'=>[],
1195 'next_subsection'=>1,
1196 'subsections'=>{},
1197 'metadata'=>[],
1198 'text'=>""};
1199 $parent_section_ptr->{'subsections'}->{$section_num} = $section_ptr;
1200
1201 # work out the full section number
1202 my $section = $parent_section;
1203 $section .= "." unless $section eq "";
1204 $section .= $section_num;
1205
1206 return $section;
1207}
1208
1209# creates a pre-named section
1210sub create_named_section {
1211 my $self = shift (@_);
1212 my ($mastersection) = @_;
1213
1214 my ($num);
1215 my $section = $mastersection;
1216 my $sectionref = $self;
1217
1218 while ($section ne "") {
1219 ($num, $section) = $section =~ /^\.?(\d+)(.*)$/;
1220 $num =~ s/^0+(\d)/$1/; # remove leading 0s
1221 $section = "" unless defined $section;
1222
1223 if (defined $num) {
1224 if (!defined $sectionref->{'subsections'}->{$num}) {
1225 push (@{$sectionref->{'subsection_order'}}, $num);
1226 $sectionref->{'subsections'}->{$num} = {'subsection_order'=>[],
1227 'next_subsection'=>1,
1228 'subsections'=>{},
1229 'metadata'=>[],
1230 'text'=>""};
1231 if ($num >= $sectionref->{'next_subsection'}) {
1232 $sectionref->{'next_subsection'} = $num + 1;
1233 }
1234 }
1235 $sectionref = $sectionref->{'subsections'}->{$num};
1236
1237 } else {
1238 print STDERR "doc::create_named_section couldn't create section ";
1239 print STDERR "$mastersection\n";
1240 last;
1241 }
1242 }
1243}
1244
1245# returns a reference to a list of subsections
1246sub list_subsections {
1247 my $self = shift (@_);
1248 my ($section) = @_;
1249
1250 my $section_ptr = $self->_lookup_section ($section);
1251 if (!defined $section_ptr) {
1252 print STDERR "doc::list_subsections couldn't find section $section\n";
1253 return [];
1254 }
1255
1256 return [@{$section_ptr->{'subsection_order'}}];
1257}
1258
1259sub delete_section {
1260 my $self = shift (@_);
1261 my ($section) = @_;
1262
1263# my $section_ptr = {'subsection_order'=>[],
1264# 'next_subsection'=>1,
1265# 'subsections'=>{},
1266# 'metadata'=>[],
1267# 'text'=>""};
1268
1269 # if this is the top section reset everything
1270 if ($section eq "") {
1271 $self->{'subsection_order'} = [];
1272 $self->{'subsections'} = {};
1273 $self->{'metadata'} = [];
1274 $self->{'text'} = "";
1275 return;
1276 }
1277
1278 # find the parent of the section to delete
1279 my $parent_section = "";
1280 my $child = "0";
1281 my @section = split (/\./, $section);
1282 if (scalar(@section) > 0) {
1283 $child = pop (@section);
1284 $parent_section = join (".", @section);
1285 }
1286
1287 my $parent_section_ptr = $self->_lookup_section($parent_section);
1288 if (!defined $parent_section_ptr) {
1289 print STDERR "doc::delete_section couldn't find parent section " .
1290 "$parent_section\n";
1291 return;
1292 }
1293
1294 # remove this section from the subsection_order list
1295 my $i = 0;
1296 while ($i < scalar (@{$parent_section_ptr->{'subsection_order'}})) {
1297 if ($parent_section_ptr->{'subsection_order'}->[$i] eq $child) {
1298 splice (@{$parent_section_ptr->{'subsection_order'}}, $i, 1);
1299 last;
1300 }
1301 $i++;
1302 }
1303
1304 # remove this section from the subsection hash
1305 if (defined ($parent_section_ptr->{'subsections'}->{$child})) {
1306 undef $parent_section_ptr->{'subsections'}->{$child};
1307 }
1308}
1309
1310#--
1311# methods for dealing with metadata
1312
1313# set_metadata_element and get_metadata_element are for metadata
1314# which should only have one value. add_meta_data and get_metadata
1315# are for metadata which can have more than one value.
1316
1317# returns the first metadata value which matches field
1318
1319# This version of get metadata element works much like the one above,
1320# except it allows for the namespace portion of a metadata element to
1321# be ignored, thus if you are searching for dc.Title, the first piece
1322# of matching metadata ending with the name Title (once any namespace
1323# is removed) would be returned.
1324# 28-11-2003 John Thompson
1325sub get_metadata_element {
1326 my $self = shift (@_);
1327 my ($section, $field, $ignore_namespace) = @_;
1328 my ($data);
1329
1330 $ignore_namespace = 0 unless defined $ignore_namespace;
1331
1332 my $section_ptr = $self->_lookup_section($section);
1333 if (!defined $section_ptr) {
1334 print STDERR "doc::get_metadata_element couldn't find section ", $section, "\n";
1335 return;
1336 }
1337
1338 # Remove the any namespace if we are being told to ignore them
1339 if($ignore_namespace) {
1340 $field =~ s/^\w*\.//;
1341 }
1342
1343 foreach $data (@{$section_ptr->{'metadata'}}) {
1344
1345 my $data_name = $data->[0];
1346 # Remove the any namespace if we are being told to ignore them
1347 if($ignore_namespace) {
1348 $data_name =~ s/^\w*\.//;
1349 }
1350
1351 return $data->[1] if (scalar(@$data) >= 2 && $data_name eq $field);
1352 }
1353
1354 return undef; # was not found
1355}
1356
1357# returns a list of the form [value1, value2, ...]
1358sub get_metadata {
1359 my $self = shift (@_);
1360 my ($section, $field, $ignore_namespace) = @_;
1361 my ($data);
1362
1363 $ignore_namespace = 0 unless defined $ignore_namespace;
1364
1365 my $section_ptr = $self->_lookup_section($section);
1366 if (!defined $section_ptr) {
1367 print STDERR "doc::get_metadata couldn't find section ",
1368 $section, "\n";
1369 return;
1370 }
1371
1372 # Remove the any namespace if we are being told to ignore them
1373 if($ignore_namespace) {
1374 $field =~ s/^\w*\.//;
1375 }
1376
1377 my @metadata = ();
1378 foreach $data (@{$section_ptr->{'metadata'}}) {
1379
1380 my $data_name = $data->[0];
1381 # Remove the any namespace if we are being told to ignore them
1382 if($ignore_namespace) {
1383 $data_name =~ s/^\w*\.//;
1384 }
1385
1386 push (@metadata, $data->[1]) if ($data_name eq $field);
1387 }
1388
1389 return \@metadata;
1390}
1391
1392# returns a list of the form [[field,value],[field,value],...]
1393sub get_all_metadata {
1394 my $self = shift (@_);
1395 my ($section) = @_;
1396
1397 my $section_ptr = $self->_lookup_section($section);
1398 if (!defined $section_ptr) {
1399 print STDERR "doc::get_all_metadata couldn't find section ", $section, "\n";
1400 return;
1401 }
1402
1403 return $section_ptr->{'metadata'};
1404}
1405
1406# returns a xml element of the form <MetadataList><Metadata name="metadata-name">metadata_value</Metadata>...</MetadataList>
1407sub get_top_metadata_list{
1408 my $self = shift (@_);
1409
1410 my @topmetadata =$self->get_all_metadata($self->get_top_section());
1411 my $metadatalist ='<MetadataList>';
1412
1413 foreach my $i (@topmetadata){
1414 foreach my $j (@$i){
1415 my %metaMap = @$j;
1416 foreach my $key (keys %metaMap){
1417 $metadatalist .='<Metadata name='."\"$key\"".'>'.$metaMap{$key}.'</Metadata>'."\n";
1418 }
1419 }
1420 }
1421
1422 $metadatalist .='</MetadataList>';
1423
1424 return $metadatalist;
1425}
1426
1427
1428# $value is optional
1429sub delete_metadata {
1430 my $self = shift (@_);
1431 my ($section, $field, $value) = @_;
1432
1433 my $section_ptr = $self->_lookup_section($section);
1434 if (!defined $section_ptr) {
1435 print STDERR "doc::delete_metadata couldn't find section ", $section, "\n";
1436 return;
1437 }
1438
1439 my $i = 0;
1440 while ($i < scalar (@{$section_ptr->{'metadata'}})) {
1441 if (($section_ptr->{'metadata'}->[$i]->[0] eq $field) &&
1442 (!defined $value || $section_ptr->{'metadata'}->[$i]->[1] eq $value)) {
1443 splice (@{$section_ptr->{'metadata'}}, $i, 1);
1444 } else {
1445 $i++;
1446 }
1447 }
1448}
1449
1450sub delete_all_metadata {
1451 my $self = shift (@_);
1452 my ($section) = @_;
1453
1454 my $section_ptr = $self->_lookup_section($section);
1455 if (!defined $section_ptr) {
1456 print STDERR "doc::delete_all_metadata couldn't find section ", $section, "\n";
1457 return;
1458 }
1459
1460 $section_ptr->{'metadata'} = [];
1461}
1462
1463sub set_metadata_element {
1464 my $self = shift (@_);
1465 my ($section, $field, $value) = @_;
1466
1467 $self->set_utf8_metadata_element ($section, $field,
1468 &unicode::ascii2utf8(\$value));
1469}
1470
1471# set_utf8_metadata_element assumes the text has already been
1472# converted to the UTF-8 encoding.
1473sub set_utf8_metadata_element {
1474 my $self = shift (@_);
1475 my ($section, $field, $value) = @_;
1476
1477 $self->delete_metadata ($section, $field);
1478 $self->add_utf8_metadata ($section, $field, $value);
1479}
1480
1481
1482# add_metadata assumes the text is in (extended) ascii form. For
1483# text which has already been converted to the UTF-8 format use
1484# add_utf8_metadata.
1485sub add_metadata {
1486 my $self = shift (@_);
1487 my ($section, $field, $value) = @_;
1488
1489 $self->add_utf8_metadata ($section, $field,
1490 &unicode::ascii2utf8(\$value));
1491}
1492
1493sub add_utf8_metadata {
1494 my $self = shift (@_);
1495 my ($section, $field, $value) = @_;
1496
1497 my $section_ptr = $self->_lookup_section($section);
1498 if (!defined $section_ptr) {
1499 print STDERR "doc::add_utf8_metadata couldn't find section ", $section, "\n";
1500 return;
1501 }
1502 if (!defined $value) {
1503 print STDERR "doc::add_utf8_metadata undefined value for $field\n";
1504 return;
1505 }
1506 if (!defined $field) {
1507 print STDERR "doc::add_utf8_metadata undefined metadata type \n";
1508 return;
1509 }
1510
1511 #print STDERR "###$field=$value\n";
1512 # double check that the value is utf-8
1513 if (unicode::ensure_utf8(\$value)) {
1514 print STDERR "doc::add_utf8_metadata: warning: '$field' wasn't utf8\n";
1515 }
1516
1517 push (@{$section_ptr->{'metadata'}}, [$field, $value]);
1518}
1519
1520
1521# methods for dealing with text
1522
1523# returns the text for a section
1524sub get_text {
1525 my $self = shift (@_);
1526 my ($section) = @_;
1527
1528 my $section_ptr = $self->_lookup_section($section);
1529 if (!defined $section_ptr) {
1530 print STDERR "doc::get_text couldn't find section " .
1531 "$section\n";
1532 return "";
1533 }
1534
1535 return $section_ptr->{'text'};
1536}
1537
1538# returns the (utf-8 encoded) length of the text for a section
1539sub get_text_length {
1540 my $self = shift (@_);
1541 my ($section) = @_;
1542
1543 my $section_ptr = $self->_lookup_section($section);
1544 if (!defined $section_ptr) {
1545 print STDERR "doc::get_text_length couldn't find section " .
1546 "$section\n";
1547 return 0;
1548 }
1549
1550 return length ($section_ptr->{'text'});
1551}
1552
1553sub delete_text {
1554 my $self = shift (@_);
1555 my ($section) = @_;
1556
1557 my $section_ptr = $self->_lookup_section($section);
1558 if (!defined $section_ptr) {
1559 print STDERR "doc::delete_text couldn't find section " .
1560 "$section\n";
1561 return;
1562 }
1563
1564 $section_ptr->{'text'} = "";
1565}
1566
1567# add_text assumes the text is in (extended) ascii form. For
1568# text which has been already converted to the UTF-8 format
1569# use add_utf8_text.
1570sub add_text {
1571 my $self = shift (@_);
1572 my ($section, $text) = @_;
1573
1574 # convert the text to UTF-8 encoded unicode characters
1575 # and add the text
1576 $self->add_utf8_text($section, &unicode::ascii2utf8(\$text));
1577}
1578
1579
1580# add_utf8_text assumes the text to be added has already
1581# been converted to the UTF-8 encoding. For ascii text use
1582# add_text
1583sub add_utf8_text {
1584 my $self = shift (@_);
1585 my ($section, $text) = @_;
1586
1587 my $section_ptr = $self->_lookup_section($section);
1588 if (!defined $section_ptr) {
1589 print STDERR "doc::add_utf8_text couldn't find section " .
1590 "$section\n";
1591 return;
1592 }
1593
1594 $section_ptr->{'text'} .= $text;
1595}
1596
1597
1598# methods for dealing with associated files
1599
1600# a file is associated with a document, NOT a section.
1601# if section is defined it is noted in the data structure
1602# only so that files associated from a particular section
1603# may be removed later (using delete_section_assoc_files)
1604sub associate_file {
1605 my $self = shift (@_);
1606 my ($real_filename, $assoc_filename, $mime_type, $section) = @_;
1607 $mime_type = &ghtml::guess_mime_type ($real_filename) unless defined $mime_type;
1608
1609 # remove all associated files with the same name
1610 $self->delete_assoc_file ($assoc_filename);
1611
1612 push (@{$self->{'associated_files'}},
1613 [$real_filename, $assoc_filename, $mime_type, $section]);
1614}
1615
1616# returns a list of associated files in the form
1617# [[real_filename, assoc_filename, mimetype], ...]
1618sub get_assoc_files {
1619 my $self = shift (@_);
1620
1621 return $self->{'associated_files'};
1622}
1623
1624sub delete_section_assoc_files {
1625 my $self = shift (@_);
1626 my ($section) = @_;
1627
1628 my $i=0;
1629 while ($i < scalar (@{$self->{'associated_files'}})) {
1630 if (defined $self->{'associated_files'}->[$i]->[3] &&
1631 $self->{'associated_files'}->[$i]->[3] eq $section) {
1632 splice (@{$self->{'associated_files'}}, $i, 1);
1633 } else {
1634 $i++;
1635 }
1636 }
1637}
1638
1639sub delete_assoc_file {
1640 my $self = shift (@_);
1641 my ($assoc_filename) = @_;
1642
1643 my $i=0;
1644 while ($i < scalar (@{$self->{'associated_files'}})) {
1645 if ($self->{'associated_files'}->[$i]->[1] eq $assoc_filename) {
1646 splice (@{$self->{'associated_files'}}, $i, 1);
1647 } else {
1648 $i++;
1649 }
1650 }
1651}
1652
1653sub reset_nextsection_ptr {
1654 my $self = shift (@_);
1655 my ($section) = @_;
1656
1657 my $section_ptr = $self->_lookup_section($section);
1658 $section_ptr->{'next_subsection'} = 1;
1659}
1660
16611;
Note: See TracBrowser for help on using the repository browser.