source: gsdl/trunk/perllib/plugouts/FedoraMETSPlugout.pm@ 19216

Last change on this file since 19216 was 19216, checked in by kjdon, 15 years ago

fixed a string key mis-name

File size: 19.1 KB
Line 
1###########################################################################
2#
3# FedoraMETSPlugout.pm -- the plugout module for METS archives
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# But WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package FedoraMETSPlugout;
27
28use strict;
29no strict 'refs';
30
31#eval {require bytes};
32#use util;
33use METSPlugout;
34#use docprint; # for escape_text
35
# Set up inheritance at compile time: FedoraMETSPlugout is a
# specialisation of METSPlugout.
BEGIN {
    @FedoraMETSPlugout::ISA = qw(METSPlugout);
}
39
# Arguments specific to this plugout.  The only one is the namespace
# used as the prefix of the Fedora object identifiers.
my $arguments = [
    {
        'name'      => 'fedora_namespace',
        'desc'      => '{FedoraMETSPlugout.fedora_namespace}',
        'type'      => 'string',
        'deft'      => 'greenstone',
        'reqd'      => 'no',
        'hiddengli' => 'no',
    },
];

# Description record for this plugout; pushed onto the "OptList" of
# the argument/option lists by new().
my $options = {
    'name'     => 'FedoraMETSPlugout',
    'desc'     => '{FedoraMETSPlugout.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
57
58
# Constructor.
#
#   $plugoutlist     - array ref; the name of this class is appended to it
#   $inputargs       - handed on unchanged to the METSPlugout constructor
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays are
#                      extended with this plugout's arguments and options
#
# Returns the object created by the METSPlugout constructor, re-blessed
# into $class.
sub new
{
    my ($class) = shift (@_);
    my ($plugoutlist, $inputargs, $hashArgOptLists) = @_;
    push(@$plugoutlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Use an explicit class-method call.  The indirect-object form
    # "new METSPlugout(...)" is parsed by heuristic and is ambiguous
    # inside a package that itself defines a sub called "new".
    my $self = METSPlugout->new($plugoutlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
73
74
# Write the opening of the METS document for one Fedora object.
#
#   $handle    - output filehandle
#   $OID       - document identifier, used in the OBJID attribute
#   $doc_title - document title, used as the LABEL attribute
#
# The OBJID has the form "<namespace>:<collection>-<OID>", where the
# namespace is the 'fedora_namespace' setting ("test" when unset) and
# the collection comes from the GSDLCOLLECTION environment variable.
sub output_mets_xml_header
{
    my ($self, $handle, $OID, $doc_title) = @_;

    my $oid_namespace = $self->{'fedora_namespace'};
    $oid_namespace = "test" if !defined $oid_namespace;

    my $collection = $ENV{'GSDLCOLLECTION'};

    # Might need the following in the schemeLocation attribute for Fedora3
    # http://www.fedora.info/definitions/1/0/mets-fedora-ext1-1.xsd
    my $extra_attr = "OBJID=\"" . $oid_namespace . ":" . $collection . "-" . $OID . "\""
        . " TYPE=\"FedoraObject\""
        . " LABEL=\"" . $doc_title . "\"";

    # Major version 2 gets the extension schema; anything else gets the
    # EXT_VERSION attribute instead.
    my $is_fedora2 = ($ENV{'FEDORA_VERSION'} =~ m/^2/);

    my $extra_schema = undef;
    if ($is_fedora2) {
        $extra_schema = "http://www.fedora.info/definitions/1/0/mets-fedora-ext.xsd";
    }
    else {
        $extra_attr .= " EXT_VERSION=\"1.1\"";
    }

    $self->output_mets_xml_header_extra_attribute($handle, $extra_attr, $extra_schema);

    # A = active
    print $handle '<mets:metsHdr RECORDSTATUS="A"/>' . "\n";
}
103
104#
105# Print out "family" of doctxt.xml files
106#
107
# Write the text of one section to its own doctxt<N>.xml file in
# $working_dir, then do the same for each of its subsections.
#
# The file name is built from "1" followed by the section identifier
# with every "." turned into "_" (so section ".2.1" gives
# doctxt1_2_1.xml).  Nothing is written if the section cannot be found.
sub saveas_doctxt_section
{
    my ($self, $doc_obj, $working_dir, $section) = @_;

    my $section_ptr = $doc_obj->_lookup_section($section);
    return unless defined $section_ptr;

    (my $file_suffix = "1" . $section) =~ s/\./_/g;

    my $doc_txt_file = &util::filename_cat($working_dir, "doctxt$file_suffix.xml");

    $self->open_xslt_pipe($doc_txt_file, $self->{'xslt_txt'});

    # Write through the XSLT writer when one is set on the object,
    # otherwise straight to a handle on the file.
    my $outhandler = (defined $self->{'xslt_writer'})
        ? $self->{'xslt_writer'}
        : $self->get_output_handler($doc_txt_file);

    $self->output_xml_header($outhandler);
    $self->output_txt_section($outhandler, $doc_obj, $section);
    $self->output_xml_footer($outhandler);

    if (!defined $self->{'xslt_writer'}) {
        close($outhandler);
    }
    else {
        $self->close_xslt_pipe();
    }

    # Each subsection goes into a separate file of its own
    for my $child (@{$section_ptr->{'subsection_order'}}) {
        $self->saveas_doctxt_section($doc_obj, $working_dir, "$section.$child");
    }
}
153
154
# Save the text of the whole document: one doctxt file per section,
# starting from the top section, followed by the table of contents.
sub saveas_doctxt
{
    my ($self, $doc_obj, $working_dir) = @_;

    my $top_section = $doc_obj->get_top_section();
    $self->saveas_doctxt_section($doc_obj, $working_dir, $top_section);

    return $self->saveas_toc($doc_obj, $working_dir);
}
166
# Build the nested <Section> table-of-contents XML for $section and
# everything below it.
#
#   $depth - nesting level; each level indents by ($depth*2) copies of
#            the indent string
#
# Returns the XML as a string, or "" if the section cannot be found.
sub buffer_toc
{
    my ($self, $doc_obj, $working_dir, $section, $depth) = @_;

    my $section_ptr = $doc_obj->_lookup_section($section);
    if (!defined $section_ptr) {
        return "";
    }

    my $indent = " " x ($depth*2);

    # Section ids are the section identifier with a leading "1"
    my $toc_xml = $indent . "<Section id=\"1" . $section . "\">\n";

    # Nest the entries of the subsections inside this one
    for my $child (@{$section_ptr->{'subsection_order'}}) {
        $toc_xml .= $self->buffer_toc($doc_obj, $working_dir,
                                      "$section.$child", $depth + 1);
    }

    return $toc_xml . $indent . "</Section>\n";
}
194
195
# Write the table of contents of the document to doctoc.xml in
# $working_dir.  Nothing is written when the top section has no
# subsections, since there is then no nested TOC.
sub saveas_toc
{
    my ($self, $doc_obj, $working_dir) = @_;

    my $top_section = $doc_obj->get_top_section();
    my $section_ptr = $doc_obj->_lookup_section($top_section);
    my $num_subsections = scalar(@{$section_ptr->{'subsection_order'}});

    if ($num_subsections > 0) {

        my $toc_file = &util::filename_cat($working_dir, "doctoc.xml");

        $self->open_xslt_pipe($toc_file, $self->{'xslt_txt'});

        # Write through the XSLT writer when one is set on the object,
        # otherwise straight to a handle on the file.
        my $outhandler = (defined $self->{'xslt_writer'})
            ? $self->{'xslt_writer'}
            : $self->get_output_handler($toc_file);

        print $outhandler $self->buffer_toc($doc_obj, $working_dir, $top_section, 0);

        if (!defined $self->{'xslt_writer'}) {
            close($outhandler);
        }
        else {
            $self->close_xslt_pipe();
        }
    }

}
232
233
# Build the RELS-EXT (relationship metadata) section for the document.
#
# Currently always returns the empty string: the code that would emit
# the section for image documents is disabled (see the note below).
# The Fedora id and plugin type are still worked out so that the
# disabled code can be re-enabled as is.
sub buffer_mets_relsext_xml
{
    my ($self, $doc_obj) = @_;

    my $OID = $doc_obj->get_OID();

    my $oid_namespace = $self->{'fedora_namespace'};
    $oid_namespace = "test" if !defined $oid_namespace;

    my $collection = $ENV{'GSDLCOLLECTION'};

    # Same id format as the OBJID written in the METS header
    my $fed_id = $oid_namespace . ":" . $collection . "-" . $OID;

    my $relsext_xml = "";

    my $top_section = $doc_obj->get_top_section();
    my $plugin_type = $doc_obj->get_metadata_element($top_section, "Plugin");

# Images do not get ingested into Fedora when on Linux if the following is included
# Needs more investigation, since we'd like a working version of the following
# in order to get thumbnails working and other stuff.
#    if ((defined $plugin_type) && ($plugin_type eq "ImagePlugin"))
#    {
#
#        $relsext_xml .= "<mets:amdSec ID=\"RELS-EXT\">\n";
#        $relsext_xml .= " <mets:techMD ID=\"RELS-EXT1.0\" STATUS=\"A\">\n";
#        $relsext_xml .= " <mets:mdWrap LABEL=\"RELS-EXT - RDF formatted relationship metadata\" MDTYPE=\"OTHER\" MIMETYPE=\"text/xml\">\n";
#        $relsext_xml .= " <mets:xmlData>\n";
#        $relsext_xml .= " <rdf:RDF xmlns:rdf=\"http://www.w3.org/1999/02/22-rdf-syntax-ns#\" xmlns:fedora-model=\"info:fedora/fedora-system:def/model#\">\n";
#        $relsext_xml .= " <rdf:Description rdf:about=\"info:fedora/$fed_id\">\n";
#        $relsext_xml .= " <fedora-model:hasContentModel rdf:resource=\"info:fedora/demo:UVA_STD_IMAGE\"/>\n";
#        $relsext_xml .= " </rdf:Description>\n";
#        $relsext_xml .= " </rdf:RDF>\n";
#        $relsext_xml .= " </mets:xmlData>\n";
#        $relsext_xml .= " </mets:mdWrap>\n";
#        $relsext_xml .= " </mets:techMD>\n";
#        $relsext_xml .= "</mets:amdSec>\n";
#    }

    return $relsext_xml;
}
275
276
277#
278# Print out docmets.xml file
279#
# Print the body of docmets.xml for $section to $handle: the metadata
# sections, the (currently empty) RELS-EXT section, and a fileSec whose
# DATASTREAMS group holds the table of contents (only when the section
# has subsections), the per-section files and the whole-document files.
sub output_mets_section
{
    my ($self, $handle, $doc_obj, $section, $working_dir) = @_;

    # Metadata first, then relationship metadata
    print {$handle} $self->buffer_mets_dmdSection_section_xml($doc_obj, $section);
    print {$handle} $self->buffer_mets_relsext_xml($doc_obj);

    print {$handle} "<mets:fileSec>\n";
    print {$handle} " <mets:fileGrp ID=\"DATASTREAMS\">\n";

    # Filestream for the Table of Contents (TOC); with no subsections
    # there is no nested TOC to point at
    my $section_ptr = $doc_obj->_lookup_section($section);
    if (scalar(@{$section_ptr->{'subsection_order'}}) > 0) {
        print {$handle} $self->buffer_mets_fileSection_toc($doc_obj, $section, $working_dir);
    }

    # The fileSection section by section ...
    print {$handle} $self->buffer_mets_fileSection_section_xml($doc_obj, $section, $working_dir);

    # ... and for the document as a whole
    print {$handle} $self->buffer_mets_fileWhole_section_xml($doc_obj, $section, $working_dir);

    print {$handle} " </mets:fileGrp>\n";
    print {$handle} "</mets:fileSec>\n";

    # The StructMapSection is not printed yet.
    #
    # If the document is going to make use of disseminators (BMech and
    # BDef) then more output XML needs to be coded up here (structMap)
    # and in METS:behaviorSec (Fedora extension?) sections

    my $struct_type = "fedora:dsBindingMap";
}
321
# Return the opening tags of an amdSec/techMD/mdWrap/xmlData wrapper.
#
#   $section - section identifier, appended to $id in the amdSec and
#              techMD IDs
#   $id      - ID prefix (e.g. "DC")
sub buffer_mets_amdSec_header
{
    my ($self, $section, $id) = @_;

    # The mdWrap ID uses the section identifier with a leading "1"
    my $section_num = "1" . $section;

    my $label_attr = "LABEL=\"Metadata\"";

    return "<mets:amdSec ID=\"" . $id . $section . "\" >\n"
        # .0 fedora version number?
        . " <mets:techMD ID=\"" . $id . $section . ".0\">\n"
        . " <mets:mdWrap $label_attr MDTYPE=\"OTHER\" OTHERMDTYPE=\"gsdl3\" ID=\"${id}gsdl$section_num\">\n"
        . " <mets:xmlData>\n";
}
345
# Return the closing tags matching buffer_mets_amdSec_header, innermost
# element first.
sub buffer_mets_amdSec_footer
{
    my ($self) = @_;

    my @closing_tags = (
        " </mets:xmlData>\n",
        " </mets:mdWrap>\n",
        " </mets:techMD>\n",
        "</mets:amdSec>\n",
    );

    return join("", @closing_tags);
}
361
# Return the Dublin Core metadata of $section wrapped in an
# <oai_dc:dc> element that declares the dc and oai_dc namespaces.
# The metadata itself comes from get_dc_metadata().
sub oai_dc_metadata_xml
{
    my ($self, $doc_obj, $section) = @_;

    my $dc_namespace = "xmlns:dc=\"http://purl.org/dc/elements/1.1/\""
        . " xmlns:oai_dc=\"http://www.openarchives.org/OAI/2.0/oai_dc/\" ";

    my $dc_xml = " <oai_dc:dc $dc_namespace>\n";
    $dc_xml .= $self->get_dc_metadata($doc_obj, $section, "oai_dc");
    $dc_xml .= " </oai_dc:dc>\n";

    return $dc_xml;
}
380
381
382
383
384
385# Work out what the metadata set prefixes (dc, dls etc.) are for
386# this document
387
# Collect the metadata set prefixes used by the metadata of $section.
#
# The prefix of a metadata name is everything before its first ".".
# Names without a "." are counted under "ex"; "dc" is left out because
# Dublin Core is handled separately elsewhere.
#
# Returns a hash ref whose keys are the prefixes (empty if the section
# cannot be found).
sub metadata_set_prefixes
{
    my ($self, $doc_obj, $section) = @_;

    $section = "" if !defined $section;

    my $section_ptr = $doc_obj->_lookup_section($section);
    if (!defined $section_ptr) {
        return {};
    }

    my %seen_prefix;

    for my $entry (@{$section_ptr->{'metadata'}}) {
        if ($entry->[0] =~ m/^(.*?)\./) {
            my $prefix = $1;
            $seen_prefix{$prefix} = 1 unless $prefix eq "dc";
        }
        else {
            $seen_prefix{"ex"} = 1;
        }
    }

    return \%seen_prefix;
}
419
420
# Return the metadata of $section that belongs to the metadata set
# $mds_prefix, wrapped in a <prefix:prefix> element carrying the
# $namespace declaration.
#
# Metadata names without a "." count as belonging to the "ex" set.  A
# name of the form Title^en becomes name="Title" qualifier="en".
# Values are passed through docprint::escape_text.  Returns "" if the
# section cannot be found.
sub mds_metadata_xml
{
    my ($self, $doc_obj, $section, $mds_prefix, $namespace) = @_;

    $section = "" if !defined $section;

    my $section_ptr = $doc_obj->_lookup_section($section);
    if (!defined $section_ptr) {
        return "";
    }

    my $mds_xml = " <${mds_prefix}:${mds_prefix} $namespace>\n";

    for my $entry (@{$section_ptr->{'metadata'}}) {
        next unless $entry->[0] =~ m/^(?:(.*?)\.)?(.*)$/;

        my ($entry_prefix, $full_element) = ($1, $2);
        $entry_prefix = "ex" if !defined $entry_prefix;

        # only metadata from the requested set
        next if $entry_prefix ne $mds_prefix;

        # split up full element in the form Title^en into element=Title, attr="en"
        my ($element, $qualifier) = ($full_element =~ m/^(.*?)(?:\^(.*))?$/);
        my $qualifier_attr = "";
        if (defined $qualifier) {
            $qualifier_attr = "qualifier=\"$qualifier\"";
        }

        my $escaped_value = &docprint::escape_text($entry->[1]);

        $mds_xml .= " <${mds_prefix}:metadata name=\"$element\" $qualifier_attr>"
            . $escaped_value
            . "</${mds_prefix}:metadata>\n";
    }

    $mds_xml .= " </${mds_prefix}:${mds_prefix}>\n";

    # Remove control characters (tab included); newline and carriage
    # return are kept
    $mds_xml =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;

    return $mds_xml;
}
465
466
467
# Return the metadata sections for $section and, recursively, for all
# of its subsections.
#
# For each section a Dublin Core block (ID prefix "DC") comes first,
# followed by one block per other metadata set used in the section.
# The sets are written in sorted order so that the generated XML is
# the same from one run to the next.  Returns "" if the section cannot
# be found.
sub buffer_mets_dmdSection_section_xml
{
    my $self = shift(@_);
    my ($doc_obj,$section) = @_;

    $section="" unless defined $section;

    my $section_ptr=$doc_obj->_lookup_section($section);
    return "" unless defined $section_ptr;

    my $all_text = "";

    $all_text .= $self->buffer_mets_amdSec_header($section,"DC");
    $all_text .= $self->oai_dc_metadata_xml($doc_obj,$section);
    $all_text .= $self->buffer_mets_amdSec_footer($section);

    # for each metadata set
    my $md_sets = $self->metadata_set_prefixes($doc_obj,$section);

    # Hash key order is not stable between runs, so sort the set names
    foreach my $md_set (sort keys %$md_sets)
    {
        # Greenstone's agnostic approach to metadata sets conflicts with
        # Fedoras more clinically prescribed one. Fake a namespace for
        # each $md_set to keep both sides happy

        my $fake_namespace
            = "xmlns:$md_set=\"http://www.greenstone.org/namespace/fake/$md_set\"";

        # Upper-case the set name for use as the ID prefix.  tr takes
        # plain character ranges; square brackets are not needed.
        my $id_caps = $md_set;
        $id_caps =~ tr/a-z/A-Z/;

        $all_text .= $self->buffer_mets_amdSec_header($section,$id_caps);
        $all_text .= $self->mds_metadata_xml($doc_obj,$section,$md_set,$fake_namespace);
        $all_text .= $self->buffer_mets_amdSec_footer($section);
    }

    foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
        $all_text .= $self->buffer_mets_dmdSection_section_xml($doc_obj,"$section.$subsection");
    }

    # Remove control characters (tab included); newline and carriage
    # return are kept
    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;

    return $all_text;
}
512
513
# Work out the xlink:href value for a file written to $working_dir.
#
#   $fname       - file name relative to $working_dir (e.g. "doctoc.xml")
#   $working_dir - directory the file was written to
#
# Without FEDORA_HOME in the environment the result is "file:$fname".
# Otherwise the Greenstone install prefix is removed from the front of
# the file path and the remainder is turned into a URL of the form
# http://<FEDORA_HOSTNAME>:<FEDORA_SERVER_PORT>/gsdl/<relative path>.
sub doctxt_to_xlink
{
    my $self = shift @_;
    my ($fname,$working_dir) = @_;

    if (!defined $ENV{'FEDORA_HOME'}) {
        return "file:$fname";
    }

    my $collectparent;
    if (defined $ENV{'GSDL3SRCHOME'}) { # we're dealing with a GS3 server
        if (defined $ENV{'GSDL3HOME'}) { # in case the web directory is located in a separate place
            $collectparent = &util::filename_cat($ENV{'GSDL3HOME'},"sites","localsite");
        }
        else { # try the default location for the web directory
            $collectparent = &util::filename_cat($ENV{'GSDL3SRCHOME'},"web","sites","localsite");
        }
    }
    else {
        # greenstone 2
        $collectparent = $ENV{'GSDLHOME'};
    }
    $collectparent = "" unless defined $collectparent;

    my $gsdl_href = &util::filename_cat($working_dir, $fname);

    # Remove the collectparent path and any slash straight after it.
    # The path is quoted with \Q...\E so that it is matched literally:
    # directory names may contain regex metacharacters such as
    # "(", ")", "+" or ".", as well as Windows backslashes.
    $gsdl_href =~ s/^\Q$collectparent\E(?:\/|\\)?//;
    $gsdl_href =~ s/\\/\//g; # make sure we have url paths (which only use / not \)
    $gsdl_href = "/gsdl/$gsdl_href"; # prepend gsdl

    my $fserver = $ENV{'FEDORA_HOSTNAME'};
    my $fport = $ENV{'FEDORA_SERVER_PORT'};

    my $fdomain = "http://$fserver:$fport";

    return "$fdomain$gsdl_href";
}
558
559
# Return the fileGrp that points at the table of contents file
# (doctoc.xml) of the document.  The link itself is worked out by
# doctxt_to_xlink().
sub buffer_mets_fileSection_toc
{
    my ($self, $doc_obj, $section, $working_dir) = @_;

    my $opt_attr = "OWNERID=\"M\"";
    my $xlink = $self->doctxt_to_xlink("doctoc.xml", $working_dir);

    my $toc_xml = " <mets:fileGrp ID=\"TOC\">\n";
    $toc_xml .= " <mets:file MIMETYPE=\"text/xml\" ID=\"FILETOC\" $opt_attr >\n";
    $toc_xml .= " <mets:FLocat LOCTYPE=\"URL\" xlink:href=\"$xlink\""
        . " xlink:title=\"Table of Contents\"/>\n";
    $toc_xml .= " </mets:file>\n";
    $toc_xml .= " </mets:fileGrp>\n";

    return $toc_xml;
}
579
580
# Return the per-section fileSection XML.  This hands over to the
# parent class implementation, asking for split text files, the
# OWNERID="M" attribute and the "SECTION" id prefix.
sub buffer_mets_fileSection_section_xml
{
    my ($self, $doc_obj, $section, $working_dir) = @_;

    my $is_txt_split = 1;
    my $opt_owner_id = "OWNERID=\"M\"";
    my $id_prefix    = "SECTION";

    my $section_xml = $self->SUPER::buffer_mets_fileSection_section_xml(
        $doc_obj, $section, $working_dir,
        $is_txt_split, $opt_owner_id, $id_prefix);

    return $section_xml;
}
595
# Return one fileGrp per associated file (gsdlassocfile metadata) of
# $section.
#
# A gsdlassocfile value has the form "<file>:<mime type>:<directory>".
# The first associated file gets the group ID "url"; every later one
# gets "FG<file>" with "/" turned into "_".
#
# Without FEDORA_HOME in the environment the link is the path of the
# file relative to the document.  Otherwise the Greenstone install
# prefix is removed from the front of the file path and the remainder
# is turned into a URL of the form
# http://<FEDORA_HOSTNAME>:<FEDORA_SERVER_PORT>/gsdl/<relative path>.
#
# Returns "" if the section cannot be found.
sub buffer_mets_fileWhole_section_xml
{
    my $self = shift(@_);
    my ($doc_obj,$section,$working_dir) = @_;

    my $section_ptr = $doc_obj->_lookup_section($section);
    return "" unless defined $section_ptr;

    my $all_text = "";

    my $opt_owner_id = "OWNERID=\"M\"";
    my $first_assocfile = 1;

    # The link settings depend only on the environment, so work them
    # out once rather than for every associated file.
    my $use_fedora_urls = defined $ENV{'FEDORA_HOME'};
    my $collectparent = "";
    my $fdomain = "";
    if ($use_fedora_urls) {
        if (defined $ENV{'GSDL3SRCHOME'}) { # we're dealing with a GS3 server
            if (defined $ENV{'GSDL3HOME'}) { # in case the web directory is located in a separate place
                $collectparent = &util::filename_cat($ENV{'GSDL3HOME'},"sites","localsite");
            }
            else { # try the default location for the web directory
                $collectparent = &util::filename_cat($ENV{'GSDL3SRCHOME'},"web","sites","localsite");
            }
        }
        else {
            # greenstone 2
            $collectparent = $ENV{'GSDLHOME'};
        }
        $collectparent = "" unless defined $collectparent;

        my $fserver = $ENV{'FEDORA_HOSTNAME'};
        my $fport = $ENV{'FEDORA_SERVER_PORT'};
        $fdomain = "http://$fserver:$fport";
    }

    # Output the fileSection for the whole section
    # => get the sourcefile and associative file
    foreach my $data (@{$section_ptr->{'metadata'}}) {
        next unless ($data->[0] eq "gsdlassocfile");

        my $escaped_value = &docprint::escape_text($data->[1]);

        # Skip values that are not of the form file:mimetype:dir.  The
        # capture variables would otherwise hold leftovers from an
        # earlier match.
        my ($assoc_file, $mime_type, $assoc_dir)
            = ($escaped_value =~ m/^(.*?):(.*):(.*)$/);
        next unless defined $assoc_file;

        my $id_root;
        if ($first_assocfile) {
            $id_root = "url";
            $first_assocfile = 0;
        }
        else {
            $id_root = "FG$assoc_file";
        }
        $id_root =~ s/\//_/g;

        $all_text .= " <mets:fileGrp ID=\"$id_root\">\n";

        # The assoc_file's name may be url-encoded, so the xlink_href in the <mets:FLocat>
        # element must be the url to this (possibly url-encoded) filename
        my $assocfile_url = &unicode::filename_to_url($assoc_file);
        my $assfilePath = ($assoc_dir eq "") ? $assocfile_url : "$assoc_dir/$assocfile_url";

        my $mime_attr = "MIMETYPE=\"$mime_type\"";
        my $xlink_title = "xlink:title=\"$assoc_file\"";
        my $id_attr = "ID=\"F$id_root.0\"";

        my $xlink_href;
        if (!$use_fedora_urls) {
            $xlink_href = "xlink:href=\"$assfilePath\"";
        }
        else {
            my $gsdl_href = &util::filename_cat($working_dir,$assfilePath);

            # Remove the collectparent path and any slash straight
            # after it.  The path is quoted with \Q...\E so that it is
            # matched literally: directory names may contain regex
            # metacharacters as well as Windows backslashes.
            $gsdl_href =~ s/^\Q$collectparent\E(?:\/|\\)?//;
            $gsdl_href =~ s/\\/\//g; # make sure we have url paths (which only use / not \)
            $gsdl_href = "/gsdl/$gsdl_href"; # prepend gsdl

            $xlink_href = "xlink:href=\"$fdomain$gsdl_href\"";
        }

        $all_text .= " <mets:file $mime_attr $id_attr $opt_owner_id >\n";
        $all_text .= " <mets:FLocat LOCTYPE=\"URL\" $xlink_href $xlink_title />\n";
        $all_text .= " </mets:file>\n";
        $all_text .= " </mets:fileGrp>\n";
    }

    # Remove control characters (tab included); newline and carriage
    # return are kept
    $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;

    return $all_text;
}
703
704
7051;
Note: See TracBrowser for help on using the repository browser.