source: main/trunk/greenstone2/perllib/plugouts/METSPlugout.pm@ 22839

Last change on this file since 22839 was 22839, checked in by davidb, 14 years ago

More explicit use of utf8 for input and output file handling. Relies on strings in Perl being Unicode aware (and not merely binary bytes) otherwise binary bytes will then be incorrectly re-incoded as UTF-8 (which is not what you want as they already are in UTF-8 form).

  • Property svn:keywords set to Author Date Id Revision
File size: 11.9 KB
Line 
1###########################################################################
2#
3# METSPlugout.pm -- the plugout module for METS archives
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package METSPlugout;
27
28use strict;
29no strict 'subs';
30no strict 'refs';
31
32use gsprintf 'gsprintf';
33
34eval {require bytes};
35use util;
36use BasePlugout;
37use docprint; # for escape_text
38
39sub BEGIN {
40 @METSPlugout::ISA = ('BasePlugout');
41}
42
43my $arguments = [
44 { 'name' => "xslt_txt",
45 'desc' => "{METSPlugout.xslt_txt}",
46 'type' => "string",
47 'reqd' => "no",
48 'hiddengli' => "no"},
49 { 'name' => "xslt_mets",
50 'desc' => "{METSPlugout.xslt_mets}",
51 'type' => "string",
52 'reqd' => "no",
53 'hiddengli' => "no"}
54 ];
55
56my $options = { 'name' => "METSPlugout",
57 'desc' => "{METSPlugout.desc}",
58 'abstract' => "yes",
59 'inherits' => "yes",
60 'args' => $arguments
61 };
62
63sub new {
64 my ($class) = shift (@_);
65 my ($plugoutlist, $inputargs,$hashArgOptLists) = @_;
66 push(@$plugoutlist, $class);
67
68
69 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
70 push(@{$hashArgOptLists->{"OptList"}},$options);
71
72 my $self = new BasePlugout($plugoutlist,$inputargs,$hashArgOptLists);
73 return bless $self, $class;
74}
75
76
77sub saveas_doctxt
78{
79 my $self = shift (@_);
80 my ($doc_obj,$working_dir) = @_;
81
82 my $is_recursive = 1;
83
84 my $doc_txt_file = &util::filename_cat ($working_dir,"doctxt.xml");
85
86 $self->open_xslt_pipe($doc_txt_file,$self->{'xslt_txt'});
87
88 my $outhandler;
89
90 if (defined $self->{'xslt_writer'}){
91 $outhandler = $self->{'xslt_writer'};
92 }
93 else{
94 $outhandler = $self->get_output_handler($doc_txt_file);
95 }
96
97 binmode($outhandler,":utf8");
98
99 $self->output_xml_header($outhandler);
100 my $section = $doc_obj->get_top_section();
101 $self->output_txt_section($outhandler,$doc_obj, $section, $is_recursive);
102 $self->output_xml_footer($outhandler);
103
104
105 if (defined $self->{'xslt_writer'}){
106 $self->close_xslt_pipe();
107 }
108 else{
109 close($outhandler);
110 }
111
112}
113
114sub saveas_docmets
115{
116 my $self = shift (@_);
117 my ($doc_obj,$working_dir) = @_;
118
119 my $doc_mets_file = &util::filename_cat ($working_dir, "docmets.xml");
120
121 my $doc_title = $doc_obj->get_metadata_element($doc_obj->get_top_section(),"dc.Title");
122 if (!defined $doc_title) {
123 $doc_title = $doc_obj->get_metadata_element($doc_obj->get_top_section(),"Title");
124 }
125
126 $self->open_xslt_pipe($doc_mets_file,$self->{'xslt_mets'});
127
128 my $outhandler;
129
130 if (defined $self->{'xslt_writer'}){
131 $outhandler = $self->{'xslt_writer'};
132 }
133 else{
134 $outhandler = $self->get_output_handler($doc_mets_file);
135 }
136
137 binmode($outhandler,":utf8");
138
139 $self->output_mets_xml_header($outhandler, $doc_obj->get_OID(), $doc_title);
140 $self->output_mets_section($outhandler, $doc_obj, $doc_obj->get_top_section(),$working_dir);
141 $self->output_mets_xml_footer($outhandler);
142
143 if (defined $self->{'xslt_writer'}){
144 $self->close_xslt_pipe();
145 }
146 else{
147 close($outhandler);
148 }
149
150
151}
152
153sub saveas
154{
155 my $self = shift (@_);
156 my ($doc_obj,$doc_dir) = @_;
157
158 $self->process_assoc_files ($doc_obj, $doc_dir, '');
159
160 $self->process_metafiles_metadata ($doc_obj);
161
162 my $output_dir = $self->get_output_dir();
163 &util::mk_all_dir ($output_dir) unless -e $output_dir;
164
165 my $working_dir = &util::filename_cat ($output_dir, $doc_dir);
166
167 &util::mk_all_dir ($working_dir) unless -e $working_dir;
168
169 ###
170 # Save the text as a filefile
171 ###
172 $self->saveas_doctxt($doc_obj,$working_dir);
173
174 ###
175 # Save the structure and metadata as a METS file
176 ###
177 $self->saveas_docmets($doc_obj,$working_dir);
178
179 $self->{'short_doc_file'} = &util::filename_cat ($doc_dir, "docmets.xml");
180
181 $self->store_output_info_reference($doc_obj);
182
183}
184
185
186sub output_mets_xml_header
187{
188 my $self = shift(@_);
189 my ($handle, $OID, $doc_title) = @_;
190
191 gsprintf(STDERR, "METSPlugout::output_mets_xml_header {common.must_be_implemented}\n") && die "\n";
192}
193
194sub output_mets_xml_header_extra_attribute
195{
196 my $self = shift(@_);
197 my ($handle, $extra_attr, $extra_schema) = @_;
198
199 print $handle '<?xml version="1.0" encoding="UTF-8" standalone="no"?>' . "\n";
200 print $handle '<mets:mets xmlns:mets="http://www.loc.gov/METS/"' . "\n";
201 print $handle ' xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"' . "\n";
202 print $handle ' xmlns:gsdl3="http://www.greenstone.org/namespace/gsdlmetadata/1.0/"' . "\n";
203 if (defined ($ENV{'FEDORA_VERSION'}) && $ENV{'FEDORA_VERSION'} =~ m/^2/) { # checking if major version is 2
204 print $handle ' xmlns:xlink="http://www.w3.org/TR/xlink"' ."\n";
205 }
206 else {
207 print $handle ' xmlns:xlink="http://www.w3.org/1999/xlink"' ."\n";
208 }
209 print $handle ' xsi:schemaLocation="http://www.loc.gov/METS/' . "\n";
210 print $handle ' http://www.loc.gov/standards/mets/mets.xsd' . "\n";
211 print $handle " $extra_schema\n" if (defined $extra_schema);
212 print $handle ' http://www.greenstone.org/namespace/gsdlmetadata/1.0/' . "\n";
213 print $handle ' http://www.greenstone.org/namespace/gsdlmetadata/1.0/gsdl_metadata.xsd"' . "\n";
214
215 print $handle " $extra_attr>\n";
216
217}
218
219sub output_mets_xml_footer
220{
221 my $self = shift(@_);
222 my ($handle) = @_;
223 print $handle '</mets:mets>' . "\n";
224}
225
226# print out doctxt.xml file
227sub output_txt_section {
228 my $self = shift (@_);
229 my ($handle, $doc_obj, $section, $is_recursive) = @_;
230
231 print $handle $self->buffer_txt_section_xml($doc_obj, $section, $is_recursive);
232}
233
234sub buffer_txt_section_xml {
235 my $self = shift(@_);
236 my ($doc_obj, $section, $is_recursive) = @_;
237
238 my $section_ptr = $doc_obj->_lookup_section ($section);
239
240 return "" unless defined $section_ptr;
241
242 my $all_text = "<Section>\n";
243 $all_text .= &docprint::escape_text("$section_ptr->{'text'}");
244
245 if (defined $is_recursive && $is_recursive)
246 {
247 # Output all the subsections
248 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
249 $all_text .= $self->buffer_txt_section_xml($doc_obj, "$section.$subsection", $is_recursive);
250 }
251 }
252
253 $all_text .= "</Section>\n";
254
255
256 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
257 return $all_text;
258}
259
260#
261# Print out docmets.xml file
262#
263sub output_mets_section
264{
265 my $self = shift(@_);
266 my ($handle, $doc_obj, $section, $working_dir) = @_;
267
268 gsprintf(STDERR, "METSPlugout::output_mets_section {common.must_be_implemented}\n") && die "\n";
269
270}
271
272
273sub buffer_mets_dmdSection_section_xml
274{
275 my $self = shift(@_);
276 my ($doc_obj,$section) = @_;
277
278 gsprintf(STDERR, "METSPlugout::buffer_mets_dmdSection_section_xml {common.must_be_implemented}\n") && die "\n";
279}
280
281sub buffer_mets_StructMapSection_section_xml
282{
283 my $self = shift(@_);
284 my ($doc_obj,$section, $order_numref, $fileid_base) = @_;
285
286 $section="" unless defined $section;
287
288
289 my $section_ptr=$doc_obj->_lookup_section($section);
290 return "" unless defined $section_ptr;
291
292 $fileid_base = "FILEGROUP_PRELUDE" unless defined $fileid_base;
293
294 # output fileSection by Sections
295 my $section_num ="1". $section;
296 my $dmd_num = $section_num;
297
298 #**output the StructMap details
299
300 my $dmdid_attr = "DM$dmd_num";
301
302 my $all_text = " <mets:div ID=\"DS$section_num\" TYPE=\"Section\" \n";
303 $all_text .= ' ORDER="'.$$order_numref++.'" ORDERLABEL="'. $section_num .'" '."\n";
304 $all_text .= " LABEL=\"$section_num\" DMDID=\"$dmdid_attr\">\n";
305
306 $all_text .= ' <mets:fptr FILEID="'.$fileid_base.$section_num.'" />'. "\n";
307
308
309 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
310 $all_text .= $self->buffer_mets_StructMapSection_section_xml($doc_obj,"$section.$subsection", $order_numref, $fileid_base);
311 }
312
313 $all_text .= " </mets:div>\n";
314
315 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
316
317 return $all_text;
318}
319
320
321sub buffer_mets_StructMapWhole_section_xml
322{
323 my $self = shift(@_);
324 my ($doc_obj,$section) = @_;
325
326 my $section_ptr = $doc_obj->_lookup_section($section);
327 return "" unless defined $section_ptr;
328
329 my $all_text="";
330 my $fileID=0;
331 my $order_num = 0;
332
333 $all_text .= ' <mets:div ID="DSAll" TYPE="Document" ORDER="'.$order_num.'" ORDERLABEL="All" LABEL="Whole Documemt" DMDID="DM1">' . "\n";
334
335 #** output the StructMapSection for the whole section
336 # get the sourcefile and associative file
337
338 foreach my $data (@{$section_ptr->{'metadata'}}){
339 my $escaped_value = &docprint::escape_text($data->[1]);
340
341 if ($data->[0] eq "gsdlsourcefilename") {
342 ++$fileID;
343 $all_text .= ' <mets:fptr FILEID="default.'.$fileID.'" />'."\n";
344 }
345
346 if ($data->[0] eq "gsdlassocfile"){
347 ++$fileID;
348 $all_text .= ' <mets:fptr FILEID="default.'.$fileID. '" />'. "\n";
349 }
350 }
351 $all_text .= " </mets:div>\n";
352
353 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
354
355 return $all_text;
356}
357
358
359
360sub doctxt_to_xlink
361{
362 my $self = shift @_;
363 my ($fname,$working_dir) = @_;
364
365 gsprintf(STDERR, "METSPlugout::doxtxt_to_xlink {common.must_be_implemented}\n") && die "\n";
366}
367
368sub buffer_mets_fileSection_section_xml
369{
370 my $self = shift(@_);
371 my ($doc_obj,$section,$working_dir, $is_txt_split,$opt_attr,$fileid_base) = @_;
372
373 #$section="" unless defined $section;
374
375 my $section_ptr=$doc_obj->_lookup_section($section);
376 return "" unless defined $section_ptr;
377
378 $fileid_base = "FILEGROUP_PRELUDE" unless defined $fileid_base;
379
380 # output fileSection by sections
381 my $section_num ="1". $section;
382
383 $opt_attr = "" unless defined $opt_attr;
384
385 # output the fileSection details
386 my $all_text = ' <mets:fileGrp ID="'.$fileid_base.$section_num . '">'. "\n";
387 $all_text .= " <mets:file MIMETYPE=\"text/xml\" ID=\"FILE$section_num\" $opt_attr >\n";
388
389 my $xlink;
390 if (defined $is_txt_split && $is_txt_split)
391 {
392 my $section_fnum ="1". $section;
393 $section_fnum =~ s/\./_/g;
394
395 $xlink = $self->doctxt_to_xlink("doctxt$section_fnum.xml",$working_dir);
396 }
397 else
398 {
399 $xlink = $self->doctxt_to_xlink("doctxt.xml",$working_dir);
400
401 $xlink .= '#xpointer(/Section[';
402
403 my $xpath = "1".$section;
404 $xpath =~ s/\./\]\/Section\[/g;
405
406 $xlink .= $xpath;
407
408 $xlink .= ']/text())';
409 }
410
411
412
413 $all_text .= ' <mets:FLocat LOCTYPE="URL" xlink:href="'.$xlink.'"';
414
415 $all_text .= ' xlink:title="Hierarchical Document Structure"/>' . "\n";
416 $all_text .= " </mets:file>\n";
417 $all_text .= " </mets:fileGrp>\n";
418
419
420 foreach my $subsection (@{$section_ptr->{'subsection_order'}}){
421 $all_text .= $self->buffer_mets_fileSection_section_xml($doc_obj,"$section.$subsection",$working_dir, $is_txt_split, $opt_attr, $fileid_base);
422 }
423
424 $all_text =~ s/[\x00-\x09\x0B\x0C\x0E-\x1F]//g;
425
426 return $all_text;
427}
428
429sub buffer_mets_fileWhole_section_xml
430{
431 my $self = shift(@_);
432 my ($doc_obj,$section,$working_dir) = @_;
433
434 gsprintf(STDERR, "METSPlugout::buffer_mets_fileWhole_section_xml {common.must_be_implemented}\n") && die "\n";
435
436}
437
4381;
Note: See TracBrowser for help on using the repository browser.