source: main/trunk/greenstone2/perllib/plugouts/DSpacePlugout.pm@ 24402

Last change on this file since 24402 was 24402, checked in by ak19, 13 years ago

DSpacePlugout modified to make use of embedded dc (ex.dc) metadata for those metadata fields for which dc.meta was not assigned. Tested with DSpace example re-exported as DSpace.

  • Property svn:keywords set to Author Date Id Revision
File size: 10.3 KB
RevLine 
[12330]1###########################################################################
2#
3# DSpacePlugout.pm -- the plugout module for DSpace archives
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2006 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26package DSpacePlugout;
27
28use strict;
29no strict 'refs';
[23824]30use utf8;
[12330]31eval {require bytes};
32use util;
[17203]33use BasePlugout;
[12330]34
35sub BEGIN {
[17203]36 @DSpacePlugout::ISA = ('BasePlugout');
[12330]37}
38
[23824]39my $arguments = [
40 { 'name' => "metadata_prefix",
41 'desc' => "{DSpacePlugout.metadata_prefix}",
42 'type' => "string",
43 'reqd' => "no",
44 'hiddengli' => "no"} ];
[12330]45
[23824]46
[12330]47my $options = { 'name' => "DSpacePlugout",
48 'desc' => "{DSpacePlugout.desc}",
49 'abstract' => "no",
[23824]50 'inherits' => "yes",
51 'args' => $arguments };
[12330]52
53sub new {
54 my ($class) = shift (@_);
55 my ($plugoutlist, $inputargs,$hashArgOptLists) = @_;
56 push(@$plugoutlist, $class);
57
[17203]58 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
59 push(@{$hashArgOptLists->{"OptList"}},$options);
[12330]60
[17203]61 my $self = new BasePlugout($plugoutlist,$inputargs,$hashArgOptLists);
[23824]62
[24402]63# print STDERR "***** metadata prefix = \"", $self->{'metadata_prefix'}, "\"\n";
[23824]64
[17203]65 return bless $self, $class;
[12330]66}
67
[23824]68sub saveas_dspace_metadata
69{
70 my $self = shift (@_);
71 my ($doc_obj,$working_dir,$metadata_file,$docroot,$metadata_prefix) = @_;
72
73 # my $docroot_attributes = ($metadata_prefix eq "dc") ? undef : "schema=\"$metadata_prefix\"";
74 my $docroot_attributes = "schema=\"$metadata_prefix\"";
75
76 my $doc_dc_file = &util::filename_cat ($working_dir, $metadata_file);
77 $self->open_xslt_pipe($doc_dc_file,$self->{'xslt_file'});
78
79 my $outhandler;
80 if (defined $self->{'xslt_writer'}){
81 $outhandler = $self->{'xslt_writer'};
82 }
83 else{
84 $outhandler = $self->get_output_handler($doc_dc_file);
85 }
86
87 $self->output_general_xml_header($outhandler, $docroot, $docroot_attributes);
88
89 my $metadata_hashmap = $doc_obj->get_metadata_hashmap($doc_obj->get_top_section(),
90 $metadata_prefix);
91
[24402]92 if(defined $metadata_prefix && $metadata_prefix ne "") {
93 # merge dc with any ex.dc
94
95 my $ex_dc_metadata_hashmap = $doc_obj->get_metadata_hashmap($doc_obj->get_top_section(),
96 "ex.dc");
97
98 foreach my $metaname (keys %$ex_dc_metadata_hashmap) {
99 my $metaname_without_ex_prefix = $metaname;
100 $metaname_without_ex_prefix =~ s/^ex\.(.*)/$1/; # remove any ex from the ex.dc prefix
101
102 # if there's an ex.dc value for a metaname for which no dc
103 # value was assigned, put the ex.dc value into the hashmap
104 if(!defined $metadata_hashmap->{$metaname_without_ex_prefix}) {
105 $metadata_hashmap->{$metaname_without_ex_prefix} = [];
106 push(@{$metadata_hashmap->{$metaname_without_ex_prefix}},@{$ex_dc_metadata_hashmap->{$metaname}});
107 }
108 }
109
110 }
111
[23824]112 foreach my $metaname (keys %$metadata_hashmap) {
113 my $metavals = $metadata_hashmap->{$metaname};
114
115 my $qualifier = undef;
116 my $element;
117 if ($metaname =~ m/^(.*?)\^(.*?)$/) {
118 $element = $1;
119 $qualifier = $2;
120 $qualifier = lc($qualifier);
121 }
122 else {
123 $element = $metaname;
124 }
125 $element =~ s/^.*?\.//;
126 $element = lc($element);
127
128 foreach my $metaval (@$metavals) {
129
130 #if element is empty then no need to export it.
131
132 if ($metaval =~ /\S/) {
133 print $outhandler " <dcvalue element=\"$element\"";
134 #If no qualifier then add qualifier="none"
135 #print $outhandler " qualifier=\"$qualifier\"" if defined $qualifier;
136 if (defined $qualifier) {
137 print $outhandler " qualifier=\"$qualifier\"" ;
138 }
139 else {
140 print $outhandler " qualifier=\"none\" language=\"\"" ;
141 }
142 print $outhandler ">$metaval";
143 print $outhandler "</dcvalue>\n";
144 }
145
146
147
148 }
149 }
150
151 $self->output_general_xml_footer($outhandler,$docroot);
152
153 if (defined $self->{'xslt_writer'}){
154 $self->close_xslt_pipe();
155 }
156 else{
157 close($outhandler);
158 }
159
160}
161
[12330]162sub saveas {
163 my $self = shift (@_);
164 my ($doc_obj,$doc_dir) = @_;
165
166 my $output_dir = $self->get_output_dir();
167 &util::mk_all_dir ($output_dir) unless -e $output_dir;
168
169 my $working_dir = &util::filename_cat ($output_dir, $doc_dir);
170 &util::mk_all_dir ($working_dir, $doc_dir);
171
172 #########################
173 # save the handle file
174 #########################
175 my $outhandle = $self->{'output_handle'};
176
[23824]177 my $generate_handle = 0;
178 if ($generate_handle) {
179 # Genereate handle file
180 # (Note: this section of code would benefit from being restructured)
181 my $doc_handle_file = &util::filename_cat ($working_dir, "handle");
[12330]182
[23824]183 my $env_hp = $ENV{'DSPACE_HANDLE_PREFIX'};
184 my $handle_prefix = (defined $env_hp) ? $env_hp : "123456789";
[12330]185
[23824]186 my $outhandler = $self->get_output_handler($doc_handle_file);
[12330]187
[23824]188 my ($handle) = ($doc_dir =~ m/^(.*)(:?\.dir)?$/);
[12330]189
[23824]190 print $outhandler "$handle_prefix/$handle\n";
[12330]191
[23824]192 close ($outhandler);
193 }
194
[12330]195 #########################
196 # save the content file
197 #########################
198 my $doc_contents_file = &util::filename_cat ($working_dir, "contents");
199
[23824]200 my $outhandler = $self->get_output_handler($doc_contents_file);
[12330]201
202 $self->process_assoc_files ($doc_obj, $doc_dir, $outhandler);
203
[20642]204 $self->process_metafiles_metadata ($doc_obj);
205
[12330]206 close($outhandler);
207
[23824]208 #############################
[12330]209 # save the dublin_core.xml file
210 ###############################
[23824]211# my $doc_dc_file = &util::filename_cat ($working_dir, "dublin_core.xml");
212# $self->open_xslt_pipe($doc_dc_file,$self->{'xslt_file'});
[12330]213
[23824]214# if (defined $self->{'xslt_writer'}){
215# $outhandler = $self->{'xslt_writer'};
216# }
217# else{
218# $outhandler = $self->get_output_handler($doc_dc_file);
219# }
[12330]220
[23824]221# $self->output_general_xml_header($outhandler, "dublin_core");
[13172]222
[23824]223# my $all_text = $self->get_dc_metadata($doc_obj, $doc_obj->get_top_section());
224# print $outhandler $all_text;
[13172]225
[23824]226# $self->output_general_xml_footer($outhandler,"dublin_core");
[12330]227
[23824]228# if (defined $self->{'xslt_writer'}){
229# $self->close_xslt_pipe();
230# }
231# else{
232# close($outhandler);
233# }
234
235 $self->saveas_dspace_metadata($doc_obj,$working_dir,
236 "dublin_core.xml","dublin_core","dc");
237
238 my $metadata_prefix_list = $self->{'metadata_prefix'};
[24402]239# print STDERR "***!! md prefix = $metadata_prefix_list\n";
[23824]240
241 my @metadata_prefixes = split(/,\s*/,$metadata_prefix_list);
242 foreach my $ep (@metadata_prefixes) {
243 $self->saveas_dspace_metadata($doc_obj,$working_dir,
244 "metadata_$ep.xml","dublin_core",$ep);
[12330]245 }
[23824]246
[12330]247 $self->{'short_doc_file'} = &util::filename_cat ($doc_dir, "dublin_core.xml");
[12363]248 $self->store_output_info_reference($doc_obj);
[12330]249}
250
251 sub process_assoc_files {
252 my $self = shift (@_);
253 my ($doc_obj, $doc_dir, $handle) = @_;
254
255 my $outhandler = $self->{'output_handle'};
256
257 my $output_dir = $self->get_output_dir();
258 return if (!defined $output_dir);
259
260 my $working_dir = &util::filename_cat($output_dir, $doc_dir);
261
262 my @assoc_files = ();
263 my $filename;;
264
265 my $source_filename = $doc_obj->get_source_filename();
266
267 my $collect_dir = $ENV{'GSDLCOLLECTDIR'};
268
269 if (defined $collect_dir) {
270 my $dirsep_regexp = &util::get_os_dirsep();
271
272 if ($collect_dir !~ /$dirsep_regexp$/) {
273 $collect_dir .= &util::get_dirsep(); # ensure there is a slash at the end
274 }
275
276 # This test is never going to fail on Windows -- is this a problem?
277 if ($source_filename !~ /^$dirsep_regexp/) {
278 $source_filename = &util::filename_cat($collect_dir, $source_filename);
279 }
280 }
281
[20903]282 my ($tail_filename) = ($source_filename =~ m/([^\/\\]*)$/);
[12330]283
284 print $handle "$tail_filename\n";
285
286 $filename = &util::filename_cat($working_dir, $tail_filename);
[18463]287 &util::hard_link ($source_filename, $filename, $self->{'verbosity'});
[12330]288
289 # set the assocfile path (even if we have no assoc files - need this for lucene)
290 $doc_obj->set_utf8_metadata_element ($doc_obj->get_top_section(),
291 "assocfilepath",
292 "$doc_dir");
293 foreach my $assoc_file_rec (@{$doc_obj->get_assoc_files()}) {
294 my ($dir, $afile) = $assoc_file_rec->[1] =~ /^(.*?)([^\/\\]+)$/;
295 $dir = "" unless defined $dir;
296
297
298 my $real_filename = $assoc_file_rec->[0];
299 # for some reasons the image associate file has / before the full path
300 $real_filename =~ s/^\\(.*)/$1/i;
301 if (-e $real_filename) {
[22921]302 # escape backslashes in path for upcoming regex match
303 my $escaped_source_filename = $source_filename;
304 $escaped_source_filename =~ s/\\/\\\\/g;
305 if ($real_filename =~ m/$escaped_source_filename$/) {
[12330]306 next;
307 }
308 else {
309 my $bundle = "bundle:ORIGINAL";
310
311 if ($afile =~ m/^thumbnail\./) {
312 $bundle = "bundle:THUMBNAIL";
313 }
314
[23824]315 # Store the associated file to the "contents" file. Cover.pdf not needed.
316 if ($afile ne "cover.jpg") {
317 print $handle "$assoc_file_rec->[1]\t$bundle\n";
[12330]318 }
[23824]319 }
[12330]320
321 $filename = &util::filename_cat($working_dir, $afile);
322
[23824]323 if ($afile ne "cover.jpg") {
324 &util::hard_link ($real_filename, $filename, $self->{'verbosity'});
325 $doc_obj->add_utf8_metadata ($doc_obj->get_top_section(),
326 "gsdlassocfile",
327 "$afile:$assoc_file_rec->[2]:$dir");
328 }
[12330]329 } elsif ($self->{'verbosity'} > 2) {
330 print $outhandler "DSpacePlugout::process couldn't copy the associated file " .
331 "$real_filename to $afile\n";
332 }
333 }
334}
335
336
337sub get_new_doc_dir{
338 my $self = shift (@_);
339 my($working_info,$working_dir,$OID) = @_;
340
341 return $OID;
342
343}
Note: See TracBrowser for help on using the repository browser.