source: main/trunk/greenstone2/perllib/plugins/DSpacePlugin.pm@ 24404

Last change on this file since 24404 was 24355, checked in by ak19, 13 years ago

More minor changes to do with the ex.dc.* metadata and embeddedmatadataplugin

  • Property svn:keywords set to Author Date Id Revision
File size: 12.8 KB
RevLine 
[10549]1
[8511]2###########################################################################
3#
[15872]4# DSpacePlugin.pm -- plugin for importing a collection from DSpace
[8511]5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
[15872]10# Copyright (C) 2004 New Zealand Digital Library Project
[8511]11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
[15872]28
[8511]29# This plugin takes "contents" and dublin_core.xml file, which contain
30# Metadata and lists of associated files for a particular document
31# and produces a document containing sections, one for each page.
32# The files should be named "contents" and "dublin_core.xml". For each of
33# document in DSpace, it is stored in one directory
34#
35# The format of the "contents" file is as follows:
36#
37# File.type bundle:ORIGINAL
38# license.txt bundle:LICENSE
39# The format of the "dublin_core.xml" file is as follows:
40# The first line contains any metadata for the whole document
41# <dublin_core>
42# eg.
43# <dcvalue element="Title" qualifier="">Snail farming</dcvalue>
44# <dcvalue element="date" qualifier="">2004-10-15</dcvalue>
45#
46
[15872]47package DSpacePlugin;
[8511]48
[19746]49use ReadTextFile;
[8511]50use plugin;
51use XMLParser;
[10254]52use strict;
53no strict 'refs'; # allow filehandles to be variables and viceversa
[8511]54
55sub BEGIN {
[19746]56 @DSpacePlugin::ISA = ('ReadTextFile');
[8511]57}
58
59my $arguments =
60 [ { 'name' => "process_exp",
[15872]61 'desc' => "{BasePlugin.process_exp}",
[8511]62 'type' => "string",
63 'deft' => &get_default_process_exp(),
64 'reqd' => "no" },
65 { 'name' => "only_first_doc",
[15872]66 'desc' => "{DSpacePlugin.only_first_doc}",
[8511]67 'type' => "flag",
68 'reqd' => "no" },
[9168]69 { 'name' => "first_inorder_ext",
[15872]70 'desc' => "{DSpacePlugin.first_inorder_ext}",
[8891]71 'type' => "string",
[8511]72 'reqd' => "no" },
73 { 'name' => "first_inorder_mime",
[15872]74 'desc' => "{DSpacePlugin.first_inorder_mime}",
[8511]75 'type' => "flag",
76 'reqd' => "no" },
77 { 'name' => "block_exp",
[15872]78 'desc' => "{BasePlugin.block_exp}",
[8891]79 'type' => "regexp",
[8511]80 'deft' => &get_default_block_exp(),
81 'reqd' => "no" }];
82
83
[15872]84my $options = { 'name' => "DSpacePlugin",
85 'desc' => "{DSpacePlugin.desc}",
[8511]86 'inherits' => "yes",
[11676]87 'abstract' => "no",
[8511]88 'args' => $arguments };
89
90
91my $primary_doc_lookup = { 'text/html' => '(?i)\.(gif|jpe?g|jpe|jpg|png|css)$' };
92
[8913]93# Important variation to regular plugin structure. Need to declare
94# $self as global variable to file so XMLParser callback routines
95# can access the content of the object.
96my ($self);
97
[8511]98sub new {
[10218]99 my ($class) = shift (@_);
100 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
101 push(@$pluginlist, $class);
[8511]102
[15872]103 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
104 push(@{$hashArgOptLists->{"OptList"}},$options);
[8511]105
[19746]106 $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
[8511]107
[15872]108 if ($self->{'info_only'}) {
109 # don't worry about creating the XML parser as all we want is the
110 # list of plugin options
111 return bless $self, $class;
112 }
113
[8511]114 #create XML::Parser object for parsing dublin_core.xml files
115 my $parser = new XML::Parser('Style' => 'Stream',
116 'Handlers' => {'Char' => \&Char,
117 'Doctype' => \&Doctype
118 });
119 $self->{'parser'} = $parser;
120 $self->{'extra_blocks'} = {};
121
122 return bless $self, $class;
123}
124
125sub get_default_process_exp {
126 my $self = shift (@_);
127
[19746]128 return q^(?i)(contents)$^;
[8511]129}
130
131# want to block all files except the "contents"
132sub get_default_block_exp {
133 my $self = shift (@_);
134
[19746]135 # Block handle and txt files if present. Specifically don't block dublin_core xml
136 return q^(?i)(handle|\.tx?t)$^;
[8511]137}
138
[19746]139sub store_block_files_BACKUP
140{
141 # Option of making blocking sensitive to files that are in directory
142 # This subroutine is not currently used! (relies on default block expression stopping all handle and .txt files)
143
144 my $self =shift (@_);
145 my ($filename_full_path, $block_hash) = @_;
146
147 my ($tailname, $contents_basedir, $suffix) = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
148 my $handle_filename = &util::filename_cat($contents_basedir,"handle");
149
[23419]150 if (&util::fd_exists($handle_filename)) {
[23561]151 &util::block_filename($block_hash,$handle_filename);
[23419]152 }
[19746]153}
154
[8511]155sub read_content
156{
157 my $self = shift (@_);
158 my ($dir, $only_first_doc, $first_inorder_ext, $first_inorder_mime, $mimetype_list) = @_;
159 my $outhandle = $self->{'outhandle'};
160
161 my @fnamemime_list = ();
162 my @assocmime_list = ();
163
164 my $content_fname = &util::filename_cat($dir,"contents");
165
166 open(CIN,"<$content_fname")
167 || die "Unable to open $content_fname: $!\n";
168
169 my $line;
170 my $pos = 0;
171
172 while (defined ($line = <CIN>)) {
173 if ($line =~ m/^(.*)\s+bundle:ORIGINAL\s*$/) {
174 my $fname = $1;
175 my $mtype = $mimetype_list->[$pos];
176 my $fm_rec = { 'file' => $fname, 'mimetype' => $mtype};
177 push(@fnamemime_list,$fm_rec);
178 $pos++;
179 }
180 }
181 close CIN;
182
183 if ($only_first_doc){
184 my ($first_fname, @rest_fnames) = @fnamemime_list;
185 @fnamemime_list = ($first_fname);
186 @assocmime_list = @rest_fnames;
187 }
188
189 # allow user to specify the types of files (inorder)they would like to assign as
190 # a primary bitstream
191 if ($first_inorder_ext) {
192 # parse user-define file extension names
193 my @extfiles_list = split /,/, $first_inorder_ext;
194 my (@rest_fnames) = @fnamemime_list;
195 my @matched_list = ();
196 foreach my $file_ext (@extfiles_list) {
197 $pos = 0;
198 foreach my $allfiles (@fnamemime_list){
199 $allfiles->{'file'} =~ /^(.*)\.(.*?)$/;
200 my $allfiles_ext = $2;
201
202 if ($allfiles_ext =~ /$file_ext/) {
203 print $outhandle "Existing file:$allfiles->{'file'} match the user-define File Extension:$file_ext\n";
204 push (@matched_list, $allfiles);
205
206 # delete the matched extension file from the array
207 splice(@rest_fnames,$pos,1);
208
209 return (\@matched_list, \@rest_fnames);
210
211 }
212 $pos++;
213 }
214 }
215 }
216
217 if ($first_inorder_mime) {
218 # parse user-define file mimetype
219 my @file_mime_list = split /,/, $first_inorder_mime;
220 my (@rest_fnames) = @fnamemime_list;
221 my @matched_list = ();
222 foreach my $file_mime (@file_mime_list) {
223 $pos = 0;
224 foreach my $allfiles (@fnamemime_list){
225 my $allfiles_mime = $allfiles->{'mimetype'};
226
227 if ($allfiles_mime =~ /$file_mime/) {
228 print $outhandle "Existing file:$allfiles->{'file'} match the user-defined File MimeType:$file_mime\n";
229 push (@matched_list, $allfiles);
230
231 # delete the matched MIMEType file from the array
232 splice(@rest_fnames,$pos,1);
233 return (\@matched_list, \@rest_fnames);
234 }
235 $pos++;
236 }
237 }
238 }
239 return (\@fnamemime_list, \@assocmime_list);
240}
241
242
243sub filemime_list_to_re
244{
245 my $self = shift (@_);
246 my ($fnamemime_list) = @_;
247
248 my @fname_list = map { "(".$_->{'file'}.")" } @$fnamemime_list;
249 my $fname_re = join("|",@fname_list);
250 $fname_re =~ s/\./\\\./g;
251
252 return $fname_re;
253}
254
255# Read dublin_core metadata from DSpace collection
256sub metadata_read {
257 my $self = shift (@_);
[19493]258 my ($pluginfo, $base_dir, $file, $block_hash,
259 $extrametakeys, $extrametadata, $extrametafile,
[23212]260 $processor, $gli, $aux) = @_;
[8511]261
262 my $only_first_doc = $self->{'only_first_doc'};
263 my $first_inorder_ext = $self->{'first_inorder_ext'};
264 my $first_inorder_mime = $self->{'first_inorder_mime'};
265
266 my $outhandle = $self->{'outhandle'};
267
268 my $filename = &util::filename_cat($base_dir, $file);
269 # return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
270
271 if ($filename !~ /dublin_core\.xml$/ || !-f $filename) {
272 return undef;
273 }
274
[15872]275 print $outhandle "DSpacePlugin: extracting metadata from $file\n"
[8511]276 if $self->{'verbosity'} > 1;
277
278 my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;
279
280 eval {
281 $self->{'parser'}->parsefile($filename);
282 };
283
284 if ($@) {
[15872]285 die "DSpacePlugin: ERROR $filename is not a well formed dublin_core.xml file ($@)\n";
[8511]286 }
287
[24306]288 my $mimetype_list = $self->{'saved_metadata'}->{'ex.dc.Format^mimetype'};
[8511]289 my ($doc_file_mimes, $assoc_file_mimes) = $self->read_content($dir, $only_first_doc, $first_inorder_ext,
290 $first_inorder_mime, $mimetype_list);
[8913]291
[8511]292 my $file_re = $self->filemime_list_to_re($doc_file_mimes);
293
294 if ($only_first_doc || $first_inorder_ext || $first_inorder_mime) {
295 foreach my $afm ( @$assoc_file_mimes ) {
296 my $full_af = &util::filename_cat($dir,$afm->{'file'});
297 $self->{'extra_blocks'}->{$full_af} = 1;
298 }
299 }
300 push(@$extrametakeys,$file_re);
301
[24355]302 # See Format's Extent section in http://dublincore.org/documents/usageguide/qualifiers.shtml
303 # it could specify duration, size or even dimensions of the resource. It may be a useful piece
304 # of metadata to preserve after all.
305 #if (defined $self->{'saved_metadata'}->{'ex.dc.Format^extent'}) {
306 #delete $self->{'saved_metadata'}->{'ex.dc.Format^extent'};
307 #}
[8511]308
309 if (defined $mimetype_list) {
[24306]310 delete $self->{'saved_metadata'}->{'ex.dc.Format^mimetype'};
[8511]311
312 # Temporarily store associate file info in metadata table
[15872]313 # This will be removed in 'extra_metadata' in BasePlugin and used
[8511]314 # to perform the actual file association (once the doc obj has
315 # been formed
316
317 my $main_doc = $doc_file_mimes->[0];
318 my $md_mimetype = $main_doc->{'mimetype'};
319
320 my $pd_lookup = $primary_doc_lookup->{$md_mimetype};
321
322 if (defined $pd_lookup) {
323 my $filter_re = $pd_lookup;
324 @$assoc_file_mimes = grep { $_->{'file'} !~ m/$filter_re/ } @$assoc_file_mimes;
325 }
326
327 my @gsdlassocfile_tobe
[24219]328 = map { &util::filename_cat($dir,$_->{'file'}) .":".$_->{'mimetype'}.":" } @$assoc_file_mimes if @$assoc_file_mimes;
[8511]329 $self->{'saved_metadata'}->{'gsdlassocfile_tobe'} = \@gsdlassocfile_tobe;
330
331 }
[8913]332
[8511]333 $extrametadata->{$file_re} = $self->{'saved_metadata'};
334
335 return 1;
336}
337
338
[15872]339# The DSpacePlugin read() function. We are not actually reading any documents
340# here, just blocking ones that have been processed by metadata read.
[8511]341#
[15872]342# Returns 0 for a file its blocking, undef for any other
[8511]343sub read {
344 my $self = shift (@_);
[16392]345 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[8511]346 my $outhandle = $self->{'outhandle'};
347
348 # Block all files except contents
349 my $filename = &util::filename_cat($base_dir, $file);
350 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
351
352 my $assocfile = $metadata->{'assocfile'};
353
354 return 0 if (($filename =~ /dublin_core\.xml$/) || ($filename =~ /contents$/));
355 return 0 if (defined $self->{'extra_blocks'}->{$filename});
356 return undef;
357}
358
359sub Doctype {
360 my ($expat, $name, $sysid, $pubid, $internal) = @_;
361
362 die if ($name !~ /^dublin_core$/);
363}
364
365sub StartTag {
366 my ($expat, $element) = @_;
367 if ($element eq "dublin_core") {
368 $self->{'saved_metadata'} = {};
[8913]369 } elsif ($element eq "dcvalue") {
[8511]370 my $metaname = $_{'element'};
[10549]371 my $qualifier = $_{'qualifier'}||"";
[21367]372 if ($metaname ne "description" || $qualifier ne "provenance") {
[9694]373 $metaname .= "^$qualifier" if ($qualifier ne "none" && $qualifier ne "");
[24306]374 $self->{'metaname'} = "ex.dc.\u$metaname";
[8511]375 }
376 }
377}
378
379sub EndTag {
380 my ($expat, $element) = @_;
381
382 if ($element eq "dcvalue") {
383 $self->{'metaname'} = "";
384 }
385}
386
387sub Text {
388 if (defined ($self->{'metaname'}) && $self->{'metaname'} ne "") {
389 # $_ == Metadata content
390 my $mname = $self->{'metaname'};
[21367]391 my $mvalue = prepareMetadataValue($_);
[8511]392 if (defined $self->{'saved_metadata'}->{$mname}) {
393 # accumulate - add value to existing value(s)
394 if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") {
[21367]395 push (@{$self->{'saved_metadata'}->{$mname}}, $mvalue);
[8511]396 } else {
397 $self->{'saved_metadata'}->{$mname} =
[21367]398 [$self->{'saved_metadata'}->{$mname}, $mvalue];
[8511]399 }
400 } else {
401 # accumulate - add value into (currently empty) array
[21367]402 $self->{'saved_metadata'}->{$mname} = [$mvalue];
[8511]403 }
404
405 }
406}
407
[21367]408# Prepare DSpace metadata for using with Greenstone.
409# Some value must be escaped.
410sub prepareMetadataValue {
411 my ($value) = @_;
412
413 $value =~ s/\[/&#091;/g;
414 $value =~ s/\]/&#093;/g;
415
416 return $value;
417 }
[8511]418# This Char function overrides the one in XML::Parser::Stream to overcome a
419# problem where $expat->{Text} is treated as the return value, slowing
420# things down significantly in some cases.
421sub Char {
[9462]422 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
423 $_[0]->{'Text'} .= $_[1];
424 return undef;
[8511]425}
426
4271;
Note: See TracBrowser for help on using the repository browser.