source: main/trunk/greenstone2/perllib/plugins/DSpacePlugin.pm@ 24306

Last change on this file since 24306 was 24306, checked in by ak19, 13 years ago

More changes to do with the ex. prefixed to embedded metadata (that may have an additional metadata set as namespace qualifier). The C code now removes the ex. prefix only if there are no other metadataset qualifiers in the metadata name.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.5 KB
RevLine 
[10549]1
[8511]2###########################################################################
3#
[15872]4# DSpacePlugin.pm -- plugin for importing a collection from DSpace
[8511]5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
[15872]10# Copyright (C) 2004 New Zealand Digital Library Project
[8511]11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
[15872]28
[8511]29# This plugin takes "contents" and dublin_core.xml file, which contain
30# Metadata and lists of associated files for a particular document
31# and produces a document containing sections, one for each page.
32# The files should be named "contents" and "dublin_core.xml". For each of
33# document in DSpace, it is stored in one directory
34#
35# The format of the "contents" file is as follows:
36#
37# File.type bundle:ORIGINAL
38# license.txt bundle:LICENSE
39# The format of the "dublin_core.xml" file is as follows:
40# The first line contains any metadata for the whole document
41# <dublin_core>
42# eg.
43# <dcvalue element="Title" qualifier="">Snail farming</dcvalue>
44# <dcvalue element="date" qualifier="">2004-10-15</dcvalue>
45#
46
[15872]47package DSpacePlugin;
[8511]48
[19746]49use ReadTextFile;
[8511]50use plugin;
51use XMLParser;
[10254]52use strict;
53no strict 'refs'; # allow filehandles to be variables and viceversa
[8511]54
55sub BEGIN {
[19746]56 @DSpacePlugin::ISA = ('ReadTextFile');
[8511]57}
58
59my $arguments =
60 [ { 'name' => "process_exp",
[15872]61 'desc' => "{BasePlugin.process_exp}",
[8511]62 'type' => "string",
63 'deft' => &get_default_process_exp(),
64 'reqd' => "no" },
65 { 'name' => "only_first_doc",
[15872]66 'desc' => "{DSpacePlugin.only_first_doc}",
[8511]67 'type' => "flag",
68 'reqd' => "no" },
[9168]69 { 'name' => "first_inorder_ext",
[15872]70 'desc' => "{DSpacePlugin.first_inorder_ext}",
[8891]71 'type' => "string",
[8511]72 'reqd' => "no" },
73 { 'name' => "first_inorder_mime",
[15872]74 'desc' => "{DSpacePlugin.first_inorder_mime}",
[8511]75 'type' => "flag",
76 'reqd' => "no" },
77 { 'name' => "block_exp",
[15872]78 'desc' => "{BasePlugin.block_exp}",
[8891]79 'type' => "regexp",
[8511]80 'deft' => &get_default_block_exp(),
81 'reqd' => "no" }];
82
83
[15872]84my $options = { 'name' => "DSpacePlugin",
85 'desc' => "{DSpacePlugin.desc}",
[8511]86 'inherits' => "yes",
[11676]87 'abstract' => "no",
[8511]88 'args' => $arguments };
89
90
91my $primary_doc_lookup = { 'text/html' => '(?i)\.(gif|jpe?g|jpe|jpg|png|css)$' };
92
[8913]93# Important variation to regular plugin structure. Need to declare
94# $self as global variable to file so XMLParser callback routines
95# can access the content of the object.
96my ($self);
97
[8511]98sub new {
[10218]99 my ($class) = shift (@_);
100 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
101 push(@$pluginlist, $class);
[8511]102
[15872]103 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
104 push(@{$hashArgOptLists->{"OptList"}},$options);
[8511]105
[19746]106 $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
[8511]107
[15872]108 if ($self->{'info_only'}) {
109 # don't worry about creating the XML parser as all we want is the
110 # list of plugin options
111 return bless $self, $class;
112 }
113
[8511]114 #create XML::Parser object for parsing dublin_core.xml files
115 my $parser = new XML::Parser('Style' => 'Stream',
116 'Handlers' => {'Char' => \&Char,
117 'Doctype' => \&Doctype
118 });
119 $self->{'parser'} = $parser;
120 $self->{'extra_blocks'} = {};
121
122 return bless $self, $class;
123}
124
125sub get_default_process_exp {
126 my $self = shift (@_);
127
[19746]128 return q^(?i)(contents)$^;
[8511]129}
130
131# want to block all files except the "contents"
132sub get_default_block_exp {
133 my $self = shift (@_);
134
[19746]135 # Block handle and txt files if present. Specifically don't block dublin_core xml
136 return q^(?i)(handle|\.tx?t)$^;
[8511]137}
138
[19746]139sub store_block_files_BACKUP
140{
141 # Option of making blocking sensitive to files that are in directory
142 # This subroutine is not currently used! (relies on default block expression stopping all handle and .txt files)
143
144 my $self =shift (@_);
145 my ($filename_full_path, $block_hash) = @_;
146
147 my ($tailname, $contents_basedir, $suffix) = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
148 my $handle_filename = &util::filename_cat($contents_basedir,"handle");
149
[23419]150 if (&util::fd_exists($handle_filename)) {
[23561]151 &util::block_filename($block_hash,$handle_filename);
[23419]152 }
[19746]153}
154
[8511]155sub read_content
156{
157 my $self = shift (@_);
158 my ($dir, $only_first_doc, $first_inorder_ext, $first_inorder_mime, $mimetype_list) = @_;
159 my $outhandle = $self->{'outhandle'};
160
161 my @fnamemime_list = ();
162 my @assocmime_list = ();
163
164 my $content_fname = &util::filename_cat($dir,"contents");
165
166 open(CIN,"<$content_fname")
167 || die "Unable to open $content_fname: $!\n";
168
169 my $line;
170 my $pos = 0;
171
172 while (defined ($line = <CIN>)) {
173 if ($line =~ m/^(.*)\s+bundle:ORIGINAL\s*$/) {
174 my $fname = $1;
175 my $mtype = $mimetype_list->[$pos];
176 my $fm_rec = { 'file' => $fname, 'mimetype' => $mtype};
177 push(@fnamemime_list,$fm_rec);
178 $pos++;
179 }
180 }
181 close CIN;
182
183 if ($only_first_doc){
184 my ($first_fname, @rest_fnames) = @fnamemime_list;
185 @fnamemime_list = ($first_fname);
186 @assocmime_list = @rest_fnames;
187 }
188
189 # allow user to specify the types of files (inorder)they would like to assign as
190 # a primary bitstream
191 if ($first_inorder_ext) {
192 # parse user-define file extension names
193 my @extfiles_list = split /,/, $first_inorder_ext;
194 my (@rest_fnames) = @fnamemime_list;
195 my @matched_list = ();
196 foreach my $file_ext (@extfiles_list) {
197 $pos = 0;
198 foreach my $allfiles (@fnamemime_list){
199 $allfiles->{'file'} =~ /^(.*)\.(.*?)$/;
200 my $allfiles_ext = $2;
201
202 if ($allfiles_ext =~ /$file_ext/) {
203 print $outhandle "Existing file:$allfiles->{'file'} match the user-define File Extension:$file_ext\n";
204 push (@matched_list, $allfiles);
205
206 # delete the matched extension file from the array
207 splice(@rest_fnames,$pos,1);
208
209 return (\@matched_list, \@rest_fnames);
210
211 }
212 $pos++;
213 }
214 }
215 }
216
217 if ($first_inorder_mime) {
218 # parse user-define file mimetype
219 my @file_mime_list = split /,/, $first_inorder_mime;
220 my (@rest_fnames) = @fnamemime_list;
221 my @matched_list = ();
222 foreach my $file_mime (@file_mime_list) {
223 $pos = 0;
224 foreach my $allfiles (@fnamemime_list){
225 my $allfiles_mime = $allfiles->{'mimetype'};
226
227 if ($allfiles_mime =~ /$file_mime/) {
228 print $outhandle "Existing file:$allfiles->{'file'} match the user-defined File MimeType:$file_mime\n";
229 push (@matched_list, $allfiles);
230
231 # delete the matched MIMEType file from the array
232 splice(@rest_fnames,$pos,1);
233 return (\@matched_list, \@rest_fnames);
234 }
235 $pos++;
236 }
237 }
238 }
239 return (\@fnamemime_list, \@assocmime_list);
240}
241
242
243sub filemime_list_to_re
244{
245 my $self = shift (@_);
246 my ($fnamemime_list) = @_;
247
248 my @fname_list = map { "(".$_->{'file'}.")" } @$fnamemime_list;
249 my $fname_re = join("|",@fname_list);
250 $fname_re =~ s/\./\\\./g;
251
252 return $fname_re;
253}
254
255# Read dublin_core metadata from DSpace collection
256sub metadata_read {
257 my $self = shift (@_);
[19493]258 my ($pluginfo, $base_dir, $file, $block_hash,
259 $extrametakeys, $extrametadata, $extrametafile,
[23212]260 $processor, $gli, $aux) = @_;
[8511]261
262 my $only_first_doc = $self->{'only_first_doc'};
263 my $first_inorder_ext = $self->{'first_inorder_ext'};
264 my $first_inorder_mime = $self->{'first_inorder_mime'};
265
266 my $outhandle = $self->{'outhandle'};
267
268 my $filename = &util::filename_cat($base_dir, $file);
269 # return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
270
271 if ($filename !~ /dublin_core\.xml$/ || !-f $filename) {
272 return undef;
273 }
274
[15872]275 print $outhandle "DSpacePlugin: extracting metadata from $file\n"
[8511]276 if $self->{'verbosity'} > 1;
277
278 my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;
279
280 eval {
281 $self->{'parser'}->parsefile($filename);
282 };
283
284 if ($@) {
[15872]285 die "DSpacePlugin: ERROR $filename is not a well formed dublin_core.xml file ($@)\n";
[8511]286 }
287
[24306]288 my $mimetype_list = $self->{'saved_metadata'}->{'ex.dc.Format^mimetype'};
[8511]289 my ($doc_file_mimes, $assoc_file_mimes) = $self->read_content($dir, $only_first_doc, $first_inorder_ext,
290 $first_inorder_mime, $mimetype_list);
[8913]291
[8511]292 my $file_re = $self->filemime_list_to_re($doc_file_mimes);
293
294 if ($only_first_doc || $first_inorder_ext || $first_inorder_mime) {
295 foreach my $afm ( @$assoc_file_mimes ) {
296 my $full_af = &util::filename_cat($dir,$afm->{'file'});
297 $self->{'extra_blocks'}->{$full_af} = 1;
298 }
299 }
300 push(@$extrametakeys,$file_re);
301
[24306]302 if (defined $self->{'saved_metadata'}->{'ex.dc.Format^extent'}) {
303 delete $self->{'saved_metadata'}->{'ex.dc.Format^extent'};
[8511]304 }
305
306 if (defined $mimetype_list) {
[24306]307 delete $self->{'saved_metadata'}->{'ex.dc.Format^mimetype'};
[8511]308
309 # Temporarily store associate file info in metadata table
[15872]310 # This will be removed in 'extra_metadata' in BasePlugin and used
[8511]311 # to perform the actual file association (once the doc obj has
312 # been formed
313
314 my $main_doc = $doc_file_mimes->[0];
315 my $md_mimetype = $main_doc->{'mimetype'};
316
317 my $pd_lookup = $primary_doc_lookup->{$md_mimetype};
318
319 if (defined $pd_lookup) {
320 my $filter_re = $pd_lookup;
321 @$assoc_file_mimes = grep { $_->{'file'} !~ m/$filter_re/ } @$assoc_file_mimes;
322 }
323
324 my @gsdlassocfile_tobe
[24219]325 = map { &util::filename_cat($dir,$_->{'file'}) .":".$_->{'mimetype'}.":" } @$assoc_file_mimes if @$assoc_file_mimes;
[8511]326 $self->{'saved_metadata'}->{'gsdlassocfile_tobe'} = \@gsdlassocfile_tobe;
327
328 }
[8913]329
[8511]330 $extrametadata->{$file_re} = $self->{'saved_metadata'};
331
332 return 1;
333}
334
335
[15872]336# The DSpacePlugin read() function. We are not actually reading any documents
337# here, just blocking ones that have been processed by metadata read.
[8511]338#
[15872]339# Returns 0 for a file its blocking, undef for any other
[8511]340sub read {
341 my $self = shift (@_);
[16392]342 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[8511]343 my $outhandle = $self->{'outhandle'};
344
345 # Block all files except contents
346 my $filename = &util::filename_cat($base_dir, $file);
347 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
348
349 my $assocfile = $metadata->{'assocfile'};
350
351 return 0 if (($filename =~ /dublin_core\.xml$/) || ($filename =~ /contents$/));
352 return 0 if (defined $self->{'extra_blocks'}->{$filename});
353 return undef;
354}
355
356sub Doctype {
357 my ($expat, $name, $sysid, $pubid, $internal) = @_;
358
359 die if ($name !~ /^dublin_core$/);
360}
361
362sub StartTag {
363 my ($expat, $element) = @_;
364 if ($element eq "dublin_core") {
365 $self->{'saved_metadata'} = {};
[8913]366 } elsif ($element eq "dcvalue") {
[8511]367 my $metaname = $_{'element'};
[10549]368 my $qualifier = $_{'qualifier'}||"";
[21367]369 if ($metaname ne "description" || $qualifier ne "provenance") {
[9694]370 $metaname .= "^$qualifier" if ($qualifier ne "none" && $qualifier ne "");
[24306]371 $self->{'metaname'} = "ex.dc.\u$metaname";
[8511]372 }
373 }
374}
375
376sub EndTag {
377 my ($expat, $element) = @_;
378
379 if ($element eq "dcvalue") {
380 $self->{'metaname'} = "";
381 }
382}
383
384sub Text {
385 if (defined ($self->{'metaname'}) && $self->{'metaname'} ne "") {
386 # $_ == Metadata content
387 my $mname = $self->{'metaname'};
[21367]388 my $mvalue = prepareMetadataValue($_);
[8511]389 if (defined $self->{'saved_metadata'}->{$mname}) {
390 # accumulate - add value to existing value(s)
391 if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") {
[21367]392 push (@{$self->{'saved_metadata'}->{$mname}}, $mvalue);
[8511]393 } else {
394 $self->{'saved_metadata'}->{$mname} =
[21367]395 [$self->{'saved_metadata'}->{$mname}, $mvalue];
[8511]396 }
397 } else {
398 # accumulate - add value into (currently empty) array
[21367]399 $self->{'saved_metadata'}->{$mname} = [$mvalue];
[8511]400 }
401
402 }
403}
404
[21367]405# Prepare DSpace metadata for using with Greenstone.
406# Some value must be escaped.
407sub prepareMetadataValue {
408 my ($value) = @_;
409
410 $value =~ s/\[/&#091;/g;
411 $value =~ s/\]/&#093;/g;
412
413 return $value;
414 }
[8511]415# This Char function overrides the one in XML::Parser::Stream to overcome a
416# problem where $expat->{Text} is treated as the return value, slowing
417# things down significantly in some cases.
418sub Char {
[9462]419 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
420 $_[0]->{'Text'} .= $_[1];
421 return undef;
[8511]422}
423
4241;
Note: See TracBrowser for help on using the repository browser.