source: gsdl/trunk/perllib/plugins/DSpacePlugin.pm@ 19746

Last change on this file since 19746 was 19746, checked in by ak19, 15 years ago

Dr Bainbridge fixed the DSpace problem where the crucial dublin_core.xml file was being blocked when it shouldn't have been which prevented the necessary files from being processed by the DSpacePlugin at all. Now the DSpace to Greenstone tutorial works again.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.2 KB
Line 
1
2###########################################################################
3#
4# DSpacePlugin.pm -- plugin for importing a collection from DSpace
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2004 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This plugin takes "contents" and dublin_core.xml file, which contain
30# Metadata and lists of associated files for a particular document
31# and produces a document containing sections, one for each page.
32# The files should be named "contents" and "dublin_core.xml". For each of
33# document in DSpace, it is stored in one directory
34#
35# The format of the "contents" file is as follows:
36#
37# File.type bundle:ORIGINAL
38# license.txt bundle:LICENSE
39# The format of the "dublin_core.xml" file is as follows:
40# The first line contains any metadata for the whole document
41# <dublin_core>
42# eg.
43# <dcvalue element="Title" qualifier="">Snail farming</dcvalue>
44# <dcvalue element="date" qualifier="">2004-10-15</dcvalue>
45#
46
47package DSpacePlugin;
48
49use ReadTextFile;
50use plugin;
51use XMLParser;
52use strict;
53no strict 'refs'; # allow filehandles to be variables and viceversa
54
55sub BEGIN {
56 @DSpacePlugin::ISA = ('ReadTextFile');
57}
58
59my $arguments =
60 [ { 'name' => "process_exp",
61 'desc' => "{BasePlugin.process_exp}",
62 'type' => "string",
63 'deft' => &get_default_process_exp(),
64 'reqd' => "no" },
65 { 'name' => "only_first_doc",
66 'desc' => "{DSpacePlugin.only_first_doc}",
67 'type' => "flag",
68 'reqd' => "no" },
69 { 'name' => "first_inorder_ext",
70 'desc' => "{DSpacePlugin.first_inorder_ext}",
71 'type' => "string",
72 'reqd' => "no" },
73 { 'name' => "first_inorder_mime",
74 'desc' => "{DSpacePlugin.first_inorder_mime}",
75 'type' => "flag",
76 'reqd' => "no" },
77 { 'name' => "block_exp",
78 'desc' => "{BasePlugin.block_exp}",
79 'type' => "regexp",
80 'deft' => &get_default_block_exp(),
81 'reqd' => "no" }];
82
83
84my $options = { 'name' => "DSpacePlugin",
85 'desc' => "{DSpacePlugin.desc}",
86 'inherits' => "yes",
87 'abstract' => "no",
88 'args' => $arguments };
89
90
91my $primary_doc_lookup = { 'text/html' => '(?i)\.(gif|jpe?g|jpe|jpg|png|css)$' };
92
93# Important variation to regular plugin structure. Need to declare
94# $self as global variable to file so XMLParser callback routines
95# can access the content of the object.
96my ($self);
97
98sub new {
99 my ($class) = shift (@_);
100 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
101 push(@$pluginlist, $class);
102
103 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
104 push(@{$hashArgOptLists->{"OptList"}},$options);
105
106 $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
107
108 if ($self->{'info_only'}) {
109 # don't worry about creating the XML parser as all we want is the
110 # list of plugin options
111 return bless $self, $class;
112 }
113
114 #create XML::Parser object for parsing dublin_core.xml files
115 my $parser = new XML::Parser('Style' => 'Stream',
116 'Handlers' => {'Char' => \&Char,
117 'Doctype' => \&Doctype
118 });
119 $self->{'parser'} = $parser;
120 $self->{'extra_blocks'} = {};
121
122 return bless $self, $class;
123}
124
125sub get_default_process_exp {
126 my $self = shift (@_);
127
128 return q^(?i)(contents)$^;
129}
130
131# want to block all files except the "contents"
132sub get_default_block_exp {
133 my $self = shift (@_);
134
135 # Block handle and txt files if present. Specifically don't block dublin_core xml
136 return q^(?i)(handle|\.tx?t)$^;
137}
138
139sub store_block_files_BACKUP
140{
141 # Option of making blocking sensitive to files that are in directory
142 # This subroutine is not currently used! (relies on default block expression stopping all handle and .txt files)
143
144 my $self =shift (@_);
145 my ($filename_full_path, $block_hash) = @_;
146
147 my ($tailname, $contents_basedir, $suffix) = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
148 my $handle_filename = &util::filename_cat($contents_basedir,"handle");
149
150 $block_hash->{'file_blocks'}->{$handle_filename} = 1 if (-e $handle_filename);
151}
152
153sub read_content
154{
155 my $self = shift (@_);
156 my ($dir, $only_first_doc, $first_inorder_ext, $first_inorder_mime, $mimetype_list) = @_;
157 my $outhandle = $self->{'outhandle'};
158
159 my @fnamemime_list = ();
160 my @assocmime_list = ();
161
162 my $content_fname = &util::filename_cat($dir,"contents");
163
164 open(CIN,"<$content_fname")
165 || die "Unable to open $content_fname: $!\n";
166
167 my $line;
168 my $pos = 0;
169
170 while (defined ($line = <CIN>)) {
171 if ($line =~ m/^(.*)\s+bundle:ORIGINAL\s*$/) {
172 my $fname = $1;
173 my $mtype = $mimetype_list->[$pos];
174 my $fm_rec = { 'file' => $fname, 'mimetype' => $mtype};
175 push(@fnamemime_list,$fm_rec);
176 $pos++;
177 }
178 }
179 close CIN;
180
181 if ($only_first_doc){
182 my ($first_fname, @rest_fnames) = @fnamemime_list;
183 @fnamemime_list = ($first_fname);
184 @assocmime_list = @rest_fnames;
185 }
186
187 # allow user to specify the types of files (inorder)they would like to assign as
188 # a primary bitstream
189 if ($first_inorder_ext) {
190 # parse user-define file extension names
191 my @extfiles_list = split /,/, $first_inorder_ext;
192 my (@rest_fnames) = @fnamemime_list;
193 my @matched_list = ();
194 foreach my $file_ext (@extfiles_list) {
195 $pos = 0;
196 foreach my $allfiles (@fnamemime_list){
197 $allfiles->{'file'} =~ /^(.*)\.(.*?)$/;
198 my $allfiles_ext = $2;
199
200 if ($allfiles_ext =~ /$file_ext/) {
201 print $outhandle "Existing file:$allfiles->{'file'} match the user-define File Extension:$file_ext\n";
202 push (@matched_list, $allfiles);
203
204 # delete the matched extension file from the array
205 splice(@rest_fnames,$pos,1);
206
207 return (\@matched_list, \@rest_fnames);
208
209 }
210 $pos++;
211 }
212 }
213 }
214
215 if ($first_inorder_mime) {
216 # parse user-define file mimetype
217 my @file_mime_list = split /,/, $first_inorder_mime;
218 my (@rest_fnames) = @fnamemime_list;
219 my @matched_list = ();
220 foreach my $file_mime (@file_mime_list) {
221 $pos = 0;
222 foreach my $allfiles (@fnamemime_list){
223 my $allfiles_mime = $allfiles->{'mimetype'};
224
225 if ($allfiles_mime =~ /$file_mime/) {
226 print $outhandle "Existing file:$allfiles->{'file'} match the user-defined File MimeType:$file_mime\n";
227 push (@matched_list, $allfiles);
228
229 # delete the matched MIMEType file from the array
230 splice(@rest_fnames,$pos,1);
231 return (\@matched_list, \@rest_fnames);
232 }
233 $pos++;
234 }
235 }
236 }
237 return (\@fnamemime_list, \@assocmime_list);
238}
239
240
241sub filemime_list_to_re
242{
243 my $self = shift (@_);
244 my ($fnamemime_list) = @_;
245
246 my @fname_list = map { "(".$_->{'file'}.")" } @$fnamemime_list;
247 my $fname_re = join("|",@fname_list);
248 $fname_re =~ s/\./\\\./g;
249
250 return $fname_re;
251}
252
253# Read dublin_core metadata from DSpace collection
254sub metadata_read {
255 my $self = shift (@_);
256 my ($pluginfo, $base_dir, $file, $block_hash,
257 $extrametakeys, $extrametadata, $extrametafile,
258 $processor, $maxdocs, $gli) = @_;
259
260 my $only_first_doc = $self->{'only_first_doc'};
261 my $first_inorder_ext = $self->{'first_inorder_ext'};
262 my $first_inorder_mime = $self->{'first_inorder_mime'};
263
264 my $outhandle = $self->{'outhandle'};
265
266 my $filename = &util::filename_cat($base_dir, $file);
267 # return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
268
269 if ($filename !~ /dublin_core\.xml$/ || !-f $filename) {
270 return undef;
271 }
272
273 print $outhandle "DSpacePlugin: extracting metadata from $file\n"
274 if $self->{'verbosity'} > 1;
275
276 my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;
277
278 eval {
279 $self->{'parser'}->parsefile($filename);
280 };
281
282 if ($@) {
283 die "DSpacePlugin: ERROR $filename is not a well formed dublin_core.xml file ($@)\n";
284 }
285
286 my $mimetype_list = $self->{'saved_metadata'}->{'dc.Format^mimetype'};
287 my ($doc_file_mimes, $assoc_file_mimes) = $self->read_content($dir, $only_first_doc, $first_inorder_ext,
288 $first_inorder_mime, $mimetype_list);
289
290 my $file_re = $self->filemime_list_to_re($doc_file_mimes);
291
292 if ($only_first_doc || $first_inorder_ext || $first_inorder_mime) {
293 foreach my $afm ( @$assoc_file_mimes ) {
294 my $full_af = &util::filename_cat($dir,$afm->{'file'});
295 $self->{'extra_blocks'}->{$full_af} = 1;
296 }
297 }
298 push(@$extrametakeys,$file_re);
299
300 if (defined $self->{'saved_metadata'}->{'dc.Format^extent'}) {
301 delete $self->{'saved_metadata'}->{'dc.Format^extent'};
302 }
303
304 if (defined $mimetype_list) {
305 delete $self->{'saved_metadata'}->{'dc.Format^mimetype'};
306
307 # Temporarily store associate file info in metadata table
308 # This will be removed in 'extra_metadata' in BasePlugin and used
309 # to perform the actual file association (once the doc obj has
310 # been formed
311
312 my $main_doc = $doc_file_mimes->[0];
313 my $md_mimetype = $main_doc->{'mimetype'};
314
315 my $pd_lookup = $primary_doc_lookup->{$md_mimetype};
316
317 if (defined $pd_lookup) {
318 my $filter_re = $pd_lookup;
319 @$assoc_file_mimes = grep { $_->{'file'} !~ m/$filter_re/ } @$assoc_file_mimes;
320 }
321
322 my @gsdlassocfile_tobe
323 = map { &util::filename_cat($dir,$_->{'file'}).":".$_->{'mimetype'}.":" } @$assoc_file_mimes;
324 $self->{'saved_metadata'}->{'gsdlassocfile_tobe'} = \@gsdlassocfile_tobe;
325
326 }
327
328 $extrametadata->{$file_re} = $self->{'saved_metadata'};
329
330 return 1;
331}
332
333
334# The DSpacePlugin read() function. We are not actually reading any documents
335# here, just blocking ones that have been processed by metadata read.
336#
337# Returns 0 for a file its blocking, undef for any other
338sub read {
339 my $self = shift (@_);
340 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
341 my $outhandle = $self->{'outhandle'};
342
343 # Block all files except contents
344 my $filename = &util::filename_cat($base_dir, $file);
345 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
346
347 my $assocfile = $metadata->{'assocfile'};
348
349 return 0 if (($filename =~ /dublin_core\.xml$/) || ($filename =~ /contents$/));
350 return 0 if (defined $self->{'extra_blocks'}->{$filename});
351 return undef;
352}
353
354sub Doctype {
355 my ($expat, $name, $sysid, $pubid, $internal) = @_;
356
357 die if ($name !~ /^dublin_core$/);
358}
359
360sub StartTag {
361 my ($expat, $element) = @_;
362 if ($element eq "dublin_core") {
363 $self->{'saved_metadata'} = {};
364 } elsif ($element eq "dcvalue") {
365 my $metaname = $_{'element'};
366 my $qualifier = $_{'qualifier'}||"";
367 if ($metaname ne "description") {
368 $metaname .= "^$qualifier" if ($qualifier ne "none" && $qualifier ne "");
369 $self->{'metaname'} = "dc.\u$metaname";
370 }
371 }
372}
373
374sub EndTag {
375 my ($expat, $element) = @_;
376
377 if ($element eq "dcvalue") {
378 $self->{'metaname'} = "";
379 }
380}
381
382sub Text {
383 if (defined ($self->{'metaname'}) && $self->{'metaname'} ne "") {
384 # $_ == Metadata content
385 my $mname = $self->{'metaname'};
386 if (defined $self->{'saved_metadata'}->{$mname}) {
387 # accumulate - add value to existing value(s)
388 if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") {
389 push (@{$self->{'saved_metadata'}->{$mname}}, $_);
390 } else {
391 $self->{'saved_metadata'}->{$mname} =
392 [$self->{'saved_metadata'}->{$mname}, $_];
393 }
394 } else {
395 # accumulate - add value into (currently empty) array
396 $self->{'saved_metadata'}->{$mname} = [$_];
397 }
398
399 }
400}
401
402# This Char function overrides the one in XML::Parser::Stream to overcome a
403# problem where $expat->{Text} is treated as the return value, slowing
404# things down significantly in some cases.
405sub Char {
406 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
407 $_[0]->{'Text'} .= $_[1];
408 return undef;
409}
410
4111;
Note: See TracBrowser for help on using the repository browser.