source: main/trunk/greenstone2/perllib/plugins/DSpacePlugin.pm@ 24951

Last change on this file since 24951 was 24951, checked in by ak19, 12 years ago

All perlcode that accesses extrametakeys, extrametadata, extrametafile data structures has been moved into a new perl module called extrametautil.pm. The next step will be to ensure that the file_regexes used to index into these data structures are consistent (using consistent slashes, like URL style slashes).

  • Property svn:keywords set to Author Date Id Revision
File size: 12.8 KB
Line 
1
2###########################################################################
3#
4# DSpacePlugin.pm -- plugin for importing a collection from DSpace
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2004 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This plugin takes "contents" and dublin_core.xml file, which contain
30# Metadata and lists of associated files for a particular document
31# and produces a document containing sections, one for each page.
32# The files should be named "contents" and "dublin_core.xml". For each of
33# document in DSpace, it is stored in one directory
34#
35# The format of the "contents" file is as follows:
36#
37# File.type bundle:ORIGINAL
38# license.txt bundle:LICENSE
39# The format of the "dublin_core.xml" file is as follows:
40# The first line contains any metadata for the whole document
41# <dublin_core>
42# eg.
43# <dcvalue element="Title" qualifier="">Snail farming</dcvalue>
44# <dcvalue element="date" qualifier="">2004-10-15</dcvalue>
45#
46
47package DSpacePlugin;
48
49use extrametautil;
50use ReadTextFile;
51use plugin;
52use XMLParser;
53use strict;
54no strict 'refs'; # allow filehandles to be variables and viceversa
55
56sub BEGIN {
57 @DSpacePlugin::ISA = ('ReadTextFile');
58}
59
60my $arguments =
61 [ { 'name' => "process_exp",
62 'desc' => "{BasePlugin.process_exp}",
63 'type' => "string",
64 'deft' => &get_default_process_exp(),
65 'reqd' => "no" },
66 { 'name' => "only_first_doc",
67 'desc' => "{DSpacePlugin.only_first_doc}",
68 'type' => "flag",
69 'reqd' => "no" },
70 { 'name' => "first_inorder_ext",
71 'desc' => "{DSpacePlugin.first_inorder_ext}",
72 'type' => "string",
73 'reqd' => "no" },
74 { 'name' => "first_inorder_mime",
75 'desc' => "{DSpacePlugin.first_inorder_mime}",
76 'type' => "flag",
77 'reqd' => "no" },
78 { 'name' => "block_exp",
79 'desc' => "{BasePlugin.block_exp}",
80 'type' => "regexp",
81 'deft' => &get_default_block_exp(),
82 'reqd' => "no" }];
83
84
85my $options = { 'name' => "DSpacePlugin",
86 'desc' => "{DSpacePlugin.desc}",
87 'inherits' => "yes",
88 'abstract' => "no",
89 'args' => $arguments };
90
91
92my $primary_doc_lookup = { 'text/html' => '(?i)\.(gif|jpe?g|jpe|jpg|png|css)$' };
93
94# Important variation to regular plugin structure. Need to declare
95# $self as global variable to file so XMLParser callback routines
96# can access the content of the object.
97my ($self);
98
99sub new {
100 my ($class) = shift (@_);
101 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
102 push(@$pluginlist, $class);
103
104 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
105 push(@{$hashArgOptLists->{"OptList"}},$options);
106
107 $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
108
109 if ($self->{'info_only'}) {
110 # don't worry about creating the XML parser as all we want is the
111 # list of plugin options
112 return bless $self, $class;
113 }
114
115 #create XML::Parser object for parsing dublin_core.xml files
116 my $parser = new XML::Parser('Style' => 'Stream',
117 'Handlers' => {'Char' => \&Char,
118 'Doctype' => \&Doctype
119 });
120 $self->{'parser'} = $parser;
121 $self->{'extra_blocks'} = {};
122
123 return bless $self, $class;
124}
125
126sub get_default_process_exp {
127 my $self = shift (@_);
128
129 return q^(?i)(contents)$^;
130}
131
132# want to block all files except the "contents"
133sub get_default_block_exp {
134 my $self = shift (@_);
135
136 # Block handle and txt files if present. Specifically don't block dublin_core xml
137 return q^(?i)(handle|\.tx?t)$^;
138}
139
140sub store_block_files_BACKUP
141{
142 # Option of making blocking sensitive to files that are in directory
143 # This subroutine is not currently used! (relies on default block expression stopping all handle and .txt files)
144
145 my $self =shift (@_);
146 my ($filename_full_path, $block_hash) = @_;
147
148 my ($tailname, $contents_basedir, $suffix) = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
149 my $handle_filename = &util::filename_cat($contents_basedir,"handle");
150
151 if (&util::fd_exists($handle_filename)) {
152 &util::block_filename($block_hash,$handle_filename);
153 }
154}
155
156sub read_content
157{
158 my $self = shift (@_);
159 my ($dir, $only_first_doc, $first_inorder_ext, $first_inorder_mime, $mimetype_list) = @_;
160 my $outhandle = $self->{'outhandle'};
161
162 my @fnamemime_list = ();
163 my @assocmime_list = ();
164
165 my $content_fname = &util::filename_cat($dir,"contents");
166
167 open(CIN,"<$content_fname")
168 || die "Unable to open $content_fname: $!\n";
169
170 my $line;
171 my $pos = 0;
172
173 while (defined ($line = <CIN>)) {
174 if ($line =~ m/^(.*)\s+bundle:ORIGINAL\s*$/) {
175 my $fname = $1;
176 my $mtype = $mimetype_list->[$pos];
177 my $fm_rec = { 'file' => $fname, 'mimetype' => $mtype};
178 push(@fnamemime_list,$fm_rec);
179 $pos++;
180 }
181 }
182 close CIN;
183
184 if ($only_first_doc){
185 my ($first_fname, @rest_fnames) = @fnamemime_list;
186 @fnamemime_list = ($first_fname);
187 @assocmime_list = @rest_fnames;
188 }
189
190 # allow user to specify the types of files (inorder)they would like to assign as
191 # a primary bitstream
192 if ($first_inorder_ext) {
193 # parse user-define file extension names
194 my @extfiles_list = split /,/, $first_inorder_ext;
195 my (@rest_fnames) = @fnamemime_list;
196 my @matched_list = ();
197 foreach my $file_ext (@extfiles_list) {
198 $pos = 0;
199 foreach my $allfiles (@fnamemime_list){
200 $allfiles->{'file'} =~ /^(.*)\.(.*?)$/;
201 my $allfiles_ext = $2;
202
203 if ($allfiles_ext =~ /$file_ext/) {
204 print $outhandle "Existing file:$allfiles->{'file'} match the user-define File Extension:$file_ext\n";
205 push (@matched_list, $allfiles);
206
207 # delete the matched extension file from the array
208 splice(@rest_fnames,$pos,1);
209
210 return (\@matched_list, \@rest_fnames);
211
212 }
213 $pos++;
214 }
215 }
216 }
217
218 if ($first_inorder_mime) {
219 # parse user-define file mimetype
220 my @file_mime_list = split /,/, $first_inorder_mime;
221 my (@rest_fnames) = @fnamemime_list;
222 my @matched_list = ();
223 foreach my $file_mime (@file_mime_list) {
224 $pos = 0;
225 foreach my $allfiles (@fnamemime_list){
226 my $allfiles_mime = $allfiles->{'mimetype'};
227
228 if ($allfiles_mime =~ /$file_mime/) {
229 print $outhandle "Existing file:$allfiles->{'file'} match the user-defined File MimeType:$file_mime\n";
230 push (@matched_list, $allfiles);
231
232 # delete the matched MIMEType file from the array
233 splice(@rest_fnames,$pos,1);
234 return (\@matched_list, \@rest_fnames);
235 }
236 $pos++;
237 }
238 }
239 }
240 return (\@fnamemime_list, \@assocmime_list);
241}
242
243
244sub filemime_list_to_re
245{
246 my $self = shift (@_);
247 my ($fnamemime_list) = @_;
248
249 my @fname_list = map { "(".$_->{'file'}.")" } @$fnamemime_list;
250 my $fname_re = join("|",@fname_list);
251 $fname_re =~ s/\./\\\./g;
252
253 return $fname_re;
254}
255
256# Read dublin_core metadata from DSpace collection
257sub metadata_read {
258 my $self = shift (@_);
259 my ($pluginfo, $base_dir, $file, $block_hash,
260 $extrametakeys, $extrametadata, $extrametafile,
261 $processor, $gli, $aux) = @_;
262
263 my $only_first_doc = $self->{'only_first_doc'};
264 my $first_inorder_ext = $self->{'first_inorder_ext'};
265 my $first_inorder_mime = $self->{'first_inorder_mime'};
266
267 my $outhandle = $self->{'outhandle'};
268
269 my $filename = &util::filename_cat($base_dir, $file);
270 # return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
271
272 if ($filename !~ /dublin_core\.xml$/ || !-f $filename) {
273 return undef;
274 }
275
276 print $outhandle "DSpacePlugin: extracting metadata from $file\n"
277 if $self->{'verbosity'} > 1;
278
279 my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;
280
281 eval {
282 $self->{'parser'}->parsefile($filename);
283 };
284
285 if ($@) {
286 die "DSpacePlugin: ERROR $filename is not a well formed dublin_core.xml file ($@)\n";
287 }
288
289 my $mimetype_list = $self->{'saved_metadata'}->{'ex.dc.Format^mimetype'};
290 my ($doc_file_mimes, $assoc_file_mimes) = $self->read_content($dir, $only_first_doc, $first_inorder_ext,
291 $first_inorder_mime, $mimetype_list);
292
293 my $file_re = $self->filemime_list_to_re($doc_file_mimes);
294
295 if ($only_first_doc || $first_inorder_ext || $first_inorder_mime) {
296 foreach my $afm ( @$assoc_file_mimes ) {
297 my $full_af = &util::filename_cat($dir,$afm->{'file'});
298 $self->{'extra_blocks'}->{$full_af} = 1;
299 }
300 }
301 &extrametautil::addmetakey($extrametakeys, $file_re);
302
303 # See Format's Extent section in http://dublincore.org/documents/usageguide/qualifiers.shtml
304 # it could specify duration, size or even dimensions of the resource. It may be a useful piece
305 # of metadata to preserve after all.
306 #if (defined $self->{'saved_metadata'}->{'ex.dc.Format^extent'}) {
307 #delete $self->{'saved_metadata'}->{'ex.dc.Format^extent'};
308 #}
309
310 if (defined $mimetype_list) {
311 delete $self->{'saved_metadata'}->{'ex.dc.Format^mimetype'};
312
313 # Temporarily store associate file info in metadata table
314 # This will be removed in 'extra_metadata' in BasePlugin and used
315 # to perform the actual file association (once the doc obj has
316 # been formed
317
318 my $main_doc = $doc_file_mimes->[0];
319 my $md_mimetype = $main_doc->{'mimetype'};
320
321 my $pd_lookup = $primary_doc_lookup->{$md_mimetype};
322
323 if (defined $pd_lookup) {
324 my $filter_re = $pd_lookup;
325 @$assoc_file_mimes = grep { $_->{'file'} !~ m/$filter_re/ } @$assoc_file_mimes;
326 }
327
328 my @gsdlassocfile_tobe
329 = map { &util::filename_cat($dir,$_->{'file'}) .":".$_->{'mimetype'}.":" } @$assoc_file_mimes if @$assoc_file_mimes;
330 $self->{'saved_metadata'}->{'gsdlassocfile_tobe'} = \@gsdlassocfile_tobe;
331
332 }
333
334 &extrametautil::setmetadata($extrametadata, $file_re, $self->{'saved_metadata'});
335
336 return 1;
337}
338
339
340# The DSpacePlugin read() function. We are not actually reading any documents
341# here, just blocking ones that have been processed by metadata read.
342#
343# Returns 0 for a file its blocking, undef for any other
344sub read {
345 my $self = shift (@_);
346 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
347 my $outhandle = $self->{'outhandle'};
348
349 # Block all files except contents
350 my $filename = &util::filename_cat($base_dir, $file);
351 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
352
353 my $assocfile = $metadata->{'assocfile'};
354
355 return 0 if (($filename =~ /dublin_core\.xml$/) || ($filename =~ /contents$/));
356 return 0 if (defined $self->{'extra_blocks'}->{$filename});
357 return undef;
358}
359
360sub Doctype {
361 my ($expat, $name, $sysid, $pubid, $internal) = @_;
362
363 die if ($name !~ /^dublin_core$/);
364}
365
366sub StartTag {
367 my ($expat, $element) = @_;
368 if ($element eq "dublin_core") {
369 $self->{'saved_metadata'} = {};
370 } elsif ($element eq "dcvalue") {
371 my $metaname = $_{'element'};
372 my $qualifier = $_{'qualifier'}||"";
373 if ($metaname ne "description" || $qualifier ne "provenance") {
374 $metaname .= "^$qualifier" if ($qualifier ne "none" && $qualifier ne "");
375 $self->{'metaname'} = "ex.dc.\u$metaname";
376 }
377 }
378}
379
380sub EndTag {
381 my ($expat, $element) = @_;
382
383 if ($element eq "dcvalue") {
384 $self->{'metaname'} = "";
385 }
386}
387
388sub Text {
389 if (defined ($self->{'metaname'}) && $self->{'metaname'} ne "") {
390 # $_ == Metadata content
391 my $mname = $self->{'metaname'};
392 my $mvalue = prepareMetadataValue($_);
393 if (defined $self->{'saved_metadata'}->{$mname}) {
394 # accumulate - add value to existing value(s)
395 if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") {
396 push (@{$self->{'saved_metadata'}->{$mname}}, $mvalue);
397 } else {
398 $self->{'saved_metadata'}->{$mname} =
399 [$self->{'saved_metadata'}->{$mname}, $mvalue];
400 }
401 } else {
402 # accumulate - add value into (currently empty) array
403 $self->{'saved_metadata'}->{$mname} = [$mvalue];
404 }
405
406 }
407}
408
409# Prepare DSpace metadata for using with Greenstone.
410# Some value must be escaped.
411sub prepareMetadataValue {
412 my ($value) = @_;
413
414 $value =~ s/\[/&#091;/g;
415 $value =~ s/\]/&#093;/g;
416
417 return $value;
418 }
419# This Char function overrides the one in XML::Parser::Stream to overcome a
420# problem where $expat->{Text} is treated as the return value, slowing
421# things down significantly in some cases.
422sub Char {
423 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
424 $_[0]->{'Text'} .= $_[1];
425 return undef;
426}
427
4281;
Note: See TracBrowser for help on using the repository browser.