source: main/trunk/greenstone2/perllib/plugins/DSpacePlugin.pm@ 24971

Last change on this file since 24971 was 24971, checked in by ak19, 12 years ago
  1. Introduced the util::filepath_to_url_format subroutine which will be used to convert filenames to URL style filenames to match the slashes used in the filename regex-es in extrameta keys used to index into extrameta data structures. 2. Fixed bug on windows where metadata.xml specifies filenames as regex with backslash in front of the file extension's period mark: DirectoryPlugin needed to unregex the filepath before calling fileparse on it, else the escaping backslash would interfere with perl's fileparse routine (only on windows, since backslash also represents a dirsep here). 3. Updated all those perl plugins where the new util::filepath_to_url_format needs to be called so that they use URL style filenames (thereafter regexed) to index into the extrameta data structures.
  • Property svn:keywords set to Author Date Id Revision
File size: 13.3 KB
RevLine 
[10549]1
[8511]2###########################################################################
3#
[15872]4# DSpacePlugin.pm -- plugin for importing a collection from DSpace
[8511]5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
[15872]10# Copyright (C) 2004 New Zealand Digital Library Project
[8511]11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
[15872]28
[8511]29# This plugin takes "contents" and dublin_core.xml file, which contain
30# Metadata and lists of associated files for a particular document
31# and produces a document containing sections, one for each page.
32# The files should be named "contents" and "dublin_core.xml". For each of
33# document in DSpace, it is stored in one directory
34#
35# The format of the "contents" file is as follows:
36#
37# File.type bundle:ORIGINAL
38# license.txt bundle:LICENSE
39# The format of the "dublin_core.xml" file is as follows:
40# The first line contains any metadata for the whole document
41# <dublin_core>
42# eg.
43# <dcvalue element="Title" qualifier="">Snail farming</dcvalue>
44# <dcvalue element="date" qualifier="">2004-10-15</dcvalue>
45#
46
[15872]47package DSpacePlugin;
[8511]48
[24951]49use extrametautil;
[19746]50use ReadTextFile;
[8511]51use plugin;
[24971]52use util;
[8511]53use XMLParser;
[10254]54use strict;
55no strict 'refs'; # allow filehandles to be variables and viceversa
[8511]56
57sub BEGIN {
[19746]58 @DSpacePlugin::ISA = ('ReadTextFile');
[8511]59}
60
61my $arguments =
62 [ { 'name' => "process_exp",
[15872]63 'desc' => "{BasePlugin.process_exp}",
[8511]64 'type' => "string",
65 'deft' => &get_default_process_exp(),
66 'reqd' => "no" },
67 { 'name' => "only_first_doc",
[15872]68 'desc' => "{DSpacePlugin.only_first_doc}",
[8511]69 'type' => "flag",
70 'reqd' => "no" },
[9168]71 { 'name' => "first_inorder_ext",
[15872]72 'desc' => "{DSpacePlugin.first_inorder_ext}",
[8891]73 'type' => "string",
[8511]74 'reqd' => "no" },
75 { 'name' => "first_inorder_mime",
[15872]76 'desc' => "{DSpacePlugin.first_inorder_mime}",
[8511]77 'type' => "flag",
78 'reqd' => "no" },
79 { 'name' => "block_exp",
[15872]80 'desc' => "{BasePlugin.block_exp}",
[8891]81 'type' => "regexp",
[8511]82 'deft' => &get_default_block_exp(),
83 'reqd' => "no" }];
84
85
[15872]86my $options = { 'name' => "DSpacePlugin",
87 'desc' => "{DSpacePlugin.desc}",
[8511]88 'inherits' => "yes",
[11676]89 'abstract' => "no",
[8511]90 'args' => $arguments };
91
92
93my $primary_doc_lookup = { 'text/html' => '(?i)\.(gif|jpe?g|jpe|jpg|png|css)$' };
94
[8913]95# Important variation to regular plugin structure. Need to declare
96# $self as global variable to file so XMLParser callback routines
97# can access the content of the object.
98my ($self);
99
[8511]100sub new {
[10218]101 my ($class) = shift (@_);
102 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
103 push(@$pluginlist, $class);
[8511]104
[15872]105 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
106 push(@{$hashArgOptLists->{"OptList"}},$options);
[8511]107
[19746]108 $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
[8511]109
[15872]110 if ($self->{'info_only'}) {
111 # don't worry about creating the XML parser as all we want is the
112 # list of plugin options
113 return bless $self, $class;
114 }
115
[8511]116 #create XML::Parser object for parsing dublin_core.xml files
117 my $parser = new XML::Parser('Style' => 'Stream',
118 'Handlers' => {'Char' => \&Char,
119 'Doctype' => \&Doctype
120 });
121 $self->{'parser'} = $parser;
122 $self->{'extra_blocks'} = {};
123
124 return bless $self, $class;
125}
126
127sub get_default_process_exp {
128 my $self = shift (@_);
129
[19746]130 return q^(?i)(contents)$^;
[8511]131}
132
133# want to block all files except the "contents"
134sub get_default_block_exp {
135 my $self = shift (@_);
136
[19746]137 # Block handle and txt files if present. Specifically don't block dublin_core xml
138 return q^(?i)(handle|\.tx?t)$^;
[8511]139}
140
[19746]141sub store_block_files_BACKUP
142{
143 # Option of making blocking sensitive to files that are in directory
144 # This subroutine is not currently used! (relies on default block expression stopping all handle and .txt files)
145
146 my $self =shift (@_);
147 my ($filename_full_path, $block_hash) = @_;
148
149 my ($tailname, $contents_basedir, $suffix) = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
150 my $handle_filename = &util::filename_cat($contents_basedir,"handle");
151
[23419]152 if (&util::fd_exists($handle_filename)) {
[23561]153 &util::block_filename($block_hash,$handle_filename);
[23419]154 }
[19746]155}
156
[8511]157sub read_content
158{
159 my $self = shift (@_);
160 my ($dir, $only_first_doc, $first_inorder_ext, $first_inorder_mime, $mimetype_list) = @_;
161 my $outhandle = $self->{'outhandle'};
162
163 my @fnamemime_list = ();
164 my @assocmime_list = ();
165
166 my $content_fname = &util::filename_cat($dir,"contents");
167
168 open(CIN,"<$content_fname")
169 || die "Unable to open $content_fname: $!\n";
170
171 my $line;
172 my $pos = 0;
173
174 while (defined ($line = <CIN>)) {
175 if ($line =~ m/^(.*)\s+bundle:ORIGINAL\s*$/) {
176 my $fname = $1;
177 my $mtype = $mimetype_list->[$pos];
178 my $fm_rec = { 'file' => $fname, 'mimetype' => $mtype};
179 push(@fnamemime_list,$fm_rec);
180 $pos++;
181 }
182 }
183 close CIN;
184
185 if ($only_first_doc){
186 my ($first_fname, @rest_fnames) = @fnamemime_list;
187 @fnamemime_list = ($first_fname);
188 @assocmime_list = @rest_fnames;
189 }
190
191 # allow user to specify the types of files (inorder)they would like to assign as
192 # a primary bitstream
193 if ($first_inorder_ext) {
194 # parse user-define file extension names
195 my @extfiles_list = split /,/, $first_inorder_ext;
196 my (@rest_fnames) = @fnamemime_list;
197 my @matched_list = ();
198 foreach my $file_ext (@extfiles_list) {
199 $pos = 0;
200 foreach my $allfiles (@fnamemime_list){
201 $allfiles->{'file'} =~ /^(.*)\.(.*?)$/;
202 my $allfiles_ext = $2;
203
204 if ($allfiles_ext =~ /$file_ext/) {
205 print $outhandle "Existing file:$allfiles->{'file'} match the user-define File Extension:$file_ext\n";
206 push (@matched_list, $allfiles);
207
208 # delete the matched extension file from the array
209 splice(@rest_fnames,$pos,1);
210
211 return (\@matched_list, \@rest_fnames);
212
213 }
214 $pos++;
215 }
216 }
217 }
218
219 if ($first_inorder_mime) {
220 # parse user-define file mimetype
221 my @file_mime_list = split /,/, $first_inorder_mime;
222 my (@rest_fnames) = @fnamemime_list;
223 my @matched_list = ();
224 foreach my $file_mime (@file_mime_list) {
225 $pos = 0;
226 foreach my $allfiles (@fnamemime_list){
227 my $allfiles_mime = $allfiles->{'mimetype'};
228
229 if ($allfiles_mime =~ /$file_mime/) {
230 print $outhandle "Existing file:$allfiles->{'file'} match the user-defined File MimeType:$file_mime\n";
231 push (@matched_list, $allfiles);
232
233 # delete the matched MIMEType file from the array
234 splice(@rest_fnames,$pos,1);
235 return (\@matched_list, \@rest_fnames);
236 }
237 $pos++;
238 }
239 }
240 }
241 return (\@fnamemime_list, \@assocmime_list);
242}
243
244
245sub filemime_list_to_re
246{
247 my $self = shift (@_);
248 my ($fnamemime_list) = @_;
249
250 my @fname_list = map { "(".$_->{'file'}.")" } @$fnamemime_list;
251 my $fname_re = join("|",@fname_list);
[24971]252
253 # Indexing into the extrameta data structures requires the filename's style of slashes to be in URL format
254 # Then need to convert the filename to a regex, no longer to protect windows directory chars \, but for
255 # protecting special characters like brackets in the filepath such as "C:\Program Files (x86)\Greenstone".
256 $fname_re = &util::filepath_to_url_format($fname_re); # just in case there are slashes in there
257
[8511]258 $fname_re =~ s/\./\\\./g;
259
260 return $fname_re;
261}
262
263# Read dublin_core metadata from DSpace collection
264sub metadata_read {
265 my $self = shift (@_);
[19493]266 my ($pluginfo, $base_dir, $file, $block_hash,
267 $extrametakeys, $extrametadata, $extrametafile,
[23212]268 $processor, $gli, $aux) = @_;
[8511]269
270 my $only_first_doc = $self->{'only_first_doc'};
271 my $first_inorder_ext = $self->{'first_inorder_ext'};
272 my $first_inorder_mime = $self->{'first_inorder_mime'};
273
274 my $outhandle = $self->{'outhandle'};
275
276 my $filename = &util::filename_cat($base_dir, $file);
277 # return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
278
279 if ($filename !~ /dublin_core\.xml$/ || !-f $filename) {
280 return undef;
281 }
282
[15872]283 print $outhandle "DSpacePlugin: extracting metadata from $file\n"
[8511]284 if $self->{'verbosity'} > 1;
285
286 my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;
287
288 eval {
289 $self->{'parser'}->parsefile($filename);
290 };
291
292 if ($@) {
[15872]293 die "DSpacePlugin: ERROR $filename is not a well formed dublin_core.xml file ($@)\n";
[8511]294 }
295
[24306]296 my $mimetype_list = $self->{'saved_metadata'}->{'ex.dc.Format^mimetype'};
[8511]297 my ($doc_file_mimes, $assoc_file_mimes) = $self->read_content($dir, $only_first_doc, $first_inorder_ext,
298 $first_inorder_mime, $mimetype_list);
[8913]299
[8511]300 my $file_re = $self->filemime_list_to_re($doc_file_mimes);
301
302 if ($only_first_doc || $first_inorder_ext || $first_inorder_mime) {
303 foreach my $afm ( @$assoc_file_mimes ) {
304 my $full_af = &util::filename_cat($dir,$afm->{'file'});
305 $self->{'extra_blocks'}->{$full_af} = 1;
306 }
307 }
[24951]308 &extrametautil::addmetakey($extrametakeys, $file_re);
[8511]309
[24355]310 # See Format's Extent section in http://dublincore.org/documents/usageguide/qualifiers.shtml
311 # it could specify duration, size or even dimensions of the resource. It may be a useful piece
312 # of metadata to preserve after all.
313 #if (defined $self->{'saved_metadata'}->{'ex.dc.Format^extent'}) {
314 #delete $self->{'saved_metadata'}->{'ex.dc.Format^extent'};
315 #}
[8511]316
317 if (defined $mimetype_list) {
[24306]318 delete $self->{'saved_metadata'}->{'ex.dc.Format^mimetype'};
[8511]319
320 # Temporarily store associate file info in metadata table
[15872]321 # This will be removed in 'extra_metadata' in BasePlugin and used
[8511]322 # to perform the actual file association (once the doc obj has
323 # been formed
324
325 my $main_doc = $doc_file_mimes->[0];
326 my $md_mimetype = $main_doc->{'mimetype'};
327
328 my $pd_lookup = $primary_doc_lookup->{$md_mimetype};
329
330 if (defined $pd_lookup) {
331 my $filter_re = $pd_lookup;
332 @$assoc_file_mimes = grep { $_->{'file'} !~ m/$filter_re/ } @$assoc_file_mimes;
333 }
334
335 my @gsdlassocfile_tobe
[24219]336 = map { &util::filename_cat($dir,$_->{'file'}) .":".$_->{'mimetype'}.":" } @$assoc_file_mimes if @$assoc_file_mimes;
[8511]337 $self->{'saved_metadata'}->{'gsdlassocfile_tobe'} = \@gsdlassocfile_tobe;
338
339 }
[8913]340
[24951]341 &extrametautil::setmetadata($extrametadata, $file_re, $self->{'saved_metadata'});
[8511]342
343 return 1;
344}
345
346
[15872]347# The DSpacePlugin read() function. We are not actually reading any documents
348# here, just blocking ones that have been processed by metadata read.
[8511]349#
[15872]350# Returns 0 for a file its blocking, undef for any other
[8511]351sub read {
352 my $self = shift (@_);
[16392]353 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[8511]354 my $outhandle = $self->{'outhandle'};
355
356 # Block all files except contents
357 my $filename = &util::filename_cat($base_dir, $file);
358 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
359
360 my $assocfile = $metadata->{'assocfile'};
361
362 return 0 if (($filename =~ /dublin_core\.xml$/) || ($filename =~ /contents$/));
363 return 0 if (defined $self->{'extra_blocks'}->{$filename});
364 return undef;
365}
366
367sub Doctype {
368 my ($expat, $name, $sysid, $pubid, $internal) = @_;
369
370 die if ($name !~ /^dublin_core$/);
371}
372
373sub StartTag {
374 my ($expat, $element) = @_;
375 if ($element eq "dublin_core") {
376 $self->{'saved_metadata'} = {};
[8913]377 } elsif ($element eq "dcvalue") {
[8511]378 my $metaname = $_{'element'};
[10549]379 my $qualifier = $_{'qualifier'}||"";
[21367]380 if ($metaname ne "description" || $qualifier ne "provenance") {
[9694]381 $metaname .= "^$qualifier" if ($qualifier ne "none" && $qualifier ne "");
[24306]382 $self->{'metaname'} = "ex.dc.\u$metaname";
[8511]383 }
384 }
385}
386
387sub EndTag {
388 my ($expat, $element) = @_;
389
390 if ($element eq "dcvalue") {
391 $self->{'metaname'} = "";
392 }
393}
394
395sub Text {
396 if (defined ($self->{'metaname'}) && $self->{'metaname'} ne "") {
397 # $_ == Metadata content
398 my $mname = $self->{'metaname'};
[21367]399 my $mvalue = prepareMetadataValue($_);
[8511]400 if (defined $self->{'saved_metadata'}->{$mname}) {
401 # accumulate - add value to existing value(s)
402 if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") {
[21367]403 push (@{$self->{'saved_metadata'}->{$mname}}, $mvalue);
[8511]404 } else {
405 $self->{'saved_metadata'}->{$mname} =
[21367]406 [$self->{'saved_metadata'}->{$mname}, $mvalue];
[8511]407 }
408 } else {
409 # accumulate - add value into (currently empty) array
[21367]410 $self->{'saved_metadata'}->{$mname} = [$mvalue];
[8511]411 }
412
413 }
414}
415
[21367]416# Prepare DSpace metadata for using with Greenstone.
417# Some value must be escaped.
418sub prepareMetadataValue {
419 my ($value) = @_;
420
421 $value =~ s/\[/&#091;/g;
422 $value =~ s/\]/&#093;/g;
423
424 return $value;
425 }
[8511]426# This Char function overrides the one in XML::Parser::Stream to overcome a
427# problem where $expat->{Text} is treated as the return value, slowing
428# things down significantly in some cases.
429sub Char {
[9462]430 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
431 $_[0]->{'Text'} .= $_[1];
432 return undef;
[8511]433}
434
4351;
Note: See TracBrowser for help on using the repository browser.