source: main/trunk/greenstone2/perllib/plugins/DSpacePlugin.pm@ 32589

Last change on this file since 32589 was 31494, checked in by kjdon, 7 years ago

updated text string keys based on new plugin names

  • Property svn:keywords set to Author Date Id Revision
File size: 13.4 KB
RevLine 
[10549]1
[8511]2###########################################################################
3#
[15872]4# DSpacePlugin.pm -- plugin for importing a collection from DSpace
[8511]5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
[15872]10# Copyright (C) 2004 New Zealand Digital Library Project
[8511]11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
[15872]28
[8511]29# This plugin takes "contents" and dublin_core.xml file, which contain
30# Metadata and lists of associated files for a particular document
31# and produces a document containing sections, one for each page.
32# The files should be named "contents" and "dublin_core.xml". For each of
33# document in DSpace, it is stored in one directory
34#
35# The format of the "contents" file is as follows:
36#
37# File.type bundle:ORIGINAL
38# license.txt bundle:LICENSE
39# The format of the "dublin_core.xml" file is as follows:
40# The first line contains any metadata for the whole document
41# <dublin_core>
42# eg.
43# <dcvalue element="Title" qualifier="">Snail farming</dcvalue>
44# <dcvalue element="date" qualifier="">2004-10-15</dcvalue>
45#
46
[15872]47package DSpacePlugin;
[8511]48
[24951]49use extrametautil;
[19746]50use ReadTextFile;
[8511]51use plugin;
[24971]52use util;
[28563]53use FileUtils;
[8511]54use XMLParser;
[10254]55use strict;
56no strict 'refs'; # allow filehandles to be variables and viceversa
[8511]57
58sub BEGIN {
[19746]59 @DSpacePlugin::ISA = ('ReadTextFile');
[8511]60}
61
62my $arguments =
63 [ { 'name' => "process_exp",
[31492]64 'desc' => "{BaseImporter.process_exp}",
[8511]65 'type' => "string",
66 'deft' => &get_default_process_exp(),
67 'reqd' => "no" },
68 { 'name' => "only_first_doc",
[15872]69 'desc' => "{DSpacePlugin.only_first_doc}",
[8511]70 'type' => "flag",
71 'reqd' => "no" },
[9168]72 { 'name' => "first_inorder_ext",
[15872]73 'desc' => "{DSpacePlugin.first_inorder_ext}",
[8891]74 'type' => "string",
[8511]75 'reqd' => "no" },
76 { 'name' => "first_inorder_mime",
[15872]77 'desc' => "{DSpacePlugin.first_inorder_mime}",
[8511]78 'type' => "flag",
79 'reqd' => "no" },
80 { 'name' => "block_exp",
[31494]81 'desc' => "{CommonUtil.block_exp}",
[8891]82 'type' => "regexp",
[8511]83 'deft' => &get_default_block_exp(),
84 'reqd' => "no" }];
85
86
[15872]87my $options = { 'name' => "DSpacePlugin",
88 'desc' => "{DSpacePlugin.desc}",
[8511]89 'inherits' => "yes",
[11676]90 'abstract' => "no",
[8511]91 'args' => $arguments };
92
93
94my $primary_doc_lookup = { 'text/html' => '(?i)\.(gif|jpe?g|jpe|jpg|png|css)$' };
95
[8913]96# Important variation to regular plugin structure. Need to declare
97# $self as global variable to file so XMLParser callback routines
98# can access the content of the object.
99my ($self);
100
[8511]101sub new {
[10218]102 my ($class) = shift (@_);
103 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
104 push(@$pluginlist, $class);
[8511]105
[15872]106 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
107 push(@{$hashArgOptLists->{"OptList"}},$options);
[8511]108
[19746]109 $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
[8511]110
[15872]111 if ($self->{'info_only'}) {
112 # don't worry about creating the XML parser as all we want is the
113 # list of plugin options
114 return bless $self, $class;
115 }
116
[8511]117 #create XML::Parser object for parsing dublin_core.xml files
118 my $parser = new XML::Parser('Style' => 'Stream',
119 'Handlers' => {'Char' => \&Char,
120 'Doctype' => \&Doctype
121 });
122 $self->{'parser'} = $parser;
123 $self->{'extra_blocks'} = {};
124
125 return bless $self, $class;
126}
127
128sub get_default_process_exp {
129 my $self = shift (@_);
130
[19746]131 return q^(?i)(contents)$^;
[8511]132}
133
134# want to block all files except the "contents"
135sub get_default_block_exp {
136 my $self = shift (@_);
137
[19746]138 # Block handle and txt files if present. Specifically don't block dublin_core xml
139 return q^(?i)(handle|\.tx?t)$^;
[8511]140}
141
[19746]142sub store_block_files_BACKUP
143{
144 # Option of making blocking sensitive to files that are in directory
145 # This subroutine is not currently used! (relies on default block expression stopping all handle and .txt files)
146
147 my $self =shift (@_);
148 my ($filename_full_path, $block_hash) = @_;
149
150 my ($tailname, $contents_basedir, $suffix) = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
[28563]151 my $handle_filename = &FileUtils::filenameConcatenate($contents_basedir,"handle");
[19746]152
[28563]153 if (&FileUtils::fileTest($handle_filename)) {
[31480]154 $self->block_raw_filename($block_hash,$handle_filename);
[23419]155 }
[19746]156}
157
[8511]158sub read_content
159{
160 my $self = shift (@_);
161 my ($dir, $only_first_doc, $first_inorder_ext, $first_inorder_mime, $mimetype_list) = @_;
162 my $outhandle = $self->{'outhandle'};
163
164 my @fnamemime_list = ();
165 my @assocmime_list = ();
166
[28563]167 my $content_fname = &FileUtils::filenameConcatenate($dir,"contents");
[8511]168
169 open(CIN,"<$content_fname")
170 || die "Unable to open $content_fname: $!\n";
171
172 my $line;
173 my $pos = 0;
174
175 while (defined ($line = <CIN>)) {
176 if ($line =~ m/^(.*)\s+bundle:ORIGINAL\s*$/) {
177 my $fname = $1;
178 my $mtype = $mimetype_list->[$pos];
179 my $fm_rec = { 'file' => $fname, 'mimetype' => $mtype};
180 push(@fnamemime_list,$fm_rec);
181 $pos++;
182 }
183 }
184 close CIN;
185
186 if ($only_first_doc){
187 my ($first_fname, @rest_fnames) = @fnamemime_list;
188 @fnamemime_list = ($first_fname);
189 @assocmime_list = @rest_fnames;
190 }
191
192 # allow user to specify the types of files (inorder)they would like to assign as
193 # a primary bitstream
194 if ($first_inorder_ext) {
195 # parse user-define file extension names
196 my @extfiles_list = split /,/, $first_inorder_ext;
197 my (@rest_fnames) = @fnamemime_list;
198 my @matched_list = ();
199 foreach my $file_ext (@extfiles_list) {
200 $pos = 0;
201 foreach my $allfiles (@fnamemime_list){
202 $allfiles->{'file'} =~ /^(.*)\.(.*?)$/;
203 my $allfiles_ext = $2;
204
205 if ($allfiles_ext =~ /$file_ext/) {
206 print $outhandle "Existing file:$allfiles->{'file'} match the user-define File Extension:$file_ext\n";
207 push (@matched_list, $allfiles);
208
209 # delete the matched extension file from the array
210 splice(@rest_fnames,$pos,1);
211
212 return (\@matched_list, \@rest_fnames);
213
214 }
215 $pos++;
216 }
217 }
218 }
219
220 if ($first_inorder_mime) {
221 # parse user-define file mimetype
222 my @file_mime_list = split /,/, $first_inorder_mime;
223 my (@rest_fnames) = @fnamemime_list;
224 my @matched_list = ();
225 foreach my $file_mime (@file_mime_list) {
226 $pos = 0;
227 foreach my $allfiles (@fnamemime_list){
228 my $allfiles_mime = $allfiles->{'mimetype'};
229
230 if ($allfiles_mime =~ /$file_mime/) {
231 print $outhandle "Existing file:$allfiles->{'file'} match the user-defined File MimeType:$file_mime\n";
232 push (@matched_list, $allfiles);
233
234 # delete the matched MIMEType file from the array
235 splice(@rest_fnames,$pos,1);
236 return (\@matched_list, \@rest_fnames);
237 }
238 $pos++;
239 }
240 }
241 }
242 return (\@fnamemime_list, \@assocmime_list);
243}
244
245
246sub filemime_list_to_re
247{
248 my $self = shift (@_);
249 my ($fnamemime_list) = @_;
250
251 my @fname_list = map { "(".$_->{'file'}.")" } @$fnamemime_list;
252 my $fname_re = join("|",@fname_list);
[24971]253
254 # Indexing into the extrameta data structures requires the filename's style of slashes to be in URL format
255 # Then need to convert the filename to a regex, no longer to protect windows directory chars \, but for
256 # protecting special characters like brackets in the filepath such as "C:\Program Files (x86)\Greenstone".
257 $fname_re = &util::filepath_to_url_format($fname_re); # just in case there are slashes in there
258
[8511]259 $fname_re =~ s/\./\\\./g;
260
261 return $fname_re;
262}
263
264# Read dublin_core metadata from DSpace collection
265sub metadata_read {
266 my $self = shift (@_);
[19493]267 my ($pluginfo, $base_dir, $file, $block_hash,
268 $extrametakeys, $extrametadata, $extrametafile,
[23212]269 $processor, $gli, $aux) = @_;
[8511]270
271 my $only_first_doc = $self->{'only_first_doc'};
272 my $first_inorder_ext = $self->{'first_inorder_ext'};
273 my $first_inorder_mime = $self->{'first_inorder_mime'};
274
275 my $outhandle = $self->{'outhandle'};
276
[28563]277 my $filename = &FileUtils::filenameConcatenate($base_dir, $file);
[8511]278 # return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
279
280 if ($filename !~ /dublin_core\.xml$/ || !-f $filename) {
281 return undef;
282 }
283
[15872]284 print $outhandle "DSpacePlugin: extracting metadata from $file\n"
[8511]285 if $self->{'verbosity'} > 1;
286
287 my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;
288
289 eval {
290 $self->{'parser'}->parsefile($filename);
291 };
292
293 if ($@) {
[15872]294 die "DSpacePlugin: ERROR $filename is not a well formed dublin_core.xml file ($@)\n";
[8511]295 }
296
[24306]297 my $mimetype_list = $self->{'saved_metadata'}->{'ex.dc.Format^mimetype'};
[8511]298 my ($doc_file_mimes, $assoc_file_mimes) = $self->read_content($dir, $only_first_doc, $first_inorder_ext,
299 $first_inorder_mime, $mimetype_list);
[8913]300
[8511]301 my $file_re = $self->filemime_list_to_re($doc_file_mimes);
302
303 if ($only_first_doc || $first_inorder_ext || $first_inorder_mime) {
304 foreach my $afm ( @$assoc_file_mimes ) {
[28563]305 my $full_af = &FileUtils::filenameConcatenate($dir,$afm->{'file'});
[8511]306 $self->{'extra_blocks'}->{$full_af} = 1;
307 }
308 }
[24951]309 &extrametautil::addmetakey($extrametakeys, $file_re);
[8511]310
[24355]311 # See Format's Extent section in http://dublincore.org/documents/usageguide/qualifiers.shtml
312 # it could specify duration, size or even dimensions of the resource. It may be a useful piece
313 # of metadata to preserve after all.
314 #if (defined $self->{'saved_metadata'}->{'ex.dc.Format^extent'}) {
315 #delete $self->{'saved_metadata'}->{'ex.dc.Format^extent'};
316 #}
[8511]317
318 if (defined $mimetype_list) {
[24306]319 delete $self->{'saved_metadata'}->{'ex.dc.Format^mimetype'};
[8511]320
321 # Temporarily store associate file info in metadata table
[31492]322 # This will be removed in 'extra_metadata' in BaseImporter and used
[8511]323 # to perform the actual file association (once the doc obj has
324 # been formed
325
326 my $main_doc = $doc_file_mimes->[0];
327 my $md_mimetype = $main_doc->{'mimetype'};
328
329 my $pd_lookup = $primary_doc_lookup->{$md_mimetype};
330
331 if (defined $pd_lookup) {
332 my $filter_re = $pd_lookup;
333 @$assoc_file_mimes = grep { $_->{'file'} !~ m/$filter_re/ } @$assoc_file_mimes;
334 }
335
336 my @gsdlassocfile_tobe
[28563]337 = map { &FileUtils::filenameConcatenate($dir,$_->{'file'}) .":".$_->{'mimetype'}.":" } @$assoc_file_mimes if @$assoc_file_mimes;
[8511]338 $self->{'saved_metadata'}->{'gsdlassocfile_tobe'} = \@gsdlassocfile_tobe;
339
340 }
[8913]341
[24951]342 &extrametautil::setmetadata($extrametadata, $file_re, $self->{'saved_metadata'});
[8511]343
344 return 1;
345}
346
347
[15872]348# The DSpacePlugin read() function. We are not actually reading any documents
349# here, just blocking ones that have been processed by metadata read.
[8511]350#
[15872]351# Returns 0 for a file its blocking, undef for any other
[8511]352sub read {
353 my $self = shift (@_);
[16392]354 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[8511]355 my $outhandle = $self->{'outhandle'};
356
357 # Block all files except contents
[28563]358 my $filename = &FileUtils::filenameConcatenate($base_dir, $file);
[8511]359 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
360
361 my $assocfile = $metadata->{'assocfile'};
362
363 return 0 if (($filename =~ /dublin_core\.xml$/) || ($filename =~ /contents$/));
364 return 0 if (defined $self->{'extra_blocks'}->{$filename});
365 return undef;
366}
367
368sub Doctype {
369 my ($expat, $name, $sysid, $pubid, $internal) = @_;
370
371 die if ($name !~ /^dublin_core$/);
372}
373
374sub StartTag {
375 my ($expat, $element) = @_;
376 if ($element eq "dublin_core") {
377 $self->{'saved_metadata'} = {};
[8913]378 } elsif ($element eq "dcvalue") {
[8511]379 my $metaname = $_{'element'};
[10549]380 my $qualifier = $_{'qualifier'}||"";
[21367]381 if ($metaname ne "description" || $qualifier ne "provenance") {
[9694]382 $metaname .= "^$qualifier" if ($qualifier ne "none" && $qualifier ne "");
[24306]383 $self->{'metaname'} = "ex.dc.\u$metaname";
[8511]384 }
385 }
386}
387
388sub EndTag {
389 my ($expat, $element) = @_;
390
391 if ($element eq "dcvalue") {
392 $self->{'metaname'} = "";
393 }
394}
395
396sub Text {
397 if (defined ($self->{'metaname'}) && $self->{'metaname'} ne "") {
398 # $_ == Metadata content
399 my $mname = $self->{'metaname'};
[21367]400 my $mvalue = prepareMetadataValue($_);
[8511]401 if (defined $self->{'saved_metadata'}->{$mname}) {
402 # accumulate - add value to existing value(s)
403 if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") {
[21367]404 push (@{$self->{'saved_metadata'}->{$mname}}, $mvalue);
[8511]405 } else {
406 $self->{'saved_metadata'}->{$mname} =
[21367]407 [$self->{'saved_metadata'}->{$mname}, $mvalue];
[8511]408 }
409 } else {
410 # accumulate - add value into (currently empty) array
[21367]411 $self->{'saved_metadata'}->{$mname} = [$mvalue];
[8511]412 }
413
414 }
415}
416
[21367]417# Prepare DSpace metadata for using with Greenstone.
418# Some value must be escaped.
419sub prepareMetadataValue {
420 my ($value) = @_;
421
422 $value =~ s/\[/&#091;/g;
423 $value =~ s/\]/&#093;/g;
424
425 return $value;
426 }
[8511]427# This Char function overrides the one in XML::Parser::Stream to overcome a
428# problem where $expat->{Text} is treated as the return value, slowing
429# things down significantly in some cases.
430sub Char {
[9462]431 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
432 $_[0]->{'Text'} .= $_[1];
433 return undef;
[8511]434}
435
4361;
Note: See TracBrowser for help on using the repository browser.