source: main/trunk/greenstone2/perllib/plugins/DSpacePlugin.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 7 years ago

renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes

  • Property svn:keywords set to Author Date Id Revision
File size: 13.4 KB
Line 
1
2###########################################################################
3#
4# DSpacePlugin.pm -- plugin for importing a collection from DSpace
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2004 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This plugin takes "contents" and dublin_core.xml file, which contain
30# Metadata and lists of associated files for a particular document
31# and produces a document containing sections, one for each page.
32# The files should be named "contents" and "dublin_core.xml". For each of
33# document in DSpace, it is stored in one directory
34#
35# The format of the "contents" file is as follows:
36#
37# File.type bundle:ORIGINAL
38# license.txt bundle:LICENSE
39# The format of the "dublin_core.xml" file is as follows:
40# The first line contains any metadata for the whole document
41# <dublin_core>
42# eg.
43# <dcvalue element="Title" qualifier="">Snail farming</dcvalue>
44# <dcvalue element="date" qualifier="">2004-10-15</dcvalue>
45#
46
47package DSpacePlugin;
48
49use extrametautil;
50use ReadTextFile;
51use plugin;
52use util;
53use FileUtils;
54use XMLParser;
55use strict;
56no strict 'refs'; # allow filehandles to be variables and viceversa
57
58sub BEGIN {
59 @DSpacePlugin::ISA = ('ReadTextFile');
60}
61
62my $arguments =
63 [ { 'name' => "process_exp",
64 'desc' => "{BaseImporter.process_exp}",
65 'type' => "string",
66 'deft' => &get_default_process_exp(),
67 'reqd' => "no" },
68 { 'name' => "only_first_doc",
69 'desc' => "{DSpacePlugin.only_first_doc}",
70 'type' => "flag",
71 'reqd' => "no" },
72 { 'name' => "first_inorder_ext",
73 'desc' => "{DSpacePlugin.first_inorder_ext}",
74 'type' => "string",
75 'reqd' => "no" },
76 { 'name' => "first_inorder_mime",
77 'desc' => "{DSpacePlugin.first_inorder_mime}",
78 'type' => "flag",
79 'reqd' => "no" },
80 { 'name' => "block_exp",
81 'desc' => "{BaseImporter.block_exp}",
82 'type' => "regexp",
83 'deft' => &get_default_block_exp(),
84 'reqd' => "no" }];
85
86
87my $options = { 'name' => "DSpacePlugin",
88 'desc' => "{DSpacePlugin.desc}",
89 'inherits' => "yes",
90 'abstract' => "no",
91 'args' => $arguments };
92
93
94my $primary_doc_lookup = { 'text/html' => '(?i)\.(gif|jpe?g|jpe|jpg|png|css)$' };
95
96# Important variation to regular plugin structure. Need to declare
97# $self as global variable to file so XMLParser callback routines
98# can access the content of the object.
99my ($self);
100
101sub new {
102 my ($class) = shift (@_);
103 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
104 push(@$pluginlist, $class);
105
106 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
107 push(@{$hashArgOptLists->{"OptList"}},$options);
108
109 $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
110
111 if ($self->{'info_only'}) {
112 # don't worry about creating the XML parser as all we want is the
113 # list of plugin options
114 return bless $self, $class;
115 }
116
117 #create XML::Parser object for parsing dublin_core.xml files
118 my $parser = new XML::Parser('Style' => 'Stream',
119 'Handlers' => {'Char' => \&Char,
120 'Doctype' => \&Doctype
121 });
122 $self->{'parser'} = $parser;
123 $self->{'extra_blocks'} = {};
124
125 return bless $self, $class;
126}
127
128sub get_default_process_exp {
129 my $self = shift (@_);
130
131 return q^(?i)(contents)$^;
132}
133
134# want to block all files except the "contents"
135sub get_default_block_exp {
136 my $self = shift (@_);
137
138 # Block handle and txt files if present. Specifically don't block dublin_core xml
139 return q^(?i)(handle|\.tx?t)$^;
140}
141
142sub store_block_files_BACKUP
143{
144 # Option of making blocking sensitive to files that are in directory
145 # This subroutine is not currently used! (relies on default block expression stopping all handle and .txt files)
146
147 my $self =shift (@_);
148 my ($filename_full_path, $block_hash) = @_;
149
150 my ($tailname, $contents_basedir, $suffix) = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
151 my $handle_filename = &FileUtils::filenameConcatenate($contents_basedir,"handle");
152
153 if (&FileUtils::fileTest($handle_filename)) {
154 $self->block_raw_filename($block_hash,$handle_filename);
155 }
156}
157
158sub read_content
159{
160 my $self = shift (@_);
161 my ($dir, $only_first_doc, $first_inorder_ext, $first_inorder_mime, $mimetype_list) = @_;
162 my $outhandle = $self->{'outhandle'};
163
164 my @fnamemime_list = ();
165 my @assocmime_list = ();
166
167 my $content_fname = &FileUtils::filenameConcatenate($dir,"contents");
168
169 open(CIN,"<$content_fname")
170 || die "Unable to open $content_fname: $!\n";
171
172 my $line;
173 my $pos = 0;
174
175 while (defined ($line = <CIN>)) {
176 if ($line =~ m/^(.*)\s+bundle:ORIGINAL\s*$/) {
177 my $fname = $1;
178 my $mtype = $mimetype_list->[$pos];
179 my $fm_rec = { 'file' => $fname, 'mimetype' => $mtype};
180 push(@fnamemime_list,$fm_rec);
181 $pos++;
182 }
183 }
184 close CIN;
185
186 if ($only_first_doc){
187 my ($first_fname, @rest_fnames) = @fnamemime_list;
188 @fnamemime_list = ($first_fname);
189 @assocmime_list = @rest_fnames;
190 }
191
192 # allow user to specify the types of files (inorder)they would like to assign as
193 # a primary bitstream
194 if ($first_inorder_ext) {
195 # parse user-define file extension names
196 my @extfiles_list = split /,/, $first_inorder_ext;
197 my (@rest_fnames) = @fnamemime_list;
198 my @matched_list = ();
199 foreach my $file_ext (@extfiles_list) {
200 $pos = 0;
201 foreach my $allfiles (@fnamemime_list){
202 $allfiles->{'file'} =~ /^(.*)\.(.*?)$/;
203 my $allfiles_ext = $2;
204
205 if ($allfiles_ext =~ /$file_ext/) {
206 print $outhandle "Existing file:$allfiles->{'file'} match the user-define File Extension:$file_ext\n";
207 push (@matched_list, $allfiles);
208
209 # delete the matched extension file from the array
210 splice(@rest_fnames,$pos,1);
211
212 return (\@matched_list, \@rest_fnames);
213
214 }
215 $pos++;
216 }
217 }
218 }
219
220 if ($first_inorder_mime) {
221 # parse user-define file mimetype
222 my @file_mime_list = split /,/, $first_inorder_mime;
223 my (@rest_fnames) = @fnamemime_list;
224 my @matched_list = ();
225 foreach my $file_mime (@file_mime_list) {
226 $pos = 0;
227 foreach my $allfiles (@fnamemime_list){
228 my $allfiles_mime = $allfiles->{'mimetype'};
229
230 if ($allfiles_mime =~ /$file_mime/) {
231 print $outhandle "Existing file:$allfiles->{'file'} match the user-defined File MimeType:$file_mime\n";
232 push (@matched_list, $allfiles);
233
234 # delete the matched MIMEType file from the array
235 splice(@rest_fnames,$pos,1);
236 return (\@matched_list, \@rest_fnames);
237 }
238 $pos++;
239 }
240 }
241 }
242 return (\@fnamemime_list, \@assocmime_list);
243}
244
245
246sub filemime_list_to_re
247{
248 my $self = shift (@_);
249 my ($fnamemime_list) = @_;
250
251 my @fname_list = map { "(".$_->{'file'}.")" } @$fnamemime_list;
252 my $fname_re = join("|",@fname_list);
253
254 # Indexing into the extrameta data structures requires the filename's style of slashes to be in URL format
255 # Then need to convert the filename to a regex, no longer to protect windows directory chars \, but for
256 # protecting special characters like brackets in the filepath such as "C:\Program Files (x86)\Greenstone".
257 $fname_re = &util::filepath_to_url_format($fname_re); # just in case there are slashes in there
258
259 $fname_re =~ s/\./\\\./g;
260
261 return $fname_re;
262}
263
264# Read dublin_core metadata from DSpace collection
265sub metadata_read {
266 my $self = shift (@_);
267 my ($pluginfo, $base_dir, $file, $block_hash,
268 $extrametakeys, $extrametadata, $extrametafile,
269 $processor, $gli, $aux) = @_;
270
271 my $only_first_doc = $self->{'only_first_doc'};
272 my $first_inorder_ext = $self->{'first_inorder_ext'};
273 my $first_inorder_mime = $self->{'first_inorder_mime'};
274
275 my $outhandle = $self->{'outhandle'};
276
277 my $filename = &FileUtils::filenameConcatenate($base_dir, $file);
278 # return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
279
280 if ($filename !~ /dublin_core\.xml$/ || !-f $filename) {
281 return undef;
282 }
283
284 print $outhandle "DSpacePlugin: extracting metadata from $file\n"
285 if $self->{'verbosity'} > 1;
286
287 my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;
288
289 eval {
290 $self->{'parser'}->parsefile($filename);
291 };
292
293 if ($@) {
294 die "DSpacePlugin: ERROR $filename is not a well formed dublin_core.xml file ($@)\n";
295 }
296
297 my $mimetype_list = $self->{'saved_metadata'}->{'ex.dc.Format^mimetype'};
298 my ($doc_file_mimes, $assoc_file_mimes) = $self->read_content($dir, $only_first_doc, $first_inorder_ext,
299 $first_inorder_mime, $mimetype_list);
300
301 my $file_re = $self->filemime_list_to_re($doc_file_mimes);
302
303 if ($only_first_doc || $first_inorder_ext || $first_inorder_mime) {
304 foreach my $afm ( @$assoc_file_mimes ) {
305 my $full_af = &FileUtils::filenameConcatenate($dir,$afm->{'file'});
306 $self->{'extra_blocks'}->{$full_af} = 1;
307 }
308 }
309 &extrametautil::addmetakey($extrametakeys, $file_re);
310
311 # See Format's Extent section in http://dublincore.org/documents/usageguide/qualifiers.shtml
312 # it could specify duration, size or even dimensions of the resource. It may be a useful piece
313 # of metadata to preserve after all.
314 #if (defined $self->{'saved_metadata'}->{'ex.dc.Format^extent'}) {
315 #delete $self->{'saved_metadata'}->{'ex.dc.Format^extent'};
316 #}
317
318 if (defined $mimetype_list) {
319 delete $self->{'saved_metadata'}->{'ex.dc.Format^mimetype'};
320
321 # Temporarily store associate file info in metadata table
322 # This will be removed in 'extra_metadata' in BaseImporter and used
323 # to perform the actual file association (once the doc obj has
324 # been formed
325
326 my $main_doc = $doc_file_mimes->[0];
327 my $md_mimetype = $main_doc->{'mimetype'};
328
329 my $pd_lookup = $primary_doc_lookup->{$md_mimetype};
330
331 if (defined $pd_lookup) {
332 my $filter_re = $pd_lookup;
333 @$assoc_file_mimes = grep { $_->{'file'} !~ m/$filter_re/ } @$assoc_file_mimes;
334 }
335
336 my @gsdlassocfile_tobe
337 = map { &FileUtils::filenameConcatenate($dir,$_->{'file'}) .":".$_->{'mimetype'}.":" } @$assoc_file_mimes if @$assoc_file_mimes;
338 $self->{'saved_metadata'}->{'gsdlassocfile_tobe'} = \@gsdlassocfile_tobe;
339
340 }
341
342 &extrametautil::setmetadata($extrametadata, $file_re, $self->{'saved_metadata'});
343
344 return 1;
345}
346
347
348# The DSpacePlugin read() function. We are not actually reading any documents
349# here, just blocking ones that have been processed by metadata read.
350#
351# Returns 0 for a file its blocking, undef for any other
352sub read {
353 my $self = shift (@_);
354 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
355 my $outhandle = $self->{'outhandle'};
356
357 # Block all files except contents
358 my $filename = &FileUtils::filenameConcatenate($base_dir, $file);
359 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
360
361 my $assocfile = $metadata->{'assocfile'};
362
363 return 0 if (($filename =~ /dublin_core\.xml$/) || ($filename =~ /contents$/));
364 return 0 if (defined $self->{'extra_blocks'}->{$filename});
365 return undef;
366}
367
368sub Doctype {
369 my ($expat, $name, $sysid, $pubid, $internal) = @_;
370
371 die if ($name !~ /^dublin_core$/);
372}
373
374sub StartTag {
375 my ($expat, $element) = @_;
376 if ($element eq "dublin_core") {
377 $self->{'saved_metadata'} = {};
378 } elsif ($element eq "dcvalue") {
379 my $metaname = $_{'element'};
380 my $qualifier = $_{'qualifier'}||"";
381 if ($metaname ne "description" || $qualifier ne "provenance") {
382 $metaname .= "^$qualifier" if ($qualifier ne "none" && $qualifier ne "");
383 $self->{'metaname'} = "ex.dc.\u$metaname";
384 }
385 }
386}
387
388sub EndTag {
389 my ($expat, $element) = @_;
390
391 if ($element eq "dcvalue") {
392 $self->{'metaname'} = "";
393 }
394}
395
396sub Text {
397 if (defined ($self->{'metaname'}) && $self->{'metaname'} ne "") {
398 # $_ == Metadata content
399 my $mname = $self->{'metaname'};
400 my $mvalue = prepareMetadataValue($_);
401 if (defined $self->{'saved_metadata'}->{$mname}) {
402 # accumulate - add value to existing value(s)
403 if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") {
404 push (@{$self->{'saved_metadata'}->{$mname}}, $mvalue);
405 } else {
406 $self->{'saved_metadata'}->{$mname} =
407 [$self->{'saved_metadata'}->{$mname}, $mvalue];
408 }
409 } else {
410 # accumulate - add value into (currently empty) array
411 $self->{'saved_metadata'}->{$mname} = [$mvalue];
412 }
413
414 }
415}
416
417# Prepare DSpace metadata for using with Greenstone.
418# Some value must be escaped.
419sub prepareMetadataValue {
420 my ($value) = @_;
421
422 $value =~ s/\[/&#091;/g;
423 $value =~ s/\]/&#093;/g;
424
425 return $value;
426 }
427# This Char function overrides the one in XML::Parser::Stream to overcome a
428# problem where $expat->{Text} is treated as the return value, slowing
429# things down significantly in some cases.
430sub Char {
431 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
432 $_[0]->{'Text'} .= $_[1];
433 return undef;
434}
435
4361;
Note: See TracBrowser for help on using the repository browser.