source: gsdl/trunk/perllib/plugins/DSpacePlugin.pm@ 15918

Last change on this file since 15918 was 15872, checked in by kjdon, 16 years ago

plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Alos, I haven't really tested anything yet...

  • Property svn:keywords set to Author Date Id Revision
File size: 11.6 KB
Line 
1
2###########################################################################
3#
4# DSpacePlugin.pm -- plugin for importing a collection from DSpace
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2004 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This plugin takes "contents" and dublin_core.xml file, which contain
30# Metadata and lists of associated files for a particular document
31# and produces a document containing sections, one for each page.
32# The files should be named "contents" and "dublin_core.xml". For each of
33# document in DSpace, it is stored in one directory
34#
35# The format of the "contents" file is as follows:
36#
37# File.type bundle:ORIGINAL
38# license.txt bundle:LICENSE
39# The format of the "dublin_core.xml" file is as follows:
40# The first line contains any metadata for the whole document
41# <dublin_core>
42# eg.
43# <dcvalue element="Title" qualifier="">Snail farming</dcvalue>
44# <dcvalue element="date" qualifier="">2004-10-15</dcvalue>
45#
46
47package DSpacePlugin;
48
49use BasePlugin;
50use plugin;
51use XMLParser;
52use strict;
53no strict 'refs'; # allow filehandles to be variables and viceversa
54
55sub BEGIN {
56 @DSpacePlugin::ISA = ('BasePlugin');
57}
58
59my $arguments =
60 [ { 'name' => "process_exp",
61 'desc' => "{BasePlugin.process_exp}",
62 'type' => "string",
63 'deft' => &get_default_process_exp(),
64 'reqd' => "no" },
65 { 'name' => "only_first_doc",
66 'desc' => "{DSpacePlugin.only_first_doc}",
67 'type' => "flag",
68 'reqd' => "no" },
69 { 'name' => "first_inorder_ext",
70 'desc' => "{DSpacePlugin.first_inorder_ext}",
71 'type' => "string",
72 'reqd' => "no" },
73 { 'name' => "first_inorder_mime",
74 'desc' => "{DSpacePlugin.first_inorder_mime}",
75 'type' => "flag",
76 'reqd' => "no" },
77 { 'name' => "block_exp",
78 'desc' => "{BasePlugin.block_exp}",
79 'type' => "regexp",
80 'deft' => &get_default_block_exp(),
81 'reqd' => "no" }];
82
83
84my $options = { 'name' => "DSpacePlugin",
85 'desc' => "{DSpacePlugin.desc}",
86 'inherits' => "yes",
87 'abstract' => "no",
88 'args' => $arguments };
89
90
91my $primary_doc_lookup = { 'text/html' => '(?i)\.(gif|jpe?g|jpe|jpg|png|css)$' };
92
93# Important variation to regular plugin structure. Need to declare
94# $self as global variable to file so XMLParser callback routines
95# can access the content of the object.
96my ($self);
97
98sub new {
99 my ($class) = shift (@_);
100 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
101 push(@$pluginlist, $class);
102
103 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
104 push(@{$hashArgOptLists->{"OptList"}},$options);
105
106 $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
107
108 if ($self->{'info_only'}) {
109 # don't worry about creating the XML parser as all we want is the
110 # list of plugin options
111 return bless $self, $class;
112 }
113
114 #create XML::Parser object for parsing dublin_core.xml files
115 my $parser = new XML::Parser('Style' => 'Stream',
116 'Handlers' => {'Char' => \&Char,
117 'Doctype' => \&Doctype
118 });
119 $self->{'parser'} = $parser;
120 $self->{'extra_blocks'} = {};
121
122 return bless $self, $class;
123}
124
125sub get_default_process_exp {
126 my $self = shift (@_);
127
128 return q^(?i)contents$^;
129}
130
131# want to block all files except the "contents"
132sub get_default_block_exp {
133 my $self = shift (@_);
134
135 # Block all files besides contents
136 return q^(?i)(handle|dublin_core\.xml|\.tx?t)$^;
137}
138
139sub read_content
140{
141 my $self = shift (@_);
142 my ($dir, $only_first_doc, $first_inorder_ext, $first_inorder_mime, $mimetype_list) = @_;
143 my $outhandle = $self->{'outhandle'};
144
145 my @fnamemime_list = ();
146 my @assocmime_list = ();
147
148 my $content_fname = &util::filename_cat($dir,"contents");
149
150 open(CIN,"<$content_fname")
151 || die "Unable to open $content_fname: $!\n";
152
153 my $line;
154 my $pos = 0;
155
156 while (defined ($line = <CIN>)) {
157 if ($line =~ m/^(.*)\s+bundle:ORIGINAL\s*$/) {
158 my $fname = $1;
159 my $mtype = $mimetype_list->[$pos];
160 my $fm_rec = { 'file' => $fname, 'mimetype' => $mtype};
161 push(@fnamemime_list,$fm_rec);
162 $pos++;
163 }
164 }
165 close CIN;
166
167 if ($only_first_doc){
168 my ($first_fname, @rest_fnames) = @fnamemime_list;
169 @fnamemime_list = ($first_fname);
170 @assocmime_list = @rest_fnames;
171 }
172
173 # allow user to specify the types of files (inorder)they would like to assign as
174 # a primary bitstream
175 if ($first_inorder_ext) {
176 # parse user-define file extension names
177 my @extfiles_list = split /,/, $first_inorder_ext;
178 my (@rest_fnames) = @fnamemime_list;
179 my @matched_list = ();
180 foreach my $file_ext (@extfiles_list) {
181 $pos = 0;
182 foreach my $allfiles (@fnamemime_list){
183 $allfiles->{'file'} =~ /^(.*)\.(.*?)$/;
184 my $allfiles_ext = $2;
185
186 if ($allfiles_ext =~ /$file_ext/) {
187 print $outhandle "Existing file:$allfiles->{'file'} match the user-define File Extension:$file_ext\n";
188 push (@matched_list, $allfiles);
189
190 # delete the matched extension file from the array
191 splice(@rest_fnames,$pos,1);
192
193 return (\@matched_list, \@rest_fnames);
194
195 }
196 $pos++;
197 }
198 }
199 }
200
201 if ($first_inorder_mime) {
202 # parse user-define file mimetype
203 my @file_mime_list = split /,/, $first_inorder_mime;
204 my (@rest_fnames) = @fnamemime_list;
205 my @matched_list = ();
206 foreach my $file_mime (@file_mime_list) {
207 $pos = 0;
208 foreach my $allfiles (@fnamemime_list){
209 my $allfiles_mime = $allfiles->{'mimetype'};
210
211 if ($allfiles_mime =~ /$file_mime/) {
212 print $outhandle "Existing file:$allfiles->{'file'} match the user-defined File MimeType:$file_mime\n";
213 push (@matched_list, $allfiles);
214
215 # delete the matched MIMEType file from the array
216 splice(@rest_fnames,$pos,1);
217 return (\@matched_list, \@rest_fnames);
218 }
219 $pos++;
220 }
221 }
222 }
223 return (\@fnamemime_list, \@assocmime_list);
224}
225
226
227sub filemime_list_to_re
228{
229 my $self = shift (@_);
230 my ($fnamemime_list) = @_;
231
232 my @fname_list = map { "(".$_->{'file'}.")" } @$fnamemime_list;
233 my $fname_re = join("|",@fname_list);
234 $fname_re =~ s/\./\\\./g;
235
236 return $fname_re;
237}
238
239# Read dublin_core metadata from DSpace collection
240sub metadata_read {
241 my $self = shift (@_);
242 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
243
244 my $only_first_doc = $self->{'only_first_doc'};
245 my $first_inorder_ext = $self->{'first_inorder_ext'};
246 my $first_inorder_mime = $self->{'first_inorder_mime'};
247
248 my $outhandle = $self->{'outhandle'};
249
250 my $filename = &util::filename_cat($base_dir, $file);
251 # return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
252
253 if ($filename !~ /dublin_core\.xml$/ || !-f $filename) {
254 return undef;
255 }
256
257 print $outhandle "DSpacePlugin: extracting metadata from $file\n"
258 if $self->{'verbosity'} > 1;
259
260 my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;
261
262 eval {
263 $self->{'parser'}->parsefile($filename);
264 };
265
266 if ($@) {
267 die "DSpacePlugin: ERROR $filename is not a well formed dublin_core.xml file ($@)\n";
268 }
269
270 my $mimetype_list = $self->{'saved_metadata'}->{'dc.Format^mimetype'};
271 my ($doc_file_mimes, $assoc_file_mimes) = $self->read_content($dir, $only_first_doc, $first_inorder_ext,
272 $first_inorder_mime, $mimetype_list);
273
274 my $file_re = $self->filemime_list_to_re($doc_file_mimes);
275
276 if ($only_first_doc || $first_inorder_ext || $first_inorder_mime) {
277 foreach my $afm ( @$assoc_file_mimes ) {
278 my $full_af = &util::filename_cat($dir,$afm->{'file'});
279 $self->{'extra_blocks'}->{$full_af} = 1;
280 }
281 }
282 push(@$extrametakeys,$file_re);
283
284 if (defined $self->{'saved_metadata'}->{'dc.Format^extent'}) {
285 delete $self->{'saved_metadata'}->{'dc.Format^extent'};
286 }
287
288 if (defined $mimetype_list) {
289 delete $self->{'saved_metadata'}->{'dc.Format^mimetype'};
290
291 # Temporarily store associate file info in metadata table
292 # This will be removed in 'extra_metadata' in BasePlugin and used
293 # to perform the actual file association (once the doc obj has
294 # been formed
295
296 my $main_doc = $doc_file_mimes->[0];
297 my $md_mimetype = $main_doc->{'mimetype'};
298
299 my $pd_lookup = $primary_doc_lookup->{$md_mimetype};
300
301 if (defined $pd_lookup) {
302 my $filter_re = $pd_lookup;
303 @$assoc_file_mimes = grep { $_->{'file'} !~ m/$filter_re/ } @$assoc_file_mimes;
304 }
305
306 my @gsdlassocfile_tobe
307 = map { &util::filename_cat($dir,$_->{'file'}).":".$_->{'mimetype'}.":" } @$assoc_file_mimes;
308 $self->{'saved_metadata'}->{'gsdlassocfile_tobe'} = \@gsdlassocfile_tobe;
309
310 }
311
312 $extrametadata->{$file_re} = $self->{'saved_metadata'};
313
314 return 1;
315}
316
317
318# The DSpacePlugin read() function. We are not actually reading any documents
319# here, just blocking ones that have been processed by metadata read.
320#
321# Returns 0 for a file its blocking, undef for any other
322sub read {
323 my $self = shift (@_);
324 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
325 my $outhandle = $self->{'outhandle'};
326
327 # Block all files except contents
328 my $filename = &util::filename_cat($base_dir, $file);
329 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
330
331 my $assocfile = $metadata->{'assocfile'};
332
333 return 0 if (($filename =~ /dublin_core\.xml$/) || ($filename =~ /contents$/));
334 return 0 if (defined $self->{'extra_blocks'}->{$filename});
335 return undef;
336}
337
338sub Doctype {
339 my ($expat, $name, $sysid, $pubid, $internal) = @_;
340
341 die if ($name !~ /^dublin_core$/);
342}
343
344sub StartTag {
345 my ($expat, $element) = @_;
346 if ($element eq "dublin_core") {
347 $self->{'saved_metadata'} = {};
348 } elsif ($element eq "dcvalue") {
349 my $metaname = $_{'element'};
350 my $qualifier = $_{'qualifier'}||"";
351 if ($metaname ne "description") {
352 $metaname .= "^$qualifier" if ($qualifier ne "none" && $qualifier ne "");
353 $self->{'metaname'} = "dc.\u$metaname";
354 }
355 }
356}
357
358sub EndTag {
359 my ($expat, $element) = @_;
360
361 if ($element eq "dcvalue") {
362 $self->{'metaname'} = "";
363 }
364}
365
366sub Text {
367 if (defined ($self->{'metaname'}) && $self->{'metaname'} ne "") {
368 # $_ == Metadata content
369 my $mname = $self->{'metaname'};
370 if (defined $self->{'saved_metadata'}->{$mname}) {
371 # accumulate - add value to existing value(s)
372 if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") {
373 push (@{$self->{'saved_metadata'}->{$mname}}, $_);
374 } else {
375 $self->{'saved_metadata'}->{$mname} =
376 [$self->{'saved_metadata'}->{$mname}, $_];
377 }
378 } else {
379 # accumulate - add value into (currently empty) array
380 $self->{'saved_metadata'}->{$mname} = [$_];
381 }
382
383 }
384}
385
386# This Char function overrides the one in XML::Parser::Stream to overcome a
387# problem where $expat->{Text} is treated as the return value, slowing
388# things down significantly in some cases.
389sub Char {
390 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
391 $_[0]->{'Text'} .= $_[1];
392 return undef;
393}
394
3951;
Note: See TracBrowser for help on using the repository browser.