source: gsdl/trunk/perllib/plugins/DSpacePlugin.pm@ 19493

Last change on this file since 19493 was 19493, checked in by davidb, 15 years ago

Introduction of new extrametafile to track which metadata.xml file a piece of metadata came from

  • Property svn:keywords set to Author Date Id Revision
File size: 11.6 KB
Line 
1
2###########################################################################
3#
4# DSpacePlugin.pm -- plugin for importing a collection from DSpace
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2004 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This plugin takes "contents" and dublin_core.xml file, which contain
30# Metadata and lists of associated files for a particular document
31# and produces a document containing sections, one for each page.
32# The files should be named "contents" and "dublin_core.xml". For each of
33# document in DSpace, it is stored in one directory
34#
35# The format of the "contents" file is as follows:
36#
37# File.type bundle:ORIGINAL
38# license.txt bundle:LICENSE
39# The format of the "dublin_core.xml" file is as follows:
40# The first line contains any metadata for the whole document
41# <dublin_core>
42# eg.
43# <dcvalue element="Title" qualifier="">Snail farming</dcvalue>
44# <dcvalue element="date" qualifier="">2004-10-15</dcvalue>
45#
46
47package DSpacePlugin;
48
49use BasePlugin;
50use plugin;
51use XMLParser;
52use strict;
53no strict 'refs'; # allow filehandles to be variables and viceversa
54
55sub BEGIN {
56 @DSpacePlugin::ISA = ('BasePlugin');
57}
58
59my $arguments =
60 [ { 'name' => "process_exp",
61 'desc' => "{BasePlugin.process_exp}",
62 'type' => "string",
63 'deft' => &get_default_process_exp(),
64 'reqd' => "no" },
65 { 'name' => "only_first_doc",
66 'desc' => "{DSpacePlugin.only_first_doc}",
67 'type' => "flag",
68 'reqd' => "no" },
69 { 'name' => "first_inorder_ext",
70 'desc' => "{DSpacePlugin.first_inorder_ext}",
71 'type' => "string",
72 'reqd' => "no" },
73 { 'name' => "first_inorder_mime",
74 'desc' => "{DSpacePlugin.first_inorder_mime}",
75 'type' => "flag",
76 'reqd' => "no" },
77 { 'name' => "block_exp",
78 'desc' => "{BasePlugin.block_exp}",
79 'type' => "regexp",
80 'deft' => &get_default_block_exp(),
81 'reqd' => "no" }];
82
83
84my $options = { 'name' => "DSpacePlugin",
85 'desc' => "{DSpacePlugin.desc}",
86 'inherits' => "yes",
87 'abstract' => "no",
88 'args' => $arguments };
89
90
91my $primary_doc_lookup = { 'text/html' => '(?i)\.(gif|jpe?g|jpe|jpg|png|css)$' };
92
93# Important variation to regular plugin structure. Need to declare
94# $self as global variable to file so XMLParser callback routines
95# can access the content of the object.
96my ($self);
97
98sub new {
99 my ($class) = shift (@_);
100 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
101 push(@$pluginlist, $class);
102
103 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
104 push(@{$hashArgOptLists->{"OptList"}},$options);
105
106 $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
107
108 if ($self->{'info_only'}) {
109 # don't worry about creating the XML parser as all we want is the
110 # list of plugin options
111 return bless $self, $class;
112 }
113
114 #create XML::Parser object for parsing dublin_core.xml files
115 my $parser = new XML::Parser('Style' => 'Stream',
116 'Handlers' => {'Char' => \&Char,
117 'Doctype' => \&Doctype
118 });
119 $self->{'parser'} = $parser;
120 $self->{'extra_blocks'} = {};
121
122 return bless $self, $class;
123}
124
125sub get_default_process_exp {
126 my $self = shift (@_);
127
128 return q^(?i)contents$^;
129}
130
131# want to block all files except the "contents"
132sub get_default_block_exp {
133 my $self = shift (@_);
134
135 # Block all files besides contents
136 return q^(?i)(handle|dublin_core\.xml|\.tx?t)$^;
137}
138
139sub read_content
140{
141 my $self = shift (@_);
142 my ($dir, $only_first_doc, $first_inorder_ext, $first_inorder_mime, $mimetype_list) = @_;
143 my $outhandle = $self->{'outhandle'};
144
145 my @fnamemime_list = ();
146 my @assocmime_list = ();
147
148 my $content_fname = &util::filename_cat($dir,"contents");
149
150 open(CIN,"<$content_fname")
151 || die "Unable to open $content_fname: $!\n";
152
153 my $line;
154 my $pos = 0;
155
156 while (defined ($line = <CIN>)) {
157 if ($line =~ m/^(.*)\s+bundle:ORIGINAL\s*$/) {
158 my $fname = $1;
159 my $mtype = $mimetype_list->[$pos];
160 my $fm_rec = { 'file' => $fname, 'mimetype' => $mtype};
161 push(@fnamemime_list,$fm_rec);
162 $pos++;
163 }
164 }
165 close CIN;
166
167 if ($only_first_doc){
168 my ($first_fname, @rest_fnames) = @fnamemime_list;
169 @fnamemime_list = ($first_fname);
170 @assocmime_list = @rest_fnames;
171 }
172
173 # allow user to specify the types of files (inorder)they would like to assign as
174 # a primary bitstream
175 if ($first_inorder_ext) {
176 # parse user-define file extension names
177 my @extfiles_list = split /,/, $first_inorder_ext;
178 my (@rest_fnames) = @fnamemime_list;
179 my @matched_list = ();
180 foreach my $file_ext (@extfiles_list) {
181 $pos = 0;
182 foreach my $allfiles (@fnamemime_list){
183 $allfiles->{'file'} =~ /^(.*)\.(.*?)$/;
184 my $allfiles_ext = $2;
185
186 if ($allfiles_ext =~ /$file_ext/) {
187 print $outhandle "Existing file:$allfiles->{'file'} match the user-define File Extension:$file_ext\n";
188 push (@matched_list, $allfiles);
189
190 # delete the matched extension file from the array
191 splice(@rest_fnames,$pos,1);
192
193 return (\@matched_list, \@rest_fnames);
194
195 }
196 $pos++;
197 }
198 }
199 }
200
201 if ($first_inorder_mime) {
202 # parse user-define file mimetype
203 my @file_mime_list = split /,/, $first_inorder_mime;
204 my (@rest_fnames) = @fnamemime_list;
205 my @matched_list = ();
206 foreach my $file_mime (@file_mime_list) {
207 $pos = 0;
208 foreach my $allfiles (@fnamemime_list){
209 my $allfiles_mime = $allfiles->{'mimetype'};
210
211 if ($allfiles_mime =~ /$file_mime/) {
212 print $outhandle "Existing file:$allfiles->{'file'} match the user-defined File MimeType:$file_mime\n";
213 push (@matched_list, $allfiles);
214
215 # delete the matched MIMEType file from the array
216 splice(@rest_fnames,$pos,1);
217 return (\@matched_list, \@rest_fnames);
218 }
219 $pos++;
220 }
221 }
222 }
223 return (\@fnamemime_list, \@assocmime_list);
224}
225
226
227sub filemime_list_to_re
228{
229 my $self = shift (@_);
230 my ($fnamemime_list) = @_;
231
232 my @fname_list = map { "(".$_->{'file'}.")" } @$fnamemime_list;
233 my $fname_re = join("|",@fname_list);
234 $fname_re =~ s/\./\\\./g;
235
236 return $fname_re;
237}
238
239# Read dublin_core metadata from DSpace collection
240sub metadata_read {
241 my $self = shift (@_);
242 my ($pluginfo, $base_dir, $file, $block_hash,
243 $extrametakeys, $extrametadata, $extrametafile,
244 $processor, $maxdocs, $gli) = @_;
245
246 my $only_first_doc = $self->{'only_first_doc'};
247 my $first_inorder_ext = $self->{'first_inorder_ext'};
248 my $first_inorder_mime = $self->{'first_inorder_mime'};
249
250 my $outhandle = $self->{'outhandle'};
251
252 my $filename = &util::filename_cat($base_dir, $file);
253 # return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
254
255 if ($filename !~ /dublin_core\.xml$/ || !-f $filename) {
256 return undef;
257 }
258
259 print $outhandle "DSpacePlugin: extracting metadata from $file\n"
260 if $self->{'verbosity'} > 1;
261
262 my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;
263
264 eval {
265 $self->{'parser'}->parsefile($filename);
266 };
267
268 if ($@) {
269 die "DSpacePlugin: ERROR $filename is not a well formed dublin_core.xml file ($@)\n";
270 }
271
272 my $mimetype_list = $self->{'saved_metadata'}->{'dc.Format^mimetype'};
273 my ($doc_file_mimes, $assoc_file_mimes) = $self->read_content($dir, $only_first_doc, $first_inorder_ext,
274 $first_inorder_mime, $mimetype_list);
275
276 my $file_re = $self->filemime_list_to_re($doc_file_mimes);
277
278 if ($only_first_doc || $first_inorder_ext || $first_inorder_mime) {
279 foreach my $afm ( @$assoc_file_mimes ) {
280 my $full_af = &util::filename_cat($dir,$afm->{'file'});
281 $self->{'extra_blocks'}->{$full_af} = 1;
282 }
283 }
284 push(@$extrametakeys,$file_re);
285
286 if (defined $self->{'saved_metadata'}->{'dc.Format^extent'}) {
287 delete $self->{'saved_metadata'}->{'dc.Format^extent'};
288 }
289
290 if (defined $mimetype_list) {
291 delete $self->{'saved_metadata'}->{'dc.Format^mimetype'};
292
293 # Temporarily store associate file info in metadata table
294 # This will be removed in 'extra_metadata' in BasePlugin and used
295 # to perform the actual file association (once the doc obj has
296 # been formed
297
298 my $main_doc = $doc_file_mimes->[0];
299 my $md_mimetype = $main_doc->{'mimetype'};
300
301 my $pd_lookup = $primary_doc_lookup->{$md_mimetype};
302
303 if (defined $pd_lookup) {
304 my $filter_re = $pd_lookup;
305 @$assoc_file_mimes = grep { $_->{'file'} !~ m/$filter_re/ } @$assoc_file_mimes;
306 }
307
308 my @gsdlassocfile_tobe
309 = map { &util::filename_cat($dir,$_->{'file'}).":".$_->{'mimetype'}.":" } @$assoc_file_mimes;
310 $self->{'saved_metadata'}->{'gsdlassocfile_tobe'} = \@gsdlassocfile_tobe;
311
312 }
313
314 $extrametadata->{$file_re} = $self->{'saved_metadata'};
315
316 return 1;
317}
318
319
320# The DSpacePlugin read() function. We are not actually reading any documents
321# here, just blocking ones that have been processed by metadata read.
322#
323# Returns 0 for a file its blocking, undef for any other
324sub read {
325 my $self = shift (@_);
326 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
327 my $outhandle = $self->{'outhandle'};
328
329 # Block all files except contents
330 my $filename = &util::filename_cat($base_dir, $file);
331 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
332
333 my $assocfile = $metadata->{'assocfile'};
334
335 return 0 if (($filename =~ /dublin_core\.xml$/) || ($filename =~ /contents$/));
336 return 0 if (defined $self->{'extra_blocks'}->{$filename});
337 return undef;
338}
339
340sub Doctype {
341 my ($expat, $name, $sysid, $pubid, $internal) = @_;
342
343 die if ($name !~ /^dublin_core$/);
344}
345
346sub StartTag {
347 my ($expat, $element) = @_;
348 if ($element eq "dublin_core") {
349 $self->{'saved_metadata'} = {};
350 } elsif ($element eq "dcvalue") {
351 my $metaname = $_{'element'};
352 my $qualifier = $_{'qualifier'}||"";
353 if ($metaname ne "description") {
354 $metaname .= "^$qualifier" if ($qualifier ne "none" && $qualifier ne "");
355 $self->{'metaname'} = "dc.\u$metaname";
356 }
357 }
358}
359
360sub EndTag {
361 my ($expat, $element) = @_;
362
363 if ($element eq "dcvalue") {
364 $self->{'metaname'} = "";
365 }
366}
367
368sub Text {
369 if (defined ($self->{'metaname'}) && $self->{'metaname'} ne "") {
370 # $_ == Metadata content
371 my $mname = $self->{'metaname'};
372 if (defined $self->{'saved_metadata'}->{$mname}) {
373 # accumulate - add value to existing value(s)
374 if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") {
375 push (@{$self->{'saved_metadata'}->{$mname}}, $_);
376 } else {
377 $self->{'saved_metadata'}->{$mname} =
378 [$self->{'saved_metadata'}->{$mname}, $_];
379 }
380 } else {
381 # accumulate - add value into (currently empty) array
382 $self->{'saved_metadata'}->{$mname} = [$_];
383 }
384
385 }
386}
387
388# This Char function overrides the one in XML::Parser::Stream to overcome a
389# problem where $expat->{Text} is treated as the return value, slowing
390# things down significantly in some cases.
391sub Char {
392 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
393 $_[0]->{'Text'} .= $_[1];
394 return undef;
395}
396
3971;
Note: See TracBrowser for help on using the repository browser.