source: trunk/gsdl/perllib/plugins/DSpacePlug.pm@ 12169

Last change on this file since 12169 was 12169, checked in by mdewsnip, 18 years ago

Tidied up that horrible long line in the new() function of every plugin.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.1 KB
Line 
1
2###########################################################################
3#
4# DSpacePlug.pm -- plugin for import the collection from DSpace
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# DSpace Plug - 10/2004
29#
30#
31# This plugin takes "contents" and dublin_core.xml file, which contain
32# Metadata and lists of associated files for a particular document
33# and produces a document containing sections, one for each page.
34# The files should be named "contents" and "dublin_core.xml". For each of
35# document in DSpace, it is stored in one directory
36#
37# The format of the "contents" file is as follows:
38#
39# File.type bundle:ORIGINAL
40# license.txt bundle:LICENSE
41# The format of the "dublin_core.xml" file is as follows:
42# The first line contains any metadata for the whole document
43# <dublin_core>
44# eg.
45# <dcvalue element="Title" qualifier="">Snail farming</dcvalue>
46# <dcvalue element="date" qualifier="">2004-10-15</dcvalue>
47#
48
49package DSpacePlug;
50
51use BasPlug;
52use plugin;
53#use ghtml;
54use XMLParser;
55use strict;
56no strict 'refs'; # allow filehandles to be variables and viceversa
57
58sub BEGIN {
59 @DSpacePlug::ISA = ('BasPlug');
60}
61
62my $arguments =
63 [ { 'name' => "process_exp",
64 'desc' => "{BasPlug.process_exp}",
65 'type' => "string",
66 'deft' => &get_default_process_exp(),
67 'reqd' => "no" },
68 { 'name' => "only_first_doc",
69 'desc' => "{DSpacePlug.only_first_doc}",
70 'type' => "flag",
71 'reqd' => "no" },
72 { 'name' => "first_inorder_ext",
73 'desc' => "{DSpacePlug.first_inorder_ext}",
74 'type' => "string",
75 'reqd' => "no" },
76 { 'name' => "first_inorder_mime",
77 'desc' => "{DSpacePlug.first_inorder_mime}",
78 'type' => "flag",
79 'reqd' => "no" },
80 { 'name' => "block_exp",
81 'desc' => "{BasPlug.block_exp}",
82 'type' => "regexp",
83 'deft' => &get_default_block_exp(),
84 'reqd' => "no" }];
85
86
87my $options = { 'name' => "DSpacePlug",
88 'desc' => "{DSpacePlug.desc}",
89 'inherits' => "yes",
90 'abstract' => "no",
91 'args' => $arguments };
92
93
94my $primary_doc_lookup = { 'text/html' => '(?i)\.(gif|jpe?g|jpe|jpg|png|css)$' };
95
96# Important variation to regular plugin structure. Need to declare
97# $self as global variable to file so XMLParser callback routines
98# can access the content of the object.
99my ($self);
100
101sub new {
102 my ($class) = shift (@_);
103 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
104 push(@$pluginlist, $class);
105
106 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
107 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
108
109 $self = new BasPlug($pluginlist, $inputargs, $hashArgOptLists);
110
111 #create XML::Parser object for parsing dublin_core.xml files
112 my $parser = new XML::Parser('Style' => 'Stream',
113 'Handlers' => {'Char' => \&Char,
114 'Doctype' => \&Doctype
115 });
116 $self->{'parser'} = $parser;
117 $self->{'extra_blocks'} = {};
118
119 return bless $self, $class;
120}
121
122sub get_default_process_exp {
123 my $self = shift (@_);
124
125 return q^(?i)contents$^;
126}
127
128# want to block all files except the "contents"
129sub get_default_block_exp {
130 my $self = shift (@_);
131
132 # Block all files besides contents
133 return q^(?i)(handle|dublin_core\.xml|\.tx?t)$^;
134}
135
136sub read_content
137{
138 my $self = shift (@_);
139 my ($dir, $only_first_doc, $first_inorder_ext, $first_inorder_mime, $mimetype_list) = @_;
140 my $outhandle = $self->{'outhandle'};
141
142 my @fnamemime_list = ();
143 my @assocmime_list = ();
144
145 my $content_fname = &util::filename_cat($dir,"contents");
146
147 open(CIN,"<$content_fname")
148 || die "Unable to open $content_fname: $!\n";
149
150 my $line;
151 my $pos = 0;
152
153 while (defined ($line = <CIN>)) {
154 if ($line =~ m/^(.*)\s+bundle:ORIGINAL\s*$/) {
155 my $fname = $1;
156 my $mtype = $mimetype_list->[$pos];
157 my $fm_rec = { 'file' => $fname, 'mimetype' => $mtype};
158 push(@fnamemime_list,$fm_rec);
159 $pos++;
160 }
161 }
162 close CIN;
163
164 if ($only_first_doc){
165 my ($first_fname, @rest_fnames) = @fnamemime_list;
166 @fnamemime_list = ($first_fname);
167 @assocmime_list = @rest_fnames;
168 }
169
170 # allow user to specify the types of files (inorder)they would like to assign as
171 # a primary bitstream
172 if ($first_inorder_ext) {
173 # parse user-define file extension names
174 my @extfiles_list = split /,/, $first_inorder_ext;
175 my (@rest_fnames) = @fnamemime_list;
176 my @matched_list = ();
177 foreach my $file_ext (@extfiles_list) {
178 $pos = 0;
179 foreach my $allfiles (@fnamemime_list){
180 $allfiles->{'file'} =~ /^(.*)\.(.*?)$/;
181 my $allfiles_ext = $2;
182
183 if ($allfiles_ext =~ /$file_ext/) {
184 print $outhandle "Existing file:$allfiles->{'file'} match the user-define File Extension:$file_ext\n";
185 push (@matched_list, $allfiles);
186
187 # delete the matched extension file from the array
188 splice(@rest_fnames,$pos,1);
189
190 return (\@matched_list, \@rest_fnames);
191
192 }
193 $pos++;
194 }
195 }
196 }
197
198 if ($first_inorder_mime) {
199 # parse user-define file mimetype
200 my @file_mime_list = split /,/, $first_inorder_mime;
201 my (@rest_fnames) = @fnamemime_list;
202 my @matched_list = ();
203 foreach my $file_mime (@file_mime_list) {
204 $pos = 0;
205 foreach my $allfiles (@fnamemime_list){
206 my $allfiles_mime = $allfiles->{'mimetype'};
207
208 if ($allfiles_mime =~ /$file_mime/) {
209 print $outhandle "Existing file:$allfiles->{'file'} match the user-defined File MimeType:$file_mime\n";
210 push (@matched_list, $allfiles);
211
212 # delete the matched MIMEType file from the array
213 splice(@rest_fnames,$pos,1);
214 return (\@matched_list, \@rest_fnames);
215 }
216 $pos++;
217 }
218 }
219 }
220 return (\@fnamemime_list, \@assocmime_list);
221}
222
223
224sub filemime_list_to_re
225{
226 my $self = shift (@_);
227 my ($fnamemime_list) = @_;
228
229 my @fname_list = map { "(".$_->{'file'}.")" } @$fnamemime_list;
230 my $fname_re = join("|",@fname_list);
231 $fname_re =~ s/\./\\\./g;
232
233 return $fname_re;
234}
235
236# Read dublin_core metadata from DSpace collection
237sub metadata_read {
238 my $self = shift (@_);
239 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
240
241 my $only_first_doc = $self->{'only_first_doc'};
242 my $first_inorder_ext = $self->{'first_inorder_ext'};
243 my $first_inorder_mime = $self->{'first_inorder_mime'};
244
245 my $outhandle = $self->{'outhandle'};
246
247 my $filename = &util::filename_cat($base_dir, $file);
248 # return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
249
250 if ($filename !~ /dublin_core\.xml$/ || !-f $filename) {
251 return undef;
252 }
253
254 print $outhandle "DSpacePlug: extracting metadata from $file\n"
255 if $self->{'verbosity'} > 1;
256
257 my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;
258
259 eval {
260 $self->{'parser'}->parsefile($filename);
261 };
262
263 if ($@) {
264 die "DSpacePlug: ERROR $filename is not a well formed dublin_core.xml file ($@)\n";
265 }
266
267 my $mimetype_list = $self->{'saved_metadata'}->{'dc.Format^mimetype'};
268 my ($doc_file_mimes, $assoc_file_mimes) = $self->read_content($dir, $only_first_doc, $first_inorder_ext,
269 $first_inorder_mime, $mimetype_list);
270
271 my $file_re = $self->filemime_list_to_re($doc_file_mimes);
272
273 if ($only_first_doc || $first_inorder_ext || $first_inorder_mime) {
274 foreach my $afm ( @$assoc_file_mimes ) {
275 my $full_af = &util::filename_cat($dir,$afm->{'file'});
276 $self->{'extra_blocks'}->{$full_af} = 1;
277 }
278 }
279 push(@$extrametakeys,$file_re);
280
281 if (defined $self->{'saved_metadata'}->{'dc.Format^extent'}) {
282 delete $self->{'saved_metadata'}->{'dc.Format^extent'};
283 }
284
285 if (defined $mimetype_list) {
286 delete $self->{'saved_metadata'}->{'dc.Format^mimetype'};
287
288 # Temporarily store associate file info in metadata table
289 # This will be removed in 'extra_metadata' in BasPlug and used
290 # to perform the actual file association (once the doc obj has
291 # been formed
292
293 my $main_doc = $doc_file_mimes->[0];
294 my $md_mimetype = $main_doc->{'mimetype'};
295
296 my $pd_lookup = $primary_doc_lookup->{$md_mimetype};
297
298 if (defined $pd_lookup) {
299 my $filter_re = $pd_lookup;
300 @$assoc_file_mimes = grep { $_->{'file'} !~ m/$filter_re/ } @$assoc_file_mimes;
301 }
302
303 my @gsdlassocfile_tobe
304 = map { &util::filename_cat($dir,$_->{'file'}).":".$_->{'mimetype'}.":" } @$assoc_file_mimes;
305 $self->{'saved_metadata'}->{'gsdlassocfile_tobe'} = \@gsdlassocfile_tobe;
306
307 }
308
309 $extrametadata->{$file_re} = $self->{'saved_metadata'};
310
311 return 1;
312}
313
314
315# The DSpacePlug read() function. This function does all the right things
316# to make general options work for a given plugin. It calls the process()
317# function which does all the work specific to a plugin (like the old
318# read functions used to do). Most plugins should define their own
319# process() function and let this read() function keep control.
320#
321# DSpace overrides read() because there is no need to read the actual
322# text of the file in, because the contents of the file is not text...
323#
324# Return number of files processed, undef if can't process
325# Note that $base_dir might be "" and that $file might
326# include directories
327
328sub read {
329 my $self = shift (@_);
330 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
331 my $outhandle = $self->{'outhandle'};
332
333 # Block all files except contents
334 my $filename = &util::filename_cat($base_dir, $file);
335 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
336
337 my $assocfile = $metadata->{'assocfile'};
338
339 return 0 if (($filename =~ /dublin_core\.xml$/) || ($filename =~ /contents$/));
340 return 0 if (defined $self->{'extra_blocks'}->{$filename});
341 return undef;
342}
343
344# do plugin specific processing of doc_obj
345sub process {
346 my $self = shift (@_);
347 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
348 my $outhandle = $self->{'outhandle'};
349
350 return 1;
351}
352
353sub Doctype {
354 my ($expat, $name, $sysid, $pubid, $internal) = @_;
355
356 die if ($name !~ /^dublin_core$/);
357}
358
359sub StartTag {
360 my ($expat, $element) = @_;
361 if ($element eq "dublin_core") {
362 $self->{'saved_metadata'} = {};
363 } elsif ($element eq "dcvalue") {
364 my $metaname = $_{'element'};
365 my $qualifier = $_{'qualifier'}||"";
366 if ($metaname ne "description") {
367 $metaname .= "^$qualifier" if ($qualifier ne "none" && $qualifier ne "");
368 $self->{'metaname'} = "dc.\u$metaname";
369 }
370 }
371}
372
373sub EndTag {
374 my ($expat, $element) = @_;
375
376 if ($element eq "dcvalue") {
377 $self->{'metaname'} = "";
378 }
379}
380
381sub Text {
382 if (defined ($self->{'metaname'}) && $self->{'metaname'} ne "") {
383 # $_ == Metadata content
384 my $mname = $self->{'metaname'};
385 if (defined $self->{'saved_metadata'}->{$mname}) {
386 # accumulate - add value to existing value(s)
387 if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") {
388 push (@{$self->{'saved_metadata'}->{$mname}}, $_);
389 } else {
390 $self->{'saved_metadata'}->{$mname} =
391 [$self->{'saved_metadata'}->{$mname}, $_];
392 }
393 } else {
394 # accumulate - add value into (currently empty) array
395 $self->{'saved_metadata'}->{$mname} = [$_];
396 }
397
398 }
399}
400
401# This Char function overrides the one in XML::Parser::Stream to overcome a
402# problem where $expat->{Text} is treated as the return value, slowing
403# things down significantly in some cases.
404sub Char {
405 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
406 $_[0]->{'Text'} .= $_[1];
407 return undef;
408}
409
4101;
Note: See TracBrowser for help on using the repository browser.