source: trunk/gsdl/perllib/plugins/DSpacePlug.pm@ 8891

Last change on this file since 8891 was 8891, checked in by davidb, 19 years ago

Revision of argument types to a few plugin options to better reflect
their data-type.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.0 KB
Line 
1###########################################################################
2#
3# DSpacePlug.pm -- plugin for import the collection from DSpace
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# DSpace Plug - 10/2004
28#
29#
30# This plugin takes "contents" and dublin_core.xml file, which contain
31# Metadata and lists of associated files for a particular document
32# and produces a document containing sections, one for each page.
33# The files should be named "contents" and "dublin_core.xml". For each of
34# document in DSpace, it is stored in one directory
35#
36# The format of the "contents" file is as follows:
37#
38# File.type bundle:ORIGINAL
39# license.txt bundle:LICENSE
40# The format of the "dublin_core.xml" file is as follows:
41# The first line contains any metadata for the whole document
42# <dublin_core>
43# eg.
44# <dcvalue element="Title" qualifier="">Snail farming</dcvalue>
45# <dcvalue element="date" qualifier="">2004-10-15</dcvalue>
46#
47
48package DSpacePlug;
49
50use BasPlug;
51use plugin;
52#use ghtml;
53use XMLParser;
54
55sub BEGIN {
56 @ISA = ('BasPlug');
57}
58
59my $arguments =
60 [ { 'name' => "process_exp",
61 'desc' => "{BasPlug.process_exp}",
62 'type' => "string",
63 'deft' => &get_default_process_exp(),
64 'reqd' => "no" },
65 { 'name' => "only_first_doc",
66 'desc' => "{DSpacePlug.only_first_doc}",
67 'type' => "flag",
68 'reqd' => "no" },
69 { 'name' => "first_inoder_ext",
70 'desc' => "{DSpacePlug.first_inorder_ext}",
71 'type' => "string",
72 'reqd' => "no" },
73 { 'name' => "first_inorder_mime",
74 'desc' => "{DSpacePlug.first_inorder_mime}",
75 'type' => "flag",
76 'reqd' => "no" },
77 { 'name' => "block_exp",
78 'desc' => "{BasPlug.block_exp}",
79 'type' => "regexp",
80 'deft' => &get_default_block_exp(),
81 'reqd' => "no" }];
82
83
84my $options = { 'name' => "DSpacePlug",
85 'desc' => "{DSpacePlug.desc}",
86 'inherits' => "yes",
87 'args' => $arguments };
88
89
90my $primary_doc_lookup = { 'text/html' => '(?i)\.(gif|jpe?g|jpe|jpg|png|css)$' };
91
92my ($self);
93sub new {
94 my $class = shift (@_);
95 #my $plugin_name = shift (@_);
96
97 $self = new BasPlug ($class, @_);
98 $self->{'plugin_type'} = "DSpacePlug";
99
100 my $option_list = $self->{'option_list'};
101 push( @{$option_list}, $options );
102
103
104 if (!parsargv::parse(\@_,
105 q^only_first_doc^, \$self->{'only_first_doc'},
106 q^first_inorder_ext/.*/^, \$self->{'first_inorder_ext'},
107 q^first_inorder_mime/.*/^, \$self->{'first_inorder_mime'},
108 "allow_extra_options")) {
109 print STDERR "\nDSpacePlug uses an incorrect option.\n";
110 print STDERR "Check your collect.cfg configuration file.\n\n";
111 $self->print_txt_usage(""); # Use default resource bundle
112 die "\n";
113 }
114
115 #create XML::Parser object for parsing dublin_core.xml files
116 my $parser = new XML::Parser('Style' => 'Stream',
117 'Handlers' => {'Char' => \&Char,
118 'Doctype' => \&Doctype
119 });
120 $self->{'parser'} = $parser;
121
122 $self->{'extra_blocks'} = {};
123
124 return bless $self, $class;
125}
126
127
128sub get_default_process_exp {
129 my $self = shift (@_);
130
131 return q^(?i)contents$^;
132}
133
134# want to block all files except the "contents"
135sub get_default_block_exp {
136 my $self = shift (@_);
137
138 # Block all files besides contents
139 return q^(?i)(handle|dublin_core\.xml|\.tx?t)$^;
140}
141
142sub read_content
143{
144 my $self = shift (@_);
145 my ($dir, $only_first_doc, $first_inorder_ext, $first_inorder_mime, $mimetype_list) = @_;
146 my $outhandle = $self->{'outhandle'};
147
148 my @fnamemime_list = ();
149 my @assocmime_list = ();
150
151 my $content_fname = &util::filename_cat($dir,"contents");
152
153 open(CIN,"<$content_fname")
154 || die "Unable to open $content_fname: $!\n";
155
156 my $line;
157 my $pos = 0;
158
159 while (defined ($line = <CIN>)) {
160 if ($line =~ m/^(.*)\s+bundle:ORIGINAL\s*$/) {
161 my $fname = $1;
162 my $mtype = $mimetype_list->[$pos];
163 my $fm_rec = { 'file' => $fname, 'mimetype' => $mtype};
164 push(@fnamemime_list,$fm_rec);
165 $pos++;
166 }
167 }
168 close CIN;
169
170 if ($only_first_doc){
171 my ($first_fname, @rest_fnames) = @fnamemime_list;
172 @fnamemime_list = ($first_fname);
173 @assocmime_list = @rest_fnames;
174 }
175
176 # allow user to specify the types of files (inorder)they would like to assign as
177 # a primary bitstream
178 if ($first_inorder_ext) {
179 # parse user-define file extension names
180 my @extfiles_list = split /,/, $first_inorder_ext;
181 my (@rest_fnames) = @fnamemime_list;
182 my @matched_list = ();
183 foreach my $file_ext (@extfiles_list) {
184 $pos = 0;
185 foreach my $allfiles (@fnamemime_list){
186 $allfiles->{'file'} =~ /^(.*)\.(.*?)$/;
187 my $allfiles_ext = $2;
188
189 if ($allfiles_ext =~ /$file_ext/) {
190 print $outhandle "Existing file:$allfiles->{'file'} match the user-define File Extension:$file_ext\n";
191 push (@matched_list, $allfiles);
192
193 # delete the matched extension file from the array
194 splice(@rest_fnames,$pos,1);
195
196 return (\@matched_list, \@rest_fnames);
197
198 }
199 $pos++;
200 }
201 }
202 }
203
204 if ($first_inorder_mime) {
205 # parse user-define file mimetype
206 my @file_mime_list = split /,/, $first_inorder_mime;
207 my (@rest_fnames) = @fnamemime_list;
208 my @matched_list = ();
209 foreach my $file_mime (@file_mime_list) {
210 $pos = 0;
211 foreach my $allfiles (@fnamemime_list){
212 my $allfiles_mime = $allfiles->{'mimetype'};
213
214 if ($allfiles_mime =~ /$file_mime/) {
215 print $outhandle "Existing file:$allfiles->{'file'} match the user-defined File MimeType:$file_mime\n";
216 push (@matched_list, $allfiles);
217
218 # delete the matched MIMEType file from the array
219 splice(@rest_fnames,$pos,1);
220 return (\@matched_list, \@rest_fnames);
221 }
222 $pos++;
223 }
224 }
225 }
226 return (\@fnamemime_list, \@assocmime_list);
227}
228
229
230sub filemime_list_to_re
231{
232 my $self = shift (@_);
233 my ($fnamemime_list) = @_;
234
235 my @fname_list = map { "(".$_->{'file'}.")" } @$fnamemime_list;
236 my $fname_re = join("|",@fname_list);
237 $fname_re =~ s/\./\\\./g;
238
239 return $fname_re;
240}
241
242# Read dublin_core metadata from DSpace collection
243sub metadata_read {
244 my $self = shift (@_);
245 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs) = @_;
246
247 my $only_first_doc = $self->{'only_first_doc'};
248 my $first_inorder_ext = $self->{'first_inorder_ext'};
249 my $first_inorder_mime = $self->{'first_inorder_mime'};
250
251 my $outhandle = $self->{'outhandle'};
252
253 my $filename = &util::filename_cat($base_dir, $file);
254 # return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
255
256 #my $block_files = $self->{'img_blocks'}->{$filename};
257
258 if ($filename !~ /dublin_core\.xml$/ || !-f $filename) {
259 return undef;
260 }
261
262 print $outhandle "DSpacePlug: extracting metadata from $file\n"
263 if $self->{'verbosity'} > 1;
264
265 my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;
266
267 eval {
268 $self->{'parser'}->parsefile($filename);
269 };
270
271 if ($@) {
272 die "DSpacePlug: ERROR $filename is not a well formed dublin_core.xml file ($@)\n";
273 }
274
275 my $mimetype_list = $self->{'saved_metadata'}->{'dc.Format^mimetype'};
276 my ($doc_file_mimes, $assoc_file_mimes) = $self->read_content($dir, $only_first_doc, $first_inorder_ext,
277 $first_inorder_mime, $mimetype_list);
278 my $file_re = $self->filemime_list_to_re($doc_file_mimes);
279
280 if ($only_first_doc || $first_inorder_ext || $first_inorder_mime) {
281 foreach my $afm ( @$assoc_file_mimes ) {
282 my $full_af = &util::filename_cat($dir,$afm->{'file'});
283 $self->{'extra_blocks'}->{$full_af} = 1;
284 }
285 }
286 push(@$extrametakeys,$file_re);
287
288 if (defined $self->{'saved_metadata'}->{'dc.Format^extent'}) {
289 delete $self->{'saved_metadata'}->{'dc.Format^extent'};
290 }
291
292 if (defined $mimetype_list) {
293 delete $self->{'saved_metadata'}->{'dc.Format^mimetype'};
294
295 # Temporarily store associate file info in metadata table
296 # This will be removed in 'extra_metadata' in BasPlug and used
297 # to perform the actual file association (once the doc obj has
298 # been formed
299
300 my $main_doc = $doc_file_mimes->[0];
301 my $md_mimetype = $main_doc->{'mimetype'};
302
303 my $pd_lookup = $primary_doc_lookup->{$md_mimetype};
304
305 if (defined $pd_lookup) {
306 my $filter_re = $pd_lookup;
307
308 @$assoc_file_mimes = grep { $_->{'file'} !~ m/$filter_re/ } @$assoc_file_mimes;
309 }
310
311 my @gsdlassocfile_tobe
312 = map { &util::filename_cat($dir,$_->{'file'}).":".$_->{'mimetype'}.":" } @$assoc_file_mimes;
313 $self->{'saved_metadata'}->{'gsdlassocfile_tobe'} = \@gsdlassocfile_tobe;
314
315 }
316
317 $extrametadata->{$file_re} = $self->{'saved_metadata'};
318
319 return 1;
320}
321
322
323# The DSpacePlug read() function. This function does all the right things
324# to make general options work for a given plugin. It calls the process()
325# function which does all the work specific to a plugin (like the old
326# read functions used to do). Most plugins should define their own
327# process() function and let this read() function keep control.
328#
329# DSpace overrides read() because there is no need to read the actual
330# text of the file in, because the contents of the file is not text...
331#
332# Return number of files processed, undef if can't process
333# Note that $base_dir might be "" and that $file might
334# include directories
335
336sub read {
337 my $self = shift (@_);
338 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
339 my $outhandle = $self->{'outhandle'};
340
341 # Block all files except contents
342 my $filename = &util::filename_cat($base_dir, $file);
343 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
344
345 my $assocfile = $metadata->{'assocfile'};
346
347 return 0 if (($filename =~ /dublin_core\.xml$/) || ($filename =~ /contents$/));
348 return 0 if (defined $self->{'extra_blocks'}->{$filename});
349 return undef;
350}
351
352# do plugin specific processing of doc_obj
353sub process {
354 my $self = shift (@_);
355 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
356 my $outhandle = $self->{'outhandle'};
357
358 return 1;
359}
360
361sub Doctype {
362 my ($expat, $name, $sysid, $pubid, $internal) = @_;
363
364 die if ($name !~ /^dublin_core$/);
365}
366
367sub StartTag {
368 my ($expat, $element) = @_;
369 if ($element eq "dublin_core") {
370 $self->{'saved_metadata'} = {};
371 }
372 elsif ($element eq "dcvalue") {
373 my $metaname = $_{'element'};
374 my $qualifier = $_{'qualifier'};
375 if ($metaname ne "description") {
376 $metaname .= "^$qualifier" if ($qualifier ne "none");
377
378 $self->{'metaname'} = "dc.\u$metaname";
379 }
380 }
381}
382
383sub EndTag {
384 my ($expat, $element) = @_;
385
386 if ($element eq "dcvalue") {
387 $self->{'metaname'} = "";
388 }
389
390}
391
392sub Text {
393 if (defined ($self->{'metaname'}) && $self->{'metaname'} ne "") {
394 # $_ == Metadata content
395 my $mname = $self->{'metaname'};
396 if (defined $self->{'saved_metadata'}->{$mname}) {
397 # accumulate - add value to existing value(s)
398 if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") {
399 push (@{$self->{'saved_metadata'}->{$mname}}, $_);
400 } else {
401 $self->{'saved_metadata'}->{$mname} =
402 [$self->{'saved_metadata'}->{$mname}, $_];
403 }
404 } else {
405 # accumulate - add value into (currently empty) array
406 $self->{'saved_metadata'}->{$mname} = [$_];
407 }
408
409 }
410}
411
412# This Char function overrides the one in XML::Parser::Stream to overcome a
413# problem where $expat->{Text} is treated as the return value, slowing
414# things down significantly in some cases.
415sub Char {
416 $_[0]->{'Text'} .= $_[1];
417 return undef;
418}
419
4201;
Note: See TracBrowser for help on using the repository browser.