source: trunk/gsdl/perllib/plugins/DSpacePlug.pm@ 10254

Last change on this file since 10254 was 10254, checked in by kjdon, 19 years ago

added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile

  • Property svn:keywords set to Author Date Id Revision
File size: 12.1 KB
Line 
1###########################################################################
2#
3# DSpacePlug.pm -- plugin for import the collection from DSpace
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# DSpace Plug - 10/2004
28#
29#
30# This plugin takes "contents" and dublin_core.xml file, which contain
31# Metadata and lists of associated files for a particular document
32# and produces a document containing sections, one for each page.
33# The files should be named "contents" and "dublin_core.xml". For each of
34# document in DSpace, it is stored in one directory
35#
36# The format of the "contents" file is as follows:
37#
38# File.type bundle:ORIGINAL
39# license.txt bundle:LICENSE
40# The format of the "dublin_core.xml" file is as follows:
41# The first line contains any metadata for the whole document
42# <dublin_core>
43# eg.
44# <dcvalue element="Title" qualifier="">Snail farming</dcvalue>
45# <dcvalue element="date" qualifier="">2004-10-15</dcvalue>
46#
47
48package DSpacePlug;
49
50use BasPlug;
51use plugin;
52#use ghtml;
53use XMLParser;
54use strict;
55no strict 'refs'; # allow filehandles to be variables and viceversa
56
57sub BEGIN {
58 @DSpacePlug::ISA = ('BasPlug');
59}
60
61my $arguments =
62 [ { 'name' => "process_exp",
63 'desc' => "{BasPlug.process_exp}",
64 'type' => "string",
65 'deft' => &get_default_process_exp(),
66 'reqd' => "no" },
67 { 'name' => "only_first_doc",
68 'desc' => "{DSpacePlug.only_first_doc}",
69 'type' => "flag",
70 'reqd' => "no" },
71 { 'name' => "first_inorder_ext",
72 'desc' => "{DSpacePlug.first_inorder_ext}",
73 'type' => "string",
74 'reqd' => "no" },
75 { 'name' => "first_inorder_mime",
76 'desc' => "{DSpacePlug.first_inorder_mime}",
77 'type' => "flag",
78 'reqd' => "no" },
79 { 'name' => "block_exp",
80 'desc' => "{BasPlug.block_exp}",
81 'type' => "regexp",
82 'deft' => &get_default_block_exp(),
83 'reqd' => "no" }];
84
85
86my $options = { 'name' => "DSpacePlug",
87 'desc' => "{DSpacePlug.desc}",
88 'inherits' => "yes",
89 'args' => $arguments };
90
91
92my $primary_doc_lookup = { 'text/html' => '(?i)\.(gif|jpe?g|jpe|jpg|png|css)$' };
93
94# Important variation to regular plugin structure. Need to declare
95# $self as global variable to file so XMLParser callback routines
96# can access the content of the object.
97my ($self);
98
99sub new {
100 my ($class) = shift (@_);
101 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
102 push(@$pluginlist, $class);
103
104 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
105 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
106
107 $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
108
109 #create XML::Parser object for parsing dublin_core.xml files
110 my $parser = new XML::Parser('Style' => 'Stream',
111 'Handlers' => {'Char' => \&Char,
112 'Doctype' => \&Doctype
113 });
114 $self->{'parser'} = $parser;
115 $self->{'extra_blocks'} = {};
116
117 return bless $self, $class;
118}
119
120sub get_default_process_exp {
121 my $self = shift (@_);
122
123 return q^(?i)contents$^;
124}
125
126# want to block all files except the "contents"
127sub get_default_block_exp {
128 my $self = shift (@_);
129
130 # Block all files besides contents
131 return q^(?i)(handle|dublin_core\.xml|\.tx?t)$^;
132}
133
134sub read_content
135{
136 my $self = shift (@_);
137 my ($dir, $only_first_doc, $first_inorder_ext, $first_inorder_mime, $mimetype_list) = @_;
138 my $outhandle = $self->{'outhandle'};
139
140 my @fnamemime_list = ();
141 my @assocmime_list = ();
142
143 my $content_fname = &util::filename_cat($dir,"contents");
144
145 open(CIN,"<$content_fname")
146 || die "Unable to open $content_fname: $!\n";
147
148 my $line;
149 my $pos = 0;
150
151 while (defined ($line = <CIN>)) {
152 if ($line =~ m/^(.*)\s+bundle:ORIGINAL\s*$/) {
153 my $fname = $1;
154 my $mtype = $mimetype_list->[$pos];
155 my $fm_rec = { 'file' => $fname, 'mimetype' => $mtype};
156 push(@fnamemime_list,$fm_rec);
157 $pos++;
158 }
159 }
160 close CIN;
161
162 if ($only_first_doc){
163 my ($first_fname, @rest_fnames) = @fnamemime_list;
164 @fnamemime_list = ($first_fname);
165 @assocmime_list = @rest_fnames;
166 }
167
168 # allow user to specify the types of files (inorder)they would like to assign as
169 # a primary bitstream
170 if ($first_inorder_ext) {
171 # parse user-define file extension names
172 my @extfiles_list = split /,/, $first_inorder_ext;
173 my (@rest_fnames) = @fnamemime_list;
174 my @matched_list = ();
175 foreach my $file_ext (@extfiles_list) {
176 $pos = 0;
177 foreach my $allfiles (@fnamemime_list){
178 $allfiles->{'file'} =~ /^(.*)\.(.*?)$/;
179 my $allfiles_ext = $2;
180
181 if ($allfiles_ext =~ /$file_ext/) {
182 print $outhandle "Existing file:$allfiles->{'file'} match the user-define File Extension:$file_ext\n";
183 push (@matched_list, $allfiles);
184
185 # delete the matched extension file from the array
186 splice(@rest_fnames,$pos,1);
187
188 return (\@matched_list, \@rest_fnames);
189
190 }
191 $pos++;
192 }
193 }
194 }
195
196 if ($first_inorder_mime) {
197 # parse user-define file mimetype
198 my @file_mime_list = split /,/, $first_inorder_mime;
199 my (@rest_fnames) = @fnamemime_list;
200 my @matched_list = ();
201 foreach my $file_mime (@file_mime_list) {
202 $pos = 0;
203 foreach my $allfiles (@fnamemime_list){
204 my $allfiles_mime = $allfiles->{'mimetype'};
205
206 if ($allfiles_mime =~ /$file_mime/) {
207 print $outhandle "Existing file:$allfiles->{'file'} match the user-defined File MimeType:$file_mime\n";
208 push (@matched_list, $allfiles);
209
210 # delete the matched MIMEType file from the array
211 splice(@rest_fnames,$pos,1);
212 return (\@matched_list, \@rest_fnames);
213 }
214 $pos++;
215 }
216 }
217 }
218 return (\@fnamemime_list, \@assocmime_list);
219}
220
221
222sub filemime_list_to_re
223{
224 my $self = shift (@_);
225 my ($fnamemime_list) = @_;
226
227 my @fname_list = map { "(".$_->{'file'}.")" } @$fnamemime_list;
228 my $fname_re = join("|",@fname_list);
229 $fname_re =~ s/\./\\\./g;
230
231 return $fname_re;
232}
233
234# Read dublin_core metadata from DSpace collection
235sub metadata_read {
236 my $self = shift (@_);
237 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
238
239 my $only_first_doc = $self->{'only_first_doc'};
240 my $first_inorder_ext = $self->{'first_inorder_ext'};
241 my $first_inorder_mime = $self->{'first_inorder_mime'};
242
243 my $outhandle = $self->{'outhandle'};
244
245 my $filename = &util::filename_cat($base_dir, $file);
246 # return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
247
248 if ($filename !~ /dublin_core\.xml$/ || !-f $filename) {
249 return undef;
250 }
251
252 print $outhandle "DSpacePlug: extracting metadata from $file\n"
253 if $self->{'verbosity'} > 1;
254
255 my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;
256
257 eval {
258 $self->{'parser'}->parsefile($filename);
259 };
260
261 if ($@) {
262 die "DSpacePlug: ERROR $filename is not a well formed dublin_core.xml file ($@)\n";
263 }
264
265 my $mimetype_list = $self->{'saved_metadata'}->{'dc.Format^mimetype'};
266 my ($doc_file_mimes, $assoc_file_mimes) = $self->read_content($dir, $only_first_doc, $first_inorder_ext,
267 $first_inorder_mime, $mimetype_list);
268
269 my $file_re = $self->filemime_list_to_re($doc_file_mimes);
270
271 if ($only_first_doc || $first_inorder_ext || $first_inorder_mime) {
272 foreach my $afm ( @$assoc_file_mimes ) {
273 my $full_af = &util::filename_cat($dir,$afm->{'file'});
274 $self->{'extra_blocks'}->{$full_af} = 1;
275 }
276 }
277 push(@$extrametakeys,$file_re);
278
279 if (defined $self->{'saved_metadata'}->{'dc.Format^extent'}) {
280 delete $self->{'saved_metadata'}->{'dc.Format^extent'};
281 }
282
283 if (defined $mimetype_list) {
284 delete $self->{'saved_metadata'}->{'dc.Format^mimetype'};
285
286 # Temporarily store associate file info in metadata table
287 # This will be removed in 'extra_metadata' in BasPlug and used
288 # to perform the actual file association (once the doc obj has
289 # been formed
290
291 my $main_doc = $doc_file_mimes->[0];
292 my $md_mimetype = $main_doc->{'mimetype'};
293
294 my $pd_lookup = $primary_doc_lookup->{$md_mimetype};
295
296 if (defined $pd_lookup) {
297 my $filter_re = $pd_lookup;
298 @$assoc_file_mimes = grep { $_->{'file'} !~ m/$filter_re/ } @$assoc_file_mimes;
299 }
300
301 my @gsdlassocfile_tobe
302 = map { &util::filename_cat($dir,$_->{'file'}).":".$_->{'mimetype'}.":" } @$assoc_file_mimes;
303 $self->{'saved_metadata'}->{'gsdlassocfile_tobe'} = \@gsdlassocfile_tobe;
304
305 }
306
307 $extrametadata->{$file_re} = $self->{'saved_metadata'};
308
309 return 1;
310}
311
312
313# The DSpacePlug read() function. This function does all the right things
314# to make general options work for a given plugin. It calls the process()
315# function which does all the work specific to a plugin (like the old
316# read functions used to do). Most plugins should define their own
317# process() function and let this read() function keep control.
318#
319# DSpace overrides read() because there is no need to read the actual
320# text of the file in, because the contents of the file is not text...
321#
322# Return number of files processed, undef if can't process
323# Note that $base_dir might be "" and that $file might
324# include directories
325
326sub read {
327 my $self = shift (@_);
328 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
329 my $outhandle = $self->{'outhandle'};
330
331 # Block all files except contents
332 my $filename = &util::filename_cat($base_dir, $file);
333 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
334
335 my $assocfile = $metadata->{'assocfile'};
336
337 return 0 if (($filename =~ /dublin_core\.xml$/) || ($filename =~ /contents$/));
338 return 0 if (defined $self->{'extra_blocks'}->{$filename});
339 return undef;
340}
341
342# do plugin specific processing of doc_obj
343sub process {
344 my $self = shift (@_);
345 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
346 my $outhandle = $self->{'outhandle'};
347
348 return 1;
349}
350
351sub Doctype {
352 my ($expat, $name, $sysid, $pubid, $internal) = @_;
353
354 die if ($name !~ /^dublin_core$/);
355}
356
357sub StartTag {
358 my ($expat, $element) = @_;
359 if ($element eq "dublin_core") {
360 $self->{'saved_metadata'} = {};
361 } elsif ($element eq "dcvalue") {
362 my $metaname = $_{'element'};
363 my $qualifier = $_{'qualifier'};
364 if ($metaname ne "description") {
365 $metaname .= "^$qualifier" if ($qualifier ne "none" && $qualifier ne "");
366 $self->{'metaname'} = "dc.\u$metaname";
367 }
368 }
369}
370
371sub EndTag {
372 my ($expat, $element) = @_;
373
374 if ($element eq "dcvalue") {
375 $self->{'metaname'} = "";
376 }
377}
378
379sub Text {
380 if (defined ($self->{'metaname'}) && $self->{'metaname'} ne "") {
381 # $_ == Metadata content
382 my $mname = $self->{'metaname'};
383 if (defined $self->{'saved_metadata'}->{$mname}) {
384 # accumulate - add value to existing value(s)
385 if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") {
386 push (@{$self->{'saved_metadata'}->{$mname}}, $_);
387 } else {
388 $self->{'saved_metadata'}->{$mname} =
389 [$self->{'saved_metadata'}->{$mname}, $_];
390 }
391 } else {
392 # accumulate - add value into (currently empty) array
393 $self->{'saved_metadata'}->{$mname} = [$_];
394 }
395
396 }
397}
398
399# This Char function overrides the one in XML::Parser::Stream to overcome a
400# problem where $expat->{Text} is treated as the return value, slowing
401# things down significantly in some cases.
402sub Char {
403 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
404 $_[0]->{'Text'} .= $_[1];
405 return undef;
406}
407
4081;
Note: See TracBrowser for help on using the repository browser.