source: trunk/gsdl/perllib/plugins/DSpacePlug.pm@ 11090

Last change on this file since 11090 was 10549, checked in by chi, 19 years ago

Modifications to deal with the "dc value" without qualifier.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.1 KB
Line 
1
2###########################################################################
3#
4# DSpacePlug.pm -- plugin for import the collection from DSpace
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 1999 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# DSpace Plug - 10/2004
29#
30#
31# This plugin takes "contents" and dublin_core.xml file, which contain
32# Metadata and lists of associated files for a particular document
33# and produces a document containing sections, one for each page.
34# The files should be named "contents" and "dublin_core.xml". For each of
35# document in DSpace, it is stored in one directory
36#
37# The format of the "contents" file is as follows:
38#
39# File.type bundle:ORIGINAL
40# license.txt bundle:LICENSE
41# The format of the "dublin_core.xml" file is as follows:
42# The first line contains any metadata for the whole document
43# <dublin_core>
44# eg.
45# <dcvalue element="Title" qualifier="">Snail farming</dcvalue>
46# <dcvalue element="date" qualifier="">2004-10-15</dcvalue>
47#
48
49package DSpacePlug;
50
51use BasPlug;
52use plugin;
53#use ghtml;
54use XMLParser;
55use strict;
56no strict 'refs'; # allow filehandles to be variables and viceversa
57
58sub BEGIN {
59 @DSpacePlug::ISA = ('BasPlug');
60}
61
62my $arguments =
63 [ { 'name' => "process_exp",
64 'desc' => "{BasPlug.process_exp}",
65 'type' => "string",
66 'deft' => &get_default_process_exp(),
67 'reqd' => "no" },
68 { 'name' => "only_first_doc",
69 'desc' => "{DSpacePlug.only_first_doc}",
70 'type' => "flag",
71 'reqd' => "no" },
72 { 'name' => "first_inorder_ext",
73 'desc' => "{DSpacePlug.first_inorder_ext}",
74 'type' => "string",
75 'reqd' => "no" },
76 { 'name' => "first_inorder_mime",
77 'desc' => "{DSpacePlug.first_inorder_mime}",
78 'type' => "flag",
79 'reqd' => "no" },
80 { 'name' => "block_exp",
81 'desc' => "{BasPlug.block_exp}",
82 'type' => "regexp",
83 'deft' => &get_default_block_exp(),
84 'reqd' => "no" }];
85
86
87my $options = { 'name' => "DSpacePlug",
88 'desc' => "{DSpacePlug.desc}",
89 'inherits' => "yes",
90 'args' => $arguments };
91
92
93my $primary_doc_lookup = { 'text/html' => '(?i)\.(gif|jpe?g|jpe|jpg|png|css)$' };
94
95# Important variation to regular plugin structure. Need to declare
96# $self as global variable to file so XMLParser callback routines
97# can access the content of the object.
98my ($self);
99
100sub new {
101 my ($class) = shift (@_);
102 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
103 push(@$pluginlist, $class);
104
105 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
106 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
107
108 $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
109
110 #create XML::Parser object for parsing dublin_core.xml files
111 my $parser = new XML::Parser('Style' => 'Stream',
112 'Handlers' => {'Char' => \&Char,
113 'Doctype' => \&Doctype
114 });
115 $self->{'parser'} = $parser;
116 $self->{'extra_blocks'} = {};
117
118 return bless $self, $class;
119}
120
121sub get_default_process_exp {
122 my $self = shift (@_);
123
124 return q^(?i)contents$^;
125}
126
127# want to block all files except the "contents"
128sub get_default_block_exp {
129 my $self = shift (@_);
130
131 # Block all files besides contents
132 return q^(?i)(handle|dublin_core\.xml|\.tx?t)$^;
133}
134
135sub read_content
136{
137 my $self = shift (@_);
138 my ($dir, $only_first_doc, $first_inorder_ext, $first_inorder_mime, $mimetype_list) = @_;
139 my $outhandle = $self->{'outhandle'};
140
141 my @fnamemime_list = ();
142 my @assocmime_list = ();
143
144 my $content_fname = &util::filename_cat($dir,"contents");
145
146 open(CIN,"<$content_fname")
147 || die "Unable to open $content_fname: $!\n";
148
149 my $line;
150 my $pos = 0;
151
152 while (defined ($line = <CIN>)) {
153 if ($line =~ m/^(.*)\s+bundle:ORIGINAL\s*$/) {
154 my $fname = $1;
155 my $mtype = $mimetype_list->[$pos];
156 my $fm_rec = { 'file' => $fname, 'mimetype' => $mtype};
157 push(@fnamemime_list,$fm_rec);
158 $pos++;
159 }
160 }
161 close CIN;
162
163 if ($only_first_doc){
164 my ($first_fname, @rest_fnames) = @fnamemime_list;
165 @fnamemime_list = ($first_fname);
166 @assocmime_list = @rest_fnames;
167 }
168
169 # allow user to specify the types of files (inorder)they would like to assign as
170 # a primary bitstream
171 if ($first_inorder_ext) {
172 # parse user-define file extension names
173 my @extfiles_list = split /,/, $first_inorder_ext;
174 my (@rest_fnames) = @fnamemime_list;
175 my @matched_list = ();
176 foreach my $file_ext (@extfiles_list) {
177 $pos = 0;
178 foreach my $allfiles (@fnamemime_list){
179 $allfiles->{'file'} =~ /^(.*)\.(.*?)$/;
180 my $allfiles_ext = $2;
181
182 if ($allfiles_ext =~ /$file_ext/) {
183 print $outhandle "Existing file:$allfiles->{'file'} match the user-define File Extension:$file_ext\n";
184 push (@matched_list, $allfiles);
185
186 # delete the matched extension file from the array
187 splice(@rest_fnames,$pos,1);
188
189 return (\@matched_list, \@rest_fnames);
190
191 }
192 $pos++;
193 }
194 }
195 }
196
197 if ($first_inorder_mime) {
198 # parse user-define file mimetype
199 my @file_mime_list = split /,/, $first_inorder_mime;
200 my (@rest_fnames) = @fnamemime_list;
201 my @matched_list = ();
202 foreach my $file_mime (@file_mime_list) {
203 $pos = 0;
204 foreach my $allfiles (@fnamemime_list){
205 my $allfiles_mime = $allfiles->{'mimetype'};
206
207 if ($allfiles_mime =~ /$file_mime/) {
208 print $outhandle "Existing file:$allfiles->{'file'} match the user-defined File MimeType:$file_mime\n";
209 push (@matched_list, $allfiles);
210
211 # delete the matched MIMEType file from the array
212 splice(@rest_fnames,$pos,1);
213 return (\@matched_list, \@rest_fnames);
214 }
215 $pos++;
216 }
217 }
218 }
219 return (\@fnamemime_list, \@assocmime_list);
220}
221
222
223sub filemime_list_to_re
224{
225 my $self = shift (@_);
226 my ($fnamemime_list) = @_;
227
228 my @fname_list = map { "(".$_->{'file'}.")" } @$fnamemime_list;
229 my $fname_re = join("|",@fname_list);
230 $fname_re =~ s/\./\\\./g;
231
232 return $fname_re;
233}
234
235# Read dublin_core metadata from DSpace collection
236sub metadata_read {
237 my $self = shift (@_);
238 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
239
240 my $only_first_doc = $self->{'only_first_doc'};
241 my $first_inorder_ext = $self->{'first_inorder_ext'};
242 my $first_inorder_mime = $self->{'first_inorder_mime'};
243
244 my $outhandle = $self->{'outhandle'};
245
246 my $filename = &util::filename_cat($base_dir, $file);
247 # return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
248
249 if ($filename !~ /dublin_core\.xml$/ || !-f $filename) {
250 return undef;
251 }
252
253 print $outhandle "DSpacePlug: extracting metadata from $file\n"
254 if $self->{'verbosity'} > 1;
255
256 my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;
257
258 eval {
259 $self->{'parser'}->parsefile($filename);
260 };
261
262 if ($@) {
263 die "DSpacePlug: ERROR $filename is not a well formed dublin_core.xml file ($@)\n";
264 }
265
266 my $mimetype_list = $self->{'saved_metadata'}->{'dc.Format^mimetype'};
267 my ($doc_file_mimes, $assoc_file_mimes) = $self->read_content($dir, $only_first_doc, $first_inorder_ext,
268 $first_inorder_mime, $mimetype_list);
269
270 my $file_re = $self->filemime_list_to_re($doc_file_mimes);
271
272 if ($only_first_doc || $first_inorder_ext || $first_inorder_mime) {
273 foreach my $afm ( @$assoc_file_mimes ) {
274 my $full_af = &util::filename_cat($dir,$afm->{'file'});
275 $self->{'extra_blocks'}->{$full_af} = 1;
276 }
277 }
278 push(@$extrametakeys,$file_re);
279
280 if (defined $self->{'saved_metadata'}->{'dc.Format^extent'}) {
281 delete $self->{'saved_metadata'}->{'dc.Format^extent'};
282 }
283
284 if (defined $mimetype_list) {
285 delete $self->{'saved_metadata'}->{'dc.Format^mimetype'};
286
287 # Temporarily store associate file info in metadata table
288 # This will be removed in 'extra_metadata' in BasPlug and used
289 # to perform the actual file association (once the doc obj has
290 # been formed
291
292 my $main_doc = $doc_file_mimes->[0];
293 my $md_mimetype = $main_doc->{'mimetype'};
294
295 my $pd_lookup = $primary_doc_lookup->{$md_mimetype};
296
297 if (defined $pd_lookup) {
298 my $filter_re = $pd_lookup;
299 @$assoc_file_mimes = grep { $_->{'file'} !~ m/$filter_re/ } @$assoc_file_mimes;
300 }
301
302 my @gsdlassocfile_tobe
303 = map { &util::filename_cat($dir,$_->{'file'}).":".$_->{'mimetype'}.":" } @$assoc_file_mimes;
304 $self->{'saved_metadata'}->{'gsdlassocfile_tobe'} = \@gsdlassocfile_tobe;
305
306 }
307
308 $extrametadata->{$file_re} = $self->{'saved_metadata'};
309
310 return 1;
311}
312
313
314# The DSpacePlug read() function. This function does all the right things
315# to make general options work for a given plugin. It calls the process()
316# function which does all the work specific to a plugin (like the old
317# read functions used to do). Most plugins should define their own
318# process() function and let this read() function keep control.
319#
320# DSpace overrides read() because there is no need to read the actual
321# text of the file in, because the contents of the file is not text...
322#
323# Return number of files processed, undef if can't process
324# Note that $base_dir might be "" and that $file might
325# include directories
326
327sub read {
328 my $self = shift (@_);
329 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
330 my $outhandle = $self->{'outhandle'};
331
332 # Block all files except contents
333 my $filename = &util::filename_cat($base_dir, $file);
334 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
335
336 my $assocfile = $metadata->{'assocfile'};
337
338 return 0 if (($filename =~ /dublin_core\.xml$/) || ($filename =~ /contents$/));
339 return 0 if (defined $self->{'extra_blocks'}->{$filename});
340 return undef;
341}
342
343# do plugin specific processing of doc_obj
344sub process {
345 my $self = shift (@_);
346 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
347 my $outhandle = $self->{'outhandle'};
348
349 return 1;
350}
351
352sub Doctype {
353 my ($expat, $name, $sysid, $pubid, $internal) = @_;
354
355 die if ($name !~ /^dublin_core$/);
356}
357
358sub StartTag {
359 my ($expat, $element) = @_;
360 if ($element eq "dublin_core") {
361 $self->{'saved_metadata'} = {};
362 } elsif ($element eq "dcvalue") {
363 my $metaname = $_{'element'};
364 my $qualifier = $_{'qualifier'}||"";
365 if ($metaname ne "description") {
366 $metaname .= "^$qualifier" if ($qualifier ne "none" && $qualifier ne "");
367 $self->{'metaname'} = "dc.\u$metaname";
368 }
369 }
370}
371
372sub EndTag {
373 my ($expat, $element) = @_;
374
375 if ($element eq "dcvalue") {
376 $self->{'metaname'} = "";
377 }
378}
379
380sub Text {
381 if (defined ($self->{'metaname'}) && $self->{'metaname'} ne "") {
382 # $_ == Metadata content
383 my $mname = $self->{'metaname'};
384 if (defined $self->{'saved_metadata'}->{$mname}) {
385 # accumulate - add value to existing value(s)
386 if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") {
387 push (@{$self->{'saved_metadata'}->{$mname}}, $_);
388 } else {
389 $self->{'saved_metadata'}->{$mname} =
390 [$self->{'saved_metadata'}->{$mname}, $_];
391 }
392 } else {
393 # accumulate - add value into (currently empty) array
394 $self->{'saved_metadata'}->{$mname} = [$_];
395 }
396
397 }
398}
399
400# This Char function overrides the one in XML::Parser::Stream to overcome a
401# problem where $expat->{Text} is treated as the return value, slowing
402# things down significantly in some cases.
403sub Char {
404 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
405 $_[0]->{'Text'} .= $_[1];
406 return undef;
407}
408
4091;
Note: See TracBrowser for help on using the repository browser.