source: trunk/gsdl/perllib/plugins/DSpacePlug.pm@ 9706

Last change on this file since 9706 was 9694, checked in by kjdon, 19 years ago

when an element had no qualifier it was still getting the symbol added to the element name. added a check for empty qualifier

  • Property svn:keywords set to Author Date Id Revision
File size: 12.3 KB
Line 
1###########################################################################
2#
3# DSpacePlug.pm -- plugin for import the collection from DSpace
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# DSpace Plug - 10/2004
28#
29#
30# This plugin takes "contents" and dublin_core.xml file, which contain
31# Metadata and lists of associated files for a particular document
32# and produces a document containing sections, one for each page.
33# The files should be named "contents" and "dublin_core.xml". For each of
34# document in DSpace, it is stored in one directory
35#
36# The format of the "contents" file is as follows:
37#
38# File.type bundle:ORIGINAL
39# license.txt bundle:LICENSE
40# The format of the "dublin_core.xml" file is as follows:
41# The first line contains any metadata for the whole document
42# <dublin_core>
43# eg.
44# <dcvalue element="Title" qualifier="">Snail farming</dcvalue>
45# <dcvalue element="date" qualifier="">2004-10-15</dcvalue>
46#
47
48package DSpacePlug;
49
50use BasPlug;
51use plugin;
52#use ghtml;
53use XMLParser;
54
55sub BEGIN {
56 @ISA = ('BasPlug');
57}
58
59my $arguments =
60 [ { 'name' => "process_exp",
61 'desc' => "{BasPlug.process_exp}",
62 'type' => "string",
63 'deft' => &get_default_process_exp(),
64 'reqd' => "no" },
65 { 'name' => "only_first_doc",
66 'desc' => "{DSpacePlug.only_first_doc}",
67 'type' => "flag",
68 'reqd' => "no" },
69 { 'name' => "first_inorder_ext",
70 'desc' => "{DSpacePlug.first_inorder_ext}",
71 'type' => "string",
72 'reqd' => "no" },
73 { 'name' => "first_inorder_mime",
74 'desc' => "{DSpacePlug.first_inorder_mime}",
75 'type' => "flag",
76 'reqd' => "no" },
77 { 'name' => "block_exp",
78 'desc' => "{BasPlug.block_exp}",
79 'type' => "regexp",
80 'deft' => &get_default_block_exp(),
81 'reqd' => "no" }];
82
83
84my $options = { 'name' => "DSpacePlug",
85 'desc' => "{DSpacePlug.desc}",
86 'inherits' => "yes",
87 'args' => $arguments };
88
89
90my $primary_doc_lookup = { 'text/html' => '(?i)\.(gif|jpe?g|jpe|jpg|png|css)$' };
91
92# Important variation to regular plugin structure. Need to declare
93# $self as global variable to file so XMLParser callback routines
94# can access the content of the object.
95my ($self);
96
97sub new {
98 my $class = shift (@_);
99 #my $plugin_name = shift (@_);
100
101 $self = new BasPlug ($class, @_);
102 $self->{'plugin_type'} = "DSpacePlug";
103
104 my $option_list = $self->{'option_list'};
105 push( @{$option_list}, $options );
106
107
108 if (!parsargv::parse(\@_,
109 q^only_first_doc^, \$self->{'only_first_doc'},
110 q^first_inorder_ext/.*/^, \$self->{'first_inorder_ext'},
111 q^first_inorder_mime/.*/^, \$self->{'first_inorder_mime'},
112 "allow_extra_options")) {
113 print STDERR "\nDSpacePlug uses an incorrect option.\n";
114 print STDERR "Check your collect.cfg configuration file.\n\n";
115 $self->print_txt_usage(""); # Use default resource bundle
116 die "\n";
117 }
118
119 #create XML::Parser object for parsing dublin_core.xml files
120 my $parser = new XML::Parser('Style' => 'Stream',
121 'Handlers' => {'Char' => \&Char,
122 'Doctype' => \&Doctype
123 });
124 $self->{'parser'} = $parser;
125 $self->{'extra_blocks'} = {};
126
127 return bless $self, $class;
128}
129
130sub get_default_process_exp {
131 my $self = shift (@_);
132
133 return q^(?i)contents$^;
134}
135
136# want to block all files except the "contents"
137sub get_default_block_exp {
138 my $self = shift (@_);
139
140 # Block all files besides contents
141 return q^(?i)(handle|dublin_core\.xml|\.tx?t)$^;
142}
143
144sub read_content
145{
146 my $self = shift (@_);
147 my ($dir, $only_first_doc, $first_inorder_ext, $first_inorder_mime, $mimetype_list) = @_;
148 my $outhandle = $self->{'outhandle'};
149
150 my @fnamemime_list = ();
151 my @assocmime_list = ();
152
153 my $content_fname = &util::filename_cat($dir,"contents");
154
155 open(CIN,"<$content_fname")
156 || die "Unable to open $content_fname: $!\n";
157
158 my $line;
159 my $pos = 0;
160
161 while (defined ($line = <CIN>)) {
162 if ($line =~ m/^(.*)\s+bundle:ORIGINAL\s*$/) {
163 my $fname = $1;
164 my $mtype = $mimetype_list->[$pos];
165 my $fm_rec = { 'file' => $fname, 'mimetype' => $mtype};
166 push(@fnamemime_list,$fm_rec);
167 $pos++;
168 }
169 }
170 close CIN;
171
172 if ($only_first_doc){
173 my ($first_fname, @rest_fnames) = @fnamemime_list;
174 @fnamemime_list = ($first_fname);
175 @assocmime_list = @rest_fnames;
176 }
177
178 # allow user to specify the types of files (inorder)they would like to assign as
179 # a primary bitstream
180 if ($first_inorder_ext) {
181 # parse user-define file extension names
182 my @extfiles_list = split /,/, $first_inorder_ext;
183 my (@rest_fnames) = @fnamemime_list;
184 my @matched_list = ();
185 foreach my $file_ext (@extfiles_list) {
186 $pos = 0;
187 foreach my $allfiles (@fnamemime_list){
188 $allfiles->{'file'} =~ /^(.*)\.(.*?)$/;
189 my $allfiles_ext = $2;
190
191 if ($allfiles_ext =~ /$file_ext/) {
192 print $outhandle "Existing file:$allfiles->{'file'} match the user-define File Extension:$file_ext\n";
193 push (@matched_list, $allfiles);
194
195 # delete the matched extension file from the array
196 splice(@rest_fnames,$pos,1);
197
198 return (\@matched_list, \@rest_fnames);
199
200 }
201 $pos++;
202 }
203 }
204 }
205
206 if ($first_inorder_mime) {
207 # parse user-define file mimetype
208 my @file_mime_list = split /,/, $first_inorder_mime;
209 my (@rest_fnames) = @fnamemime_list;
210 my @matched_list = ();
211 foreach my $file_mime (@file_mime_list) {
212 $pos = 0;
213 foreach my $allfiles (@fnamemime_list){
214 my $allfiles_mime = $allfiles->{'mimetype'};
215
216 if ($allfiles_mime =~ /$file_mime/) {
217 print $outhandle "Existing file:$allfiles->{'file'} match the user-defined File MimeType:$file_mime\n";
218 push (@matched_list, $allfiles);
219
220 # delete the matched MIMEType file from the array
221 splice(@rest_fnames,$pos,1);
222 return (\@matched_list, \@rest_fnames);
223 }
224 $pos++;
225 }
226 }
227 }
228 return (\@fnamemime_list, \@assocmime_list);
229}
230
231
232sub filemime_list_to_re
233{
234 my $self = shift (@_);
235 my ($fnamemime_list) = @_;
236
237 my @fname_list = map { "(".$_->{'file'}.")" } @$fnamemime_list;
238 my $fname_re = join("|",@fname_list);
239 $fname_re =~ s/\./\\\./g;
240
241 return $fname_re;
242}
243
244# Read dublin_core metadata from DSpace collection
245sub metadata_read {
246 my $self = shift (@_);
247 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs) = @_;
248
249 my $only_first_doc = $self->{'only_first_doc'};
250 my $first_inorder_ext = $self->{'first_inorder_ext'};
251 my $first_inorder_mime = $self->{'first_inorder_mime'};
252
253 my $outhandle = $self->{'outhandle'};
254
255 my $filename = &util::filename_cat($base_dir, $file);
256 # return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
257
258 if ($filename !~ /dublin_core\.xml$/ || !-f $filename) {
259 return undef;
260 }
261
262 print $outhandle "DSpacePlug: extracting metadata from $file\n"
263 if $self->{'verbosity'} > 1;
264
265 my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;
266
267 eval {
268 $self->{'parser'}->parsefile($filename);
269 };
270
271 if ($@) {
272 die "DSpacePlug: ERROR $filename is not a well formed dublin_core.xml file ($@)\n";
273 }
274
275 my $mimetype_list = $self->{'saved_metadata'}->{'dc.Format^mimetype'};
276 my ($doc_file_mimes, $assoc_file_mimes) = $self->read_content($dir, $only_first_doc, $first_inorder_ext,
277 $first_inorder_mime, $mimetype_list);
278
279 my $file_re = $self->filemime_list_to_re($doc_file_mimes);
280
281 if ($only_first_doc || $first_inorder_ext || $first_inorder_mime) {
282 foreach my $afm ( @$assoc_file_mimes ) {
283 my $full_af = &util::filename_cat($dir,$afm->{'file'});
284 $self->{'extra_blocks'}->{$full_af} = 1;
285 }
286 }
287 push(@$extrametakeys,$file_re);
288
289 if (defined $self->{'saved_metadata'}->{'dc.Format^extent'}) {
290 delete $self->{'saved_metadata'}->{'dc.Format^extent'};
291 }
292
293 if (defined $mimetype_list) {
294 delete $self->{'saved_metadata'}->{'dc.Format^mimetype'};
295
296 # Temporarily store associate file info in metadata table
297 # This will be removed in 'extra_metadata' in BasPlug and used
298 # to perform the actual file association (once the doc obj has
299 # been formed
300
301 my $main_doc = $doc_file_mimes->[0];
302 my $md_mimetype = $main_doc->{'mimetype'};
303
304 my $pd_lookup = $primary_doc_lookup->{$md_mimetype};
305
306 if (defined $pd_lookup) {
307 my $filter_re = $pd_lookup;
308 @$assoc_file_mimes = grep { $_->{'file'} !~ m/$filter_re/ } @$assoc_file_mimes;
309 }
310
311 my @gsdlassocfile_tobe
312 = map { &util::filename_cat($dir,$_->{'file'}).":".$_->{'mimetype'}.":" } @$assoc_file_mimes;
313 $self->{'saved_metadata'}->{'gsdlassocfile_tobe'} = \@gsdlassocfile_tobe;
314
315 }
316
317 $extrametadata->{$file_re} = $self->{'saved_metadata'};
318
319 return 1;
320}
321
322
323# The DSpacePlug read() function. This function does all the right things
324# to make general options work for a given plugin. It calls the process()
325# function which does all the work specific to a plugin (like the old
326# read functions used to do). Most plugins should define their own
327# process() function and let this read() function keep control.
328#
329# DSpace overrides read() because there is no need to read the actual
330# text of the file in, because the contents of the file is not text...
331#
332# Return number of files processed, undef if can't process
333# Note that $base_dir might be "" and that $file might
334# include directories
335
336sub read {
337 my $self = shift (@_);
338 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
339 my $outhandle = $self->{'outhandle'};
340
341 # Block all files except contents
342 my $filename = &util::filename_cat($base_dir, $file);
343 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
344
345 my $assocfile = $metadata->{'assocfile'};
346
347 return 0 if (($filename =~ /dublin_core\.xml$/) || ($filename =~ /contents$/));
348 return 0 if (defined $self->{'extra_blocks'}->{$filename});
349 return undef;
350}
351
352# do plugin specific processing of doc_obj
353sub process {
354 my $self = shift (@_);
355 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
356 my $outhandle = $self->{'outhandle'};
357
358 return 1;
359}
360
361sub Doctype {
362 my ($expat, $name, $sysid, $pubid, $internal) = @_;
363
364 die if ($name !~ /^dublin_core$/);
365}
366
367sub StartTag {
368 my ($expat, $element) = @_;
369 if ($element eq "dublin_core") {
370 $self->{'saved_metadata'} = {};
371 } elsif ($element eq "dcvalue") {
372 my $metaname = $_{'element'};
373 my $qualifier = $_{'qualifier'};
374 if ($metaname ne "description") {
375 $metaname .= "^$qualifier" if ($qualifier ne "none" && $qualifier ne "");
376 $self->{'metaname'} = "dc.\u$metaname";
377 }
378 }
379}
380
381sub EndTag {
382 my ($expat, $element) = @_;
383
384 if ($element eq "dcvalue") {
385 $self->{'metaname'} = "";
386 }
387}
388
389sub Text {
390 if (defined ($self->{'metaname'}) && $self->{'metaname'} ne "") {
391 # $_ == Metadata content
392 my $mname = $self->{'metaname'};
393 if (defined $self->{'saved_metadata'}->{$mname}) {
394 # accumulate - add value to existing value(s)
395 if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") {
396 push (@{$self->{'saved_metadata'}->{$mname}}, $_);
397 } else {
398 $self->{'saved_metadata'}->{$mname} =
399 [$self->{'saved_metadata'}->{$mname}, $_];
400 }
401 } else {
402 # accumulate - add value into (currently empty) array
403 $self->{'saved_metadata'}->{$mname} = [$_];
404 }
405
406 }
407}
408
409# This Char function overrides the one in XML::Parser::Stream to overcome a
410# problem where $expat->{Text} is treated as the return value, slowing
411# things down significantly in some cases.
412sub Char {
413 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
414 $_[0]->{'Text'} .= $_[1];
415 return undef;
416}
417
4181;
Note: See TracBrowser for help on using the repository browser.