source: trunk/gsdl/perllib/plugins/DSpacePlug.pm@ 9853

Last change on this file since 9853 was 9853, checked in by kjdon, 19 years ago

fixed up maxdocs - now pass an extra parameter to the read function

  • Property svn:keywords set to Author Date Id Revision
File size: 12.3 KB
Line 
1###########################################################################
2#
3# DSpacePlug.pm -- plugin for import the collection from DSpace
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# DSpace Plug - 10/2004
28#
29#
30# This plugin takes "contents" and dublin_core.xml file, which contain
31# Metadata and lists of associated files for a particular document
32# and produces a document containing sections, one for each page.
33# The files should be named "contents" and "dublin_core.xml". For each of
34# document in DSpace, it is stored in one directory
35#
36# The format of the "contents" file is as follows:
37#
38# File.type bundle:ORIGINAL
39# license.txt bundle:LICENSE
40# The format of the "dublin_core.xml" file is as follows:
41# The first line contains any metadata for the whole document
42# <dublin_core>
43# eg.
44# <dcvalue element="Title" qualifier="">Snail farming</dcvalue>
45# <dcvalue element="date" qualifier="">2004-10-15</dcvalue>
46#
47
48package DSpacePlug;
49
50use BasPlug;
51use plugin;
52#use ghtml;
53use XMLParser;
54
55sub BEGIN {
56 @ISA = ('BasPlug');
57}
58
59my $arguments =
60 [ { 'name' => "process_exp",
61 'desc' => "{BasPlug.process_exp}",
62 'type' => "string",
63 'deft' => &get_default_process_exp(),
64 'reqd' => "no" },
65 { 'name' => "only_first_doc",
66 'desc' => "{DSpacePlug.only_first_doc}",
67 'type' => "flag",
68 'reqd' => "no" },
69 { 'name' => "first_inorder_ext",
70 'desc' => "{DSpacePlug.first_inorder_ext}",
71 'type' => "string",
72 'reqd' => "no" },
73 { 'name' => "first_inorder_mime",
74 'desc' => "{DSpacePlug.first_inorder_mime}",
75 'type' => "flag",
76 'reqd' => "no" },
77 { 'name' => "block_exp",
78 'desc' => "{BasPlug.block_exp}",
79 'type' => "regexp",
80 'deft' => &get_default_block_exp(),
81 'reqd' => "no" }];
82
83
84my $options = { 'name' => "DSpacePlug",
85 'desc' => "{DSpacePlug.desc}",
86 'inherits' => "yes",
87 'args' => $arguments };
88
89
90my $primary_doc_lookup = { 'text/html' => '(?i)\.(gif|jpe?g|jpe|jpg|png|css)$' };
91
92# Important variation to regular plugin structure. Need to declare
93# $self as global variable to file so XMLParser callback routines
94# can access the content of the object.
95my ($self);
96
97sub new {
98 my $class = shift (@_);
99 #my $plugin_name = shift (@_);
100
101 $self = new BasPlug ($class, @_);
102 $self->{'plugin_type'} = "DSpacePlug";
103
104 my $option_list = $self->{'option_list'};
105 push( @{$option_list}, $options );
106
107
108 if (!parsargv::parse(\@_,
109 q^only_first_doc^, \$self->{'only_first_doc'},
110 q^first_inorder_ext/.*/^, \$self->{'first_inorder_ext'},
111 q^first_inorder_mime/.*/^, \$self->{'first_inorder_mime'},
112 "allow_extra_options")) {
113 print STDERR "\nDSpacePlug uses an incorrect option.\n";
114 print STDERR "Check your collect.cfg configuration file.\n\n";
115 $self->print_txt_usage(""); # Use default resource bundle
116 die "\n";
117 }
118
119 #create XML::Parser object for parsing dublin_core.xml files
120 my $parser = new XML::Parser('Style' => 'Stream',
121 'Handlers' => {'Char' => \&Char,
122 'Doctype' => \&Doctype
123 });
124 $self->{'parser'} = $parser;
125 $self->{'extra_blocks'} = {};
126
127 return bless $self, $class;
128}
129
130sub get_default_process_exp {
131 my $self = shift (@_);
132
133 return q^(?i)contents$^;
134}
135
136# want to block all files except the "contents"
137sub get_default_block_exp {
138 my $self = shift (@_);
139
140 # Block all files besides contents
141 return q^(?i)(handle|dublin_core\.xml|\.tx?t)$^;
142}
143
144sub read_content
145{
146 my $self = shift (@_);
147 my ($dir, $only_first_doc, $first_inorder_ext, $first_inorder_mime, $mimetype_list) = @_;
148 my $outhandle = $self->{'outhandle'};
149
150 my @fnamemime_list = ();
151 my @assocmime_list = ();
152
153 my $content_fname = &util::filename_cat($dir,"contents");
154
155 open(CIN,"<$content_fname")
156 || die "Unable to open $content_fname: $!\n";
157
158 my $line;
159 my $pos = 0;
160
161 while (defined ($line = <CIN>)) {
162 if ($line =~ m/^(.*)\s+bundle:ORIGINAL\s*$/) {
163 my $fname = $1;
164 my $mtype = $mimetype_list->[$pos];
165 my $fm_rec = { 'file' => $fname, 'mimetype' => $mtype};
166 push(@fnamemime_list,$fm_rec);
167 $pos++;
168 }
169 }
170 close CIN;
171
172 if ($only_first_doc){
173 my ($first_fname, @rest_fnames) = @fnamemime_list;
174 @fnamemime_list = ($first_fname);
175 @assocmime_list = @rest_fnames;
176 }
177
178 # allow user to specify the types of files (inorder)they would like to assign as
179 # a primary bitstream
180 if ($first_inorder_ext) {
181 # parse user-define file extension names
182 my @extfiles_list = split /,/, $first_inorder_ext;
183 my (@rest_fnames) = @fnamemime_list;
184 my @matched_list = ();
185 foreach my $file_ext (@extfiles_list) {
186 $pos = 0;
187 foreach my $allfiles (@fnamemime_list){
188 $allfiles->{'file'} =~ /^(.*)\.(.*?)$/;
189 my $allfiles_ext = $2;
190
191 if ($allfiles_ext =~ /$file_ext/) {
192 print $outhandle "Existing file:$allfiles->{'file'} match the user-define File Extension:$file_ext\n";
193 push (@matched_list, $allfiles);
194
195 # delete the matched extension file from the array
196 splice(@rest_fnames,$pos,1);
197
198 return (\@matched_list, \@rest_fnames);
199
200 }
201 $pos++;
202 }
203 }
204 }
205
206 if ($first_inorder_mime) {
207 # parse user-define file mimetype
208 my @file_mime_list = split /,/, $first_inorder_mime;
209 my (@rest_fnames) = @fnamemime_list;
210 my @matched_list = ();
211 foreach my $file_mime (@file_mime_list) {
212 $pos = 0;
213 foreach my $allfiles (@fnamemime_list){
214 my $allfiles_mime = $allfiles->{'mimetype'};
215
216 if ($allfiles_mime =~ /$file_mime/) {
217 print $outhandle "Existing file:$allfiles->{'file'} match the user-defined File MimeType:$file_mime\n";
218 push (@matched_list, $allfiles);
219
220 # delete the matched MIMEType file from the array
221 splice(@rest_fnames,$pos,1);
222 return (\@matched_list, \@rest_fnames);
223 }
224 $pos++;
225 }
226 }
227 }
228 return (\@fnamemime_list, \@assocmime_list);
229}
230
231
232sub filemime_list_to_re
233{
234 my $self = shift (@_);
235 my ($fnamemime_list) = @_;
236
237 my @fname_list = map { "(".$_->{'file'}.")" } @$fnamemime_list;
238 my $fname_re = join("|",@fname_list);
239 $fname_re =~ s/\./\\\./g;
240
241 return $fname_re;
242}
243
244# Read dublin_core metadata from DSpace collection
245sub metadata_read {
246 my $self = shift (@_);
247 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
248
249 my $only_first_doc = $self->{'only_first_doc'};
250 my $first_inorder_ext = $self->{'first_inorder_ext'};
251 my $first_inorder_mime = $self->{'first_inorder_mime'};
252
253 my $outhandle = $self->{'outhandle'};
254
255 my $filename = &util::filename_cat($base_dir, $file);
256 # return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
257
258 if ($filename !~ /dublin_core\.xml$/ || !-f $filename) {
259 return undef;
260 }
261
262 print $outhandle "DSpacePlug: extracting metadata from $file\n"
263 if $self->{'verbosity'} > 1;
264
265 my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;
266
267 eval {
268 $self->{'parser'}->parsefile($filename);
269 };
270
271 if ($@) {
272 die "DSpacePlug: ERROR $filename is not a well formed dublin_core.xml file ($@)\n";
273 }
274
275 my $mimetype_list = $self->{'saved_metadata'}->{'dc.Format^mimetype'};
276 my ($doc_file_mimes, $assoc_file_mimes) = $self->read_content($dir, $only_first_doc, $first_inorder_ext,
277 $first_inorder_mime, $mimetype_list);
278
279 my $file_re = $self->filemime_list_to_re($doc_file_mimes);
280
281 if ($only_first_doc || $first_inorder_ext || $first_inorder_mime) {
282 foreach my $afm ( @$assoc_file_mimes ) {
283 my $full_af = &util::filename_cat($dir,$afm->{'file'});
284 $self->{'extra_blocks'}->{$full_af} = 1;
285 }
286 }
287 push(@$extrametakeys,$file_re);
288
289 if (defined $self->{'saved_metadata'}->{'dc.Format^extent'}) {
290 delete $self->{'saved_metadata'}->{'dc.Format^extent'};
291 }
292
293 if (defined $mimetype_list) {
294 delete $self->{'saved_metadata'}->{'dc.Format^mimetype'};
295
296 # Temporarily store associate file info in metadata table
297 # This will be removed in 'extra_metadata' in BasPlug and used
298 # to perform the actual file association (once the doc obj has
299 # been formed
300
301 my $main_doc = $doc_file_mimes->[0];
302 my $md_mimetype = $main_doc->{'mimetype'};
303
304 my $pd_lookup = $primary_doc_lookup->{$md_mimetype};
305
306 if (defined $pd_lookup) {
307 my $filter_re = $pd_lookup;
308 @$assoc_file_mimes = grep { $_->{'file'} !~ m/$filter_re/ } @$assoc_file_mimes;
309 }
310
311 my @gsdlassocfile_tobe
312 = map { &util::filename_cat($dir,$_->{'file'}).":".$_->{'mimetype'}.":" } @$assoc_file_mimes;
313 $self->{'saved_metadata'}->{'gsdlassocfile_tobe'} = \@gsdlassocfile_tobe;
314
315 }
316
317 $extrametadata->{$file_re} = $self->{'saved_metadata'};
318
319 return 1;
320}
321
322
323# The DSpacePlug read() function. This function does all the right things
324# to make general options work for a given plugin. It calls the process()
325# function which does all the work specific to a plugin (like the old
326# read functions used to do). Most plugins should define their own
327# process() function and let this read() function keep control.
328#
329# DSpace overrides read() because there is no need to read the actual
330# text of the file in, because the contents of the file is not text...
331#
332# Return number of files processed, undef if can't process
333# Note that $base_dir might be "" and that $file might
334# include directories
335
336sub read {
337 my $self = shift (@_);
338 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
339 my $outhandle = $self->{'outhandle'};
340
341 # Block all files except contents
342 my $filename = &util::filename_cat($base_dir, $file);
343 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
344
345 my $assocfile = $metadata->{'assocfile'};
346
347 return 0 if (($filename =~ /dublin_core\.xml$/) || ($filename =~ /contents$/));
348 return 0 if (defined $self->{'extra_blocks'}->{$filename});
349 return undef;
350}
351
352# do plugin specific processing of doc_obj
353sub process {
354 my $self = shift (@_);
355 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
356 my $outhandle = $self->{'outhandle'};
357
358 return 1;
359}
360
361sub Doctype {
362 my ($expat, $name, $sysid, $pubid, $internal) = @_;
363
364 die if ($name !~ /^dublin_core$/);
365}
366
367sub StartTag {
368 my ($expat, $element) = @_;
369 if ($element eq "dublin_core") {
370 $self->{'saved_metadata'} = {};
371 } elsif ($element eq "dcvalue") {
372 my $metaname = $_{'element'};
373 my $qualifier = $_{'qualifier'};
374 if ($metaname ne "description") {
375 $metaname .= "^$qualifier" if ($qualifier ne "none" && $qualifier ne "");
376 $self->{'metaname'} = "dc.\u$metaname";
377 }
378 }
379}
380
381sub EndTag {
382 my ($expat, $element) = @_;
383
384 if ($element eq "dcvalue") {
385 $self->{'metaname'} = "";
386 }
387}
388
389sub Text {
390 if (defined ($self->{'metaname'}) && $self->{'metaname'} ne "") {
391 # $_ == Metadata content
392 my $mname = $self->{'metaname'};
393 if (defined $self->{'saved_metadata'}->{$mname}) {
394 # accumulate - add value to existing value(s)
395 if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") {
396 push (@{$self->{'saved_metadata'}->{$mname}}, $_);
397 } else {
398 $self->{'saved_metadata'}->{$mname} =
399 [$self->{'saved_metadata'}->{$mname}, $_];
400 }
401 } else {
402 # accumulate - add value into (currently empty) array
403 $self->{'saved_metadata'}->{$mname} = [$_];
404 }
405
406 }
407}
408
409# This Char function overrides the one in XML::Parser::Stream to overcome a
410# problem where $expat->{Text} is treated as the return value, slowing
411# things down significantly in some cases.
412sub Char {
413 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
414 $_[0]->{'Text'} .= $_[1];
415 return undef;
416}
417
4181;
Note: See TracBrowser for help on using the repository browser.