source: trunk/gsdl/perllib/plugins/DSpacePlug.pm@ 10218

Last change on this file since 10218 was 10218, checked in by kjdon, 19 years ago

Jeffrey's new parsing modifications, committed approx 6 July, 15.16

  • Property svn:keywords set to Author Date Id Revision
File size: 12.0 KB
Line 
1###########################################################################
2#
3# DSpacePlug.pm -- plugin for import the collection from DSpace
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27# DSpace Plug - 10/2004
28#
29#
30# This plugin takes "contents" and dublin_core.xml file, which contain
31# Metadata and lists of associated files for a particular document
32# and produces a document containing sections, one for each page.
33# The files should be named "contents" and "dublin_core.xml". For each of
34# document in DSpace, it is stored in one directory
35#
36# The format of the "contents" file is as follows:
37#
38# File.type bundle:ORIGINAL
39# license.txt bundle:LICENSE
40# The format of the "dublin_core.xml" file is as follows:
41# The first line contains any metadata for the whole document
42# <dublin_core>
43# eg.
44# <dcvalue element="Title" qualifier="">Snail farming</dcvalue>
45# <dcvalue element="date" qualifier="">2004-10-15</dcvalue>
46#
47
48package DSpacePlug;
49
50use BasPlug;
51use plugin;
52#use ghtml;
53use XMLParser;
54
55sub BEGIN {
56 @DSpacePlug::ISA = ('BasPlug');
57}
58
59my $arguments =
60 [ { 'name' => "process_exp",
61 'desc' => "{BasPlug.process_exp}",
62 'type' => "string",
63 'deft' => &get_default_process_exp(),
64 'reqd' => "no" },
65 { 'name' => "only_first_doc",
66 'desc' => "{DSpacePlug.only_first_doc}",
67 'type' => "flag",
68 'reqd' => "no" },
69 { 'name' => "first_inorder_ext",
70 'desc' => "{DSpacePlug.first_inorder_ext}",
71 'type' => "string",
72 'reqd' => "no" },
73 { 'name' => "first_inorder_mime",
74 'desc' => "{DSpacePlug.first_inorder_mime}",
75 'type' => "flag",
76 'reqd' => "no" },
77 { 'name' => "block_exp",
78 'desc' => "{BasPlug.block_exp}",
79 'type' => "regexp",
80 'deft' => &get_default_block_exp(),
81 'reqd' => "no" }];
82
83
84my $options = { 'name' => "DSpacePlug",
85 'desc' => "{DSpacePlug.desc}",
86 'inherits' => "yes",
87 'args' => $arguments };
88
89
90my $primary_doc_lookup = { 'text/html' => '(?i)\.(gif|jpe?g|jpe|jpg|png|css)$' };
91
92# Important variation to regular plugin structure. Need to declare
93# $self as global variable to file so XMLParser callback routines
94# can access the content of the object.
95my ($self);
96
97sub new {
98 my ($class) = shift (@_);
99 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
100 push(@$pluginlist, $class);
101
102 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
103 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
104
105 $self = (defined $hashArgOptLists)? new BasPlug($pluginlist,$inputargs,$hashArgOptLists): new BasPlug($pluginlist,$inputargs);
106
107 #create XML::Parser object for parsing dublin_core.xml files
108 my $parser = new XML::Parser('Style' => 'Stream',
109 'Handlers' => {'Char' => \&Char,
110 'Doctype' => \&Doctype
111 });
112 $self->{'parser'} = $parser;
113 $self->{'extra_blocks'} = {};
114
115 return bless $self, $class;
116}
117
118sub get_default_process_exp {
119 my $self = shift (@_);
120
121 return q^(?i)contents$^;
122}
123
124# want to block all files except the "contents"
125sub get_default_block_exp {
126 my $self = shift (@_);
127
128 # Block all files besides contents
129 return q^(?i)(handle|dublin_core\.xml|\.tx?t)$^;
130}
131
132sub read_content
133{
134 my $self = shift (@_);
135 my ($dir, $only_first_doc, $first_inorder_ext, $first_inorder_mime, $mimetype_list) = @_;
136 my $outhandle = $self->{'outhandle'};
137
138 my @fnamemime_list = ();
139 my @assocmime_list = ();
140
141 my $content_fname = &util::filename_cat($dir,"contents");
142
143 open(CIN,"<$content_fname")
144 || die "Unable to open $content_fname: $!\n";
145
146 my $line;
147 my $pos = 0;
148
149 while (defined ($line = <CIN>)) {
150 if ($line =~ m/^(.*)\s+bundle:ORIGINAL\s*$/) {
151 my $fname = $1;
152 my $mtype = $mimetype_list->[$pos];
153 my $fm_rec = { 'file' => $fname, 'mimetype' => $mtype};
154 push(@fnamemime_list,$fm_rec);
155 $pos++;
156 }
157 }
158 close CIN;
159
160 if ($only_first_doc){
161 my ($first_fname, @rest_fnames) = @fnamemime_list;
162 @fnamemime_list = ($first_fname);
163 @assocmime_list = @rest_fnames;
164 }
165
166 # allow user to specify the types of files (inorder)they would like to assign as
167 # a primary bitstream
168 if ($first_inorder_ext) {
169 # parse user-define file extension names
170 my @extfiles_list = split /,/, $first_inorder_ext;
171 my (@rest_fnames) = @fnamemime_list;
172 my @matched_list = ();
173 foreach my $file_ext (@extfiles_list) {
174 $pos = 0;
175 foreach my $allfiles (@fnamemime_list){
176 $allfiles->{'file'} =~ /^(.*)\.(.*?)$/;
177 my $allfiles_ext = $2;
178
179 if ($allfiles_ext =~ /$file_ext/) {
180 print $outhandle "Existing file:$allfiles->{'file'} match the user-define File Extension:$file_ext\n";
181 push (@matched_list, $allfiles);
182
183 # delete the matched extension file from the array
184 splice(@rest_fnames,$pos,1);
185
186 return (\@matched_list, \@rest_fnames);
187
188 }
189 $pos++;
190 }
191 }
192 }
193
194 if ($first_inorder_mime) {
195 # parse user-define file mimetype
196 my @file_mime_list = split /,/, $first_inorder_mime;
197 my (@rest_fnames) = @fnamemime_list;
198 my @matched_list = ();
199 foreach my $file_mime (@file_mime_list) {
200 $pos = 0;
201 foreach my $allfiles (@fnamemime_list){
202 my $allfiles_mime = $allfiles->{'mimetype'};
203
204 if ($allfiles_mime =~ /$file_mime/) {
205 print $outhandle "Existing file:$allfiles->{'file'} match the user-defined File MimeType:$file_mime\n";
206 push (@matched_list, $allfiles);
207
208 # delete the matched MIMEType file from the array
209 splice(@rest_fnames,$pos,1);
210 return (\@matched_list, \@rest_fnames);
211 }
212 $pos++;
213 }
214 }
215 }
216 return (\@fnamemime_list, \@assocmime_list);
217}
218
219
220sub filemime_list_to_re
221{
222 my $self = shift (@_);
223 my ($fnamemime_list) = @_;
224
225 my @fname_list = map { "(".$_->{'file'}.")" } @$fnamemime_list;
226 my $fname_re = join("|",@fname_list);
227 $fname_re =~ s/\./\\\./g;
228
229 return $fname_re;
230}
231
232# Read dublin_core metadata from DSpace collection
233sub metadata_read {
234 my $self = shift (@_);
235 my ($pluginfo, $base_dir, $file, $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;
236
237 my $only_first_doc = $self->{'only_first_doc'};
238 my $first_inorder_ext = $self->{'first_inorder_ext'};
239 my $first_inorder_mime = $self->{'first_inorder_mime'};
240
241 my $outhandle = $self->{'outhandle'};
242
243 my $filename = &util::filename_cat($base_dir, $file);
244 # return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
245
246 if ($filename !~ /dublin_core\.xml$/ || !-f $filename) {
247 return undef;
248 }
249
250 print $outhandle "DSpacePlug: extracting metadata from $file\n"
251 if $self->{'verbosity'} > 1;
252
253 my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;
254
255 eval {
256 $self->{'parser'}->parsefile($filename);
257 };
258
259 if ($@) {
260 die "DSpacePlug: ERROR $filename is not a well formed dublin_core.xml file ($@)\n";
261 }
262
263 my $mimetype_list = $self->{'saved_metadata'}->{'dc.Format^mimetype'};
264 my ($doc_file_mimes, $assoc_file_mimes) = $self->read_content($dir, $only_first_doc, $first_inorder_ext,
265 $first_inorder_mime, $mimetype_list);
266
267 my $file_re = $self->filemime_list_to_re($doc_file_mimes);
268
269 if ($only_first_doc || $first_inorder_ext || $first_inorder_mime) {
270 foreach my $afm ( @$assoc_file_mimes ) {
271 my $full_af = &util::filename_cat($dir,$afm->{'file'});
272 $self->{'extra_blocks'}->{$full_af} = 1;
273 }
274 }
275 push(@$extrametakeys,$file_re);
276
277 if (defined $self->{'saved_metadata'}->{'dc.Format^extent'}) {
278 delete $self->{'saved_metadata'}->{'dc.Format^extent'};
279 }
280
281 if (defined $mimetype_list) {
282 delete $self->{'saved_metadata'}->{'dc.Format^mimetype'};
283
284 # Temporarily store associate file info in metadata table
285 # This will be removed in 'extra_metadata' in BasPlug and used
286 # to perform the actual file association (once the doc obj has
287 # been formed
288
289 my $main_doc = $doc_file_mimes->[0];
290 my $md_mimetype = $main_doc->{'mimetype'};
291
292 my $pd_lookup = $primary_doc_lookup->{$md_mimetype};
293
294 if (defined $pd_lookup) {
295 my $filter_re = $pd_lookup;
296 @$assoc_file_mimes = grep { $_->{'file'} !~ m/$filter_re/ } @$assoc_file_mimes;
297 }
298
299 my @gsdlassocfile_tobe
300 = map { &util::filename_cat($dir,$_->{'file'}).":".$_->{'mimetype'}.":" } @$assoc_file_mimes;
301 $self->{'saved_metadata'}->{'gsdlassocfile_tobe'} = \@gsdlassocfile_tobe;
302
303 }
304
305 $extrametadata->{$file_re} = $self->{'saved_metadata'};
306
307 return 1;
308}
309
310
311# The DSpacePlug read() function. This function does all the right things
312# to make general options work for a given plugin. It calls the process()
313# function which does all the work specific to a plugin (like the old
314# read functions used to do). Most plugins should define their own
315# process() function and let this read() function keep control.
316#
317# DSpace overrides read() because there is no need to read the actual
318# text of the file in, because the contents of the file is not text...
319#
320# Return number of files processed, undef if can't process
321# Note that $base_dir might be "" and that $file might
322# include directories
323
324sub read {
325 my $self = shift (@_);
326 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
327 my $outhandle = $self->{'outhandle'};
328
329 # Block all files except contents
330 my $filename = &util::filename_cat($base_dir, $file);
331 return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
332
333 my $assocfile = $metadata->{'assocfile'};
334
335 return 0 if (($filename =~ /dublin_core\.xml$/) || ($filename =~ /contents$/));
336 return 0 if (defined $self->{'extra_blocks'}->{$filename});
337 return undef;
338}
339
340# do plugin specific processing of doc_obj
341sub process {
342 my $self = shift (@_);
343 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;
344 my $outhandle = $self->{'outhandle'};
345
346 return 1;
347}
348
349sub Doctype {
350 my ($expat, $name, $sysid, $pubid, $internal) = @_;
351
352 die if ($name !~ /^dublin_core$/);
353}
354
355sub StartTag {
356 my ($expat, $element) = @_;
357 if ($element eq "dublin_core") {
358 $self->{'saved_metadata'} = {};
359 } elsif ($element eq "dcvalue") {
360 my $metaname = $_{'element'};
361 my $qualifier = $_{'qualifier'};
362 if ($metaname ne "description") {
363 $metaname .= "^$qualifier" if ($qualifier ne "none" && $qualifier ne "");
364 $self->{'metaname'} = "dc.\u$metaname";
365 }
366 }
367}
368
369sub EndTag {
370 my ($expat, $element) = @_;
371
372 if ($element eq "dcvalue") {
373 $self->{'metaname'} = "";
374 }
375}
376
377sub Text {
378 if (defined ($self->{'metaname'}) && $self->{'metaname'} ne "") {
379 # $_ == Metadata content
380 my $mname = $self->{'metaname'};
381 if (defined $self->{'saved_metadata'}->{$mname}) {
382 # accumulate - add value to existing value(s)
383 if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") {
384 push (@{$self->{'saved_metadata'}->{$mname}}, $_);
385 } else {
386 $self->{'saved_metadata'}->{$mname} =
387 [$self->{'saved_metadata'}->{$mname}, $_];
388 }
389 } else {
390 # accumulate - add value into (currently empty) array
391 $self->{'saved_metadata'}->{$mname} = [$_];
392 }
393
394 }
395}
396
397# This Char function overrides the one in XML::Parser::Stream to overcome a
398# problem where $expat->{Text} is treated as the return value, slowing
399# things down significantly in some cases.
400sub Char {
401 use bytes; # Necessary to prevent encoding issues with XML::Parser 2.31+
402 $_[0]->{'Text'} .= $_[1];
403 return undef;
404}
405
4061;
Note: See TracBrowser for help on using the repository browser.