root/main/trunk/greenstone2/perllib/plugins/DSpacePlugin.pm @ 24971

Revision 24971, 13.3 KB (checked in by ak19, 8 years ago)

1. Introduced the util::filepath_to_url_format subroutine which will be used to convert filenames to URL style filenames to match the slashes used in the filename regex-es in extrameta keys used to index into extrameta data structures. 2. Fixed bug on windows where metadata.xml specifies filenames as regex with backslash in front of the file extension's period mark: DirectoryPlugin? needed to unregex the filepath before calling fileparse on it, else the escaping backslash would interfere with perl's fileparse routine (only on windows, since backslash also represents a dirsep here). 3. Updated all those perl plugins where the new util::filepath_to_url_format needs to be called so that they use URL style filenames (thereafter regexed) to index into the extrameta data structures.

  • Property svn:keywords set to Author Date Id Revision
Line 
1
2###########################################################################
3#
4# DSpacePlugin.pm -- plugin for importing a collection from DSpace
5#
6# A component of the Greenstone digital library software
7# from the New Zealand Digital Library Project at the
8# University of Waikato, New Zealand.
9#
10# Copyright (C) 2004 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful,
18# but WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
20# GNU General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28
29# This plugin takes "contents" and dublin_core.xml file, which contain
30# Metadata and lists of associated files for a particular document
31# and produces a document containing sections, one for each page.
32# The files should be named "contents" and "dublin_core.xml".  For each of
33# document in DSpace, it is stored in one directory
34#
35# The format of the "contents" file is as follows:
36#
37# File.type      bundle:ORIGINAL
38# license.txt    bundle:LICENSE
39# The format of the "dublin_core.xml" file is as follows:
40# The first line contains any metadata for the whole document
41# <dublin_core>
42# eg.
43# <dcvalue element="Title" qualifier="">Snail farming</dcvalue>
44# <dcvalue element="date" qualifier="">2004-10-15</dcvalue>
45#
46
47package DSpacePlugin;
48
49use extrametautil;
50use ReadTextFile;
51use plugin;
52use util;
53use XMLParser;
54use strict;
55no strict 'refs'; # allow filehandles to be variables and viceversa
56
57sub BEGIN {
58    @DSpacePlugin::ISA = ('ReadTextFile');
59}
60
61my $arguments =
62    [ { 'name' => "process_exp",
63    'desc' => "{BasePlugin.process_exp}",
64    'type' => "string",
65    'deft' => &get_default_process_exp(),
66    'reqd' => "no" },
67      { 'name' => "only_first_doc",
68    'desc' => "{DSpacePlugin.only_first_doc}",
69    'type' => "flag",
70    'reqd' => "no" },
71      { 'name' => "first_inorder_ext",
72    'desc' => "{DSpacePlugin.first_inorder_ext}",
73    'type' => "string",
74    'reqd' => "no" },
75      { 'name' => "first_inorder_mime",
76    'desc' => "{DSpacePlugin.first_inorder_mime}",
77    'type' => "flag",
78    'reqd' => "no" },
79      { 'name' => "block_exp",
80    'desc' => "{BasePlugin.block_exp}",
81    'type' => "regexp",
82    'deft' => &get_default_block_exp(),
83    'reqd' => "no" }];
84
85
86my $options = { 'name'     => "DSpacePlugin",
87        'desc'     => "{DSpacePlugin.desc}",
88        'inherits' => "yes",
89        'abstract' => "no",
90        'args'     => $arguments };
91
92
93my $primary_doc_lookup = { 'text/html' => '(?i)\.(gif|jpe?g|jpe|jpg|png|css)$' };
94
95# Important variation to regular plugin structure.  Need to declare
96# $self as global variable to file so XMLParser callback routines
97# can access the content of the object.
98my ($self);
99
100sub new {
101    my ($class) = shift (@_);
102    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
103    push(@$pluginlist, $class);
104
105    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
106    push(@{$hashArgOptLists->{"OptList"}},$options);
107
108    $self = new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists);
109   
110    if ($self->{'info_only'}) {
111    # don't worry about creating the XML parser as all we want is the
112    # list of plugin options
113    return bless $self, $class;
114    }
115
116    #create XML::Parser object for parsing dublin_core.xml files
117    my $parser = new XML::Parser('Style' => 'Stream',
118                 'Handlers' => {'Char' => \&Char,
119                        'Doctype' => \&Doctype
120                        });
121    $self->{'parser'} = $parser;
122    $self->{'extra_blocks'} = {};
123
124    return bless $self, $class;
125}
126
127sub get_default_process_exp {
128    my $self = shift (@_);
129
130    return q^(?i)(contents)$^;
131}
132
133# want to block all files except the "contents"
134sub get_default_block_exp {
135    my $self = shift (@_);
136   
137    # Block handle and txt files if present. Specifically don't block dublin_core xml
138    return q^(?i)(handle|\.tx?t)$^;
139}
140
141sub store_block_files_BACKUP
142{
143    # Option of making blocking sensitive to files that are in directory
144    # This subroutine is not currently used! (relies on default block expression stopping all handle and .txt files)
145
146    my $self =shift (@_);
147    my ($filename_full_path, $block_hash) = @_;
148
149    my ($tailname, $contents_basedir, $suffix) = &File::Basename::fileparse($filename_full_path, "\\.[^\\.]+\$");
150    my $handle_filename = &util::filename_cat($contents_basedir,"handle");
151
152    if (&util::fd_exists($handle_filename)) {
153    &util::block_filename($block_hash,$handle_filename);
154    }
155}
156
157sub read_content
158{
159    my $self = shift (@_);
160    my ($dir, $only_first_doc, $first_inorder_ext, $first_inorder_mime, $mimetype_list) = @_;
161    my $outhandle = $self->{'outhandle'};
162
163    my @fnamemime_list = ();
164    my @assocmime_list = ();
165
166    my $content_fname = &util::filename_cat($dir,"contents");
167
168    open(CIN,"<$content_fname")
169    || die "Unable to open $content_fname: $!\n";
170   
171    my $line;
172    my $pos = 0;
173
174    while (defined ($line = <CIN>)) {
175    if ($line =~ m/^(.*)\s+bundle:ORIGINAL\s*$/) {
176        my $fname = $1;
177        my $mtype = $mimetype_list->[$pos];
178        my $fm_rec = { 'file' => $fname, 'mimetype' => $mtype};
179        push(@fnamemime_list,$fm_rec);
180        $pos++;
181    }
182    }
183    close CIN;
184
185    if ($only_first_doc){
186    my ($first_fname, @rest_fnames) = @fnamemime_list;
187    @fnamemime_list = ($first_fname);
188    @assocmime_list = @rest_fnames;
189    }
190
191    # allow user to specify the types of files (inorder)they would like to assign as
192    # a primary bitstream
193    if ($first_inorder_ext) {
194    # parse user-define file extension names
195    my @extfiles_list = split /,/, $first_inorder_ext;
196    my (@rest_fnames) = @fnamemime_list;
197    my @matched_list = ();
198    foreach my $file_ext (@extfiles_list) {
199        $pos = 0;
200        foreach my $allfiles (@fnamemime_list){
201        $allfiles->{'file'} =~ /^(.*)\.(.*?)$/;
202        my $allfiles_ext = $2;
203
204        if ($allfiles_ext =~ /$file_ext/) {
205            print $outhandle "Existing file:$allfiles->{'file'} match the user-define File Extension:$file_ext\n";
206            push (@matched_list, $allfiles);
207
208            # delete the matched extension file from the array
209            splice(@rest_fnames,$pos,1);
210
211            return (\@matched_list, \@rest_fnames);
212
213        }
214        $pos++;
215        }
216    }
217    }
218   
219    if ($first_inorder_mime) {
220    # parse user-define file mimetype
221    my @file_mime_list = split /,/, $first_inorder_mime;
222    my (@rest_fnames) = @fnamemime_list;
223    my @matched_list = ();
224    foreach my $file_mime (@file_mime_list) {
225        $pos = 0;
226        foreach my $allfiles (@fnamemime_list){
227        my $allfiles_mime = $allfiles->{'mimetype'};
228
229        if ($allfiles_mime =~ /$file_mime/) {
230            print $outhandle "Existing file:$allfiles->{'file'} match the user-defined File MimeType:$file_mime\n";
231            push (@matched_list, $allfiles);
232
233            # delete the matched MIMEType file from the array
234            splice(@rest_fnames,$pos,1);
235            return (\@matched_list, \@rest_fnames);
236        }
237        $pos++;
238        }
239    }
240    }
241    return (\@fnamemime_list, \@assocmime_list);
242}
243
244
245sub filemime_list_to_re
246{
247    my $self = shift (@_);
248    my ($fnamemime_list) = @_;
249
250    my @fname_list = map { "(".$_->{'file'}.")" } @$fnamemime_list;
251    my $fname_re = join("|",@fname_list);
252   
253    # Indexing into the extrameta data structures requires the filename's style of slashes to be in URL format
254    # Then need to convert the filename to a regex, no longer to protect windows directory chars \, but for
255    # protecting special characters like brackets in the filepath such as "C:\Program Files (x86)\Greenstone".
256    $fname_re = &util::filepath_to_url_format($fname_re); # just in case there are slashes in there
257   
258    $fname_re =~ s/\./\\\./g;
259
260    return $fname_re;
261}
262
263# Read dublin_core metadata from DSpace collection
264sub metadata_read {
265    my $self = shift (@_);
266    my ($pluginfo, $base_dir, $file, $block_hash,
267    $extrametakeys, $extrametadata, $extrametafile,
268    $processor, $gli, $aux) = @_;
269
270    my $only_first_doc = $self->{'only_first_doc'};
271    my $first_inorder_ext = $self->{'first_inorder_ext'};
272    my $first_inorder_mime = $self->{'first_inorder_mime'};
273   
274    my $outhandle = $self->{'outhandle'};
275   
276    my $filename = &util::filename_cat($base_dir, $file);
277    # return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
278   
279    if ($filename !~ /dublin_core\.xml$/ || !-f $filename) {
280    return undef;
281    }
282   
283    print $outhandle "DSpacePlugin: extracting metadata from $file\n"
284    if $self->{'verbosity'} > 1;
285   
286    my ($dir) = $filename =~ /^(.*?)[^\/\\]*$/;
287
288    eval {
289    $self->{'parser'}->parsefile($filename);
290    };
291   
292    if ($@) {
293    die "DSpacePlugin: ERROR $filename is not a well formed dublin_core.xml file ($@)\n";
294    }
295
296    my $mimetype_list = $self->{'saved_metadata'}->{'ex.dc.Format^mimetype'};
297    my ($doc_file_mimes, $assoc_file_mimes) = $self->read_content($dir, $only_first_doc, $first_inorder_ext,
298                                  $first_inorder_mime, $mimetype_list);
299
300    my $file_re = $self->filemime_list_to_re($doc_file_mimes);
301
302    if ($only_first_doc || $first_inorder_ext || $first_inorder_mime) {
303    foreach my $afm ( @$assoc_file_mimes ) {
304        my $full_af = &util::filename_cat($dir,$afm->{'file'});
305        $self->{'extra_blocks'}->{$full_af} = 1;
306    }
307    }
308    &extrametautil::addmetakey($extrametakeys, $file_re);
309
310    # See Format's Extent section in http://dublincore.org/documents/usageguide/qualifiers.shtml
311    # it could specify duration, size or even dimensions of the resource. It may be a useful piece
312    # of metadata to preserve after all.
313    #if (defined $self->{'saved_metadata'}->{'ex.dc.Format^extent'}) {
314    #delete $self->{'saved_metadata'}->{'ex.dc.Format^extent'};
315    #}
316
317    if (defined $mimetype_list) {
318    delete $self->{'saved_metadata'}->{'ex.dc.Format^mimetype'};
319
320    # Temporarily store associate file info in metadata table
321    # This will be removed in 'extra_metadata' in BasePlugin and used
322    # to perform the actual file association (once the doc obj has
323    # been formed
324
325    my $main_doc = $doc_file_mimes->[0];
326    my $md_mimetype = $main_doc->{'mimetype'};
327
328    my $pd_lookup = $primary_doc_lookup->{$md_mimetype};
329
330    if (defined $pd_lookup) {
331        my $filter_re = $pd_lookup;
332        @$assoc_file_mimes = grep { $_->{'file'} !~ m/$filter_re/ }  @$assoc_file_mimes;
333    }
334
335    my @gsdlassocfile_tobe
336        = map { &util::filename_cat($dir,$_->{'file'}) .":".$_->{'mimetype'}.":" } @$assoc_file_mimes if @$assoc_file_mimes;
337    $self->{'saved_metadata'}->{'gsdlassocfile_tobe'} = \@gsdlassocfile_tobe;
338
339    }
340   
341    &extrametautil::setmetadata($extrametadata, $file_re, $self->{'saved_metadata'});
342
343    return 1;
344}
345
346
347# The DSpacePlugin read() function. We are not actually reading any documents
348# here, just blocking ones that have been processed by metadata read.
349#
350# Returns 0 for a file its blocking, undef for any other
351sub read {
352    my $self = shift (@_);
353    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
354    my $outhandle = $self->{'outhandle'};
355   
356    # Block all files except contents
357    my $filename = &util::filename_cat($base_dir, $file);
358    return 0 if $self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/;
359
360    my $assocfile = $metadata->{'assocfile'};
361
362    return 0 if (($filename =~ /dublin_core\.xml$/) || ($filename =~ /contents$/));
363    return 0 if (defined $self->{'extra_blocks'}->{$filename});
364    return undef;
365}
366
367sub Doctype {
368    my ($expat, $name, $sysid, $pubid, $internal) = @_;
369
370    die if ($name !~ /^dublin_core$/);
371}
372
373sub StartTag {
374    my ($expat, $element) = @_;
375    if ($element eq "dublin_core") {
376    $self->{'saved_metadata'} = {};
377    } elsif ($element eq "dcvalue") {
378    my $metaname = $_{'element'};
379    my $qualifier = $_{'qualifier'}||"";
380    if ($metaname ne "description" || $qualifier ne "provenance") {
381        $metaname .= "^$qualifier" if ($qualifier ne "none" && $qualifier ne "");
382        $self->{'metaname'} = "ex.dc.\u$metaname";
383    }
384    }
385}
386
387sub EndTag {
388    my ($expat, $element) = @_;
389   
390    if ($element eq "dcvalue") {
391    $self->{'metaname'} = "";
392    }
393}
394
395sub Text {
396    if (defined ($self->{'metaname'}) && $self->{'metaname'} ne "") {
397    # $_ == Metadata content
398    my $mname = $self->{'metaname'};
399    my $mvalue = prepareMetadataValue($_);
400    if (defined $self->{'saved_metadata'}->{$mname}) {
401        # accumulate - add value to existing value(s)
402        if (ref ($self->{'saved_metadata'}->{$mname}) eq "ARRAY") {
403        push (@{$self->{'saved_metadata'}->{$mname}}, $mvalue);
404        } else {
405        $self->{'saved_metadata'}->{$mname} =
406            [$self->{'saved_metadata'}->{$mname}, $mvalue];
407        }
408    } else {
409        # accumulate - add value into (currently empty) array
410        $self->{'saved_metadata'}->{$mname} = [$mvalue];
411    }
412
413    }
414}
415
416# Prepare DSpace metadata for using with Greenstone.
417# Some value must be escaped.
418sub prepareMetadataValue {
419    my ($value) = @_;
420   
421    $value =~ s/\[/&#091;/g;
422    $value =~ s/\]/&#093;/g;
423   
424    return $value;
425 }
426# This Char function overrides the one in XML::Parser::Stream to overcome a
427# problem where $expat->{Text} is treated as the return value, slowing
428# things down significantly in some cases.
429sub Char {
430    use bytes;  # Necessary to prevent encoding issues with XML::Parser 2.31+
431    $_[0]->{'Text'} .= $_[1];
432    return undef;
433}
434
4351;
Note: See TracBrowser for help on using the browser.