root/gsdl/trunk/perllib/plugins/OAIPlugin.pm @ 17197

Revision 17197, 13.1 KB (checked in by kjdon, 11 years ago)

previous metadata changes meant that there was no longer URL metadata (used to find the source doc). Now it uses dc.Identifier

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# OAIPlugin.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package OAIPlugin;
28
29use unicode;
30use util;
31
32use strict;
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35use ReadXMLFile;
36use ReadTextFile; # needed for subroutine textcat_get_language_encoding
37
# Establish inheritance at compile time: ReadXMLFile is searched first,
# then ReadTextFile.
BEGIN {
    @OAIPlugin::ISA = qw(ReadXMLFile ReadTextFile);
}
41
42
# Argument descriptions contributed by this plugin.  The only entry is
# process_exp, whose default comes from get_default_process_exp().
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
];

# Option record for this plugin; 'args' refers to the argument list above.
my $options = {
    'name'     => 'OAIPlugin',
    'desc'     => '{OAIPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
57
58
# Constructor.
#
# Parameters (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - handed unchanged to both parent constructors
#   $hashArgOptLists - hash ref; this plugin's argument descriptions are
#                      appended to its "ArgList" entry and its option record
#                      to its "OptList" entry
#
# Returns the object produced by ReadXMLFile's constructor, re-blessed into
# $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Arrow method calls are used instead of indirect-object syntax
    # ("new Class(...)"), whose parse depends on which names Perl has
    # already seen -- and this package itself defines a sub called "new".
    #
    # The ReadTextFile constructor's return value is discarded.
    # NOTE(review): presumably it is called for its side effects on the
    # shared argument lists; the meaning of the trailing 1 is defined by
    # ReadTextFile -- confirm there.
    ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
72
# Default value for the process_exp argument: a regular expression (as a
# string) matching names that end in ".oai", case-insensitively.
# Any arguments are ignored.
sub get_default_process_exp {
    return '(?i)(\.oai)$';
}
78
# Name of the XML document type this plugin deals with.
sub get_doctype {
    return 'OAI-PMH';
}
84
# Start-of-document handler: reset the per-document parse state.
#   in_metadata_node - flag, cleared here, set while inside <metadata>
#   rawxml           - accumulated copy of the document, emptied here
sub xml_start_document {
    my ($self) = @_;

    $self->{'in_metadata_node'} = 0;
    $self->{'rawxml'}           = q{};
}
90
# End-of-document handler: nothing needs doing for OAI records.
sub xml_end_document {
    return;
}
93
# Doctype handler: announces which file is being processed.
#
# The remaining handler arguments (expat object, doctype name, system id,
# public id, internal subset) are accepted but not used.
#
# Writes a progress line to the plugin's output handle when verbosity is
# above 1, and a <Processing> marker to STDERR when the 'gli' flag is set.
sub xml_doctype {
    my ($self) = @_;

    ##die "" if ($name !~ /^OAI-PMH$/);

    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OAIPlugin: processing $self->{'file'}\n";
    }

    if ($self->{'gli'}) {
        print STDERR "<Processing n='$self->{'file'}' p='OAIPlugin'>\n";
    }
}
106
107
# Start-tag handler.
#
# Rebuilds the opening tag as text and appends it to 'rawxml'.  A <metadata>
# element switches on 'in_metadata_node' and starts a fresh 'metadata_xml'
# buffer; while that flag is set the tag is appended to 'metadata_xml' too.
#
# Attributes are read from the global hash %_ (presumably filled in by the
# XML parsing layer before this handler runs -- confirm against ReadXMLFile).
# They are written as name=value without quotes, in hash order.
sub xml_start_tag {
    my ($self, $expat, $element) = @_;

    my %attr_hash = %_;
    my $attr      = join('', map { " $_=$attr_hash{$_}" } keys %attr_hash);
    my $open_tag  = "<$element$attr>";

    $self->{'rawxml'} .= $open_tag;

    if ($element eq "metadata") {
        $self->{'in_metadata_node'} = 1;
        $self->{'metadata_xml'}     = "";
    }

    $self->{'metadata_xml'} .= $open_tag if $self->{'in_metadata_node'};
}
128
# End-tag handler.
#
# Appends the closing tag to 'rawxml', and to 'metadata_xml' while inside a
# <metadata> element.  When the element being closed is <metadata> itself,
# the collected 'metadata_xml' is handed to extract_oai_metadata() together
# with the 'metadata' hash, and the in-metadata flag is cleared.
sub xml_end_tag {
    my ($self, $expat, $element) = @_;

    my $close_tag = "</$element>";

    $self->{'rawxml'} .= $close_tag;
    $self->{'metadata_xml'} .= $close_tag if $self->{'in_metadata_node'};

    if ($element eq "metadata") {
        $self->extract_oai_metadata(\$self->{'metadata_xml'}, $self->{'metadata'});
        $self->{'in_metadata_node'} = 0;
    }
}
149
# Text handler.  The text is read from $_ and appended to 'rawxml', and to
# 'metadata_xml' as well while inside a <metadata> element.
sub xml_text {
    my ($self, $expat) = @_;

    my $text = $_;

    $self->{'rawxml'} .= $text;
    $self->{'metadata_xml'} .= $text if $self->{'in_metadata_node'};
}
160
161
162
163
# Read one file handed to this plugin.
#
# Parameters:
#   ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor,
#    $maxdocs, $total_count, $gli)
#
# Returns:
#   0      for a directory whose name ends in "srcdocs" (blocked here; its
#          files are handled when an OAI record referring to them is found)
#   undef  when the parent class's read() returns false
#   the result of plugin::read() when one of the record's dc.Identifier
#          values names an existing file next to the OAI record -- the
#          collected metadata is passed on to that file
#   -1     when process() fails
#   1      when a new document was built from the OAI record itself
sub read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my $filename = $file;
    $filename = util::filename_cat($base_dir, $file) if $base_dir =~ /\w/;

    # block the srcdocs dir - we will process files in them when we find an OAI record for them
    return 0 if ((-d $filename) && ($filename =~ m/srcdocs$/));

    # Callers receive an explicit undef here, exactly as before.
    return undef unless $self->SUPER::read(@_);

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding($filename);

    my $filename_dir = util::filename_head($filename);

    # Look for the first identifier that is not an http/ftp URL and that
    # names an existing file relative to the record's directory.
    # Previously the position of the match was recorded but element 0 was
    # always the one used, so a record whose first identifier was e.g. a URL
    # passed the wrong value on.
    my $srcdoc;
    my $identifiers = $metadata->{'dc.Identifier'};
    if (defined $identifiers) {
        foreach my $identifier (@$identifiers) {
            next if $identifier =~ m/^(http|ftp):/;

            my $src_filename = util::filename_cat($filename_dir, $identifier);
            if (-e $src_filename) {
                $srcdoc = $identifier;
                last;
            }
        }
    }

    if (defined $srcdoc) {
        print $outhandle "OAIPlugin: passing metadata on to $srcdoc\n"
            if ($self->{'verbosity'}>1);

        # Make pretty print metadata table stick with src filename
        my $ppmd_table = $self->{'ppmd_table'};
        $metadata->{'prettymd'} = [ $ppmd_table ];
        $self->{'ppmd_table'} = undef;

        return plugin::read($pluginfo, $filename_dir, $srcdoc,
                            $block_hash, $metadata, $processor, $maxdocs,
                            $total_count, $gli);
    }

    # No source document found: create a new document from the record itself
    my $doc_obj = doc->new($filename, "indexed_doc");
    my $top_section = $doc_obj->get_top_section;
    my $plugin_type = $self->{'plugin_type'};

    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
    $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
    $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    # ($gli is deliberately not forwarded, matching the original call)
    my $textref = \$self->{'rawxml'};
    unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata($doc_obj);

    # add an OID
    $self->add_OID($doc_obj);

    my $ppmd_table = $self->{'ppmd_table'};
    $doc_obj->add_utf8_metadata($top_section, "prettymd", $ppmd_table);
    $self->{'ppmd_table'} = undef;

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'} ++;

    return 1; # processed the file
}
259
260
261# do plugin specific processing of doc_obj
# Add the record's text to the document object.
#
# Parameters:
#   $textref  - reference to the text; it is escaped IN PLACE
#   $pluginfo, $base_dir, $metadata - accepted but not used here
#   $file     - name used in progress messages
#   $doc_obj  - document receiving the text (top section)
#   $gli      - when true, a <Processing> marker is written to STDERR
#
# The characters < > [ ] are replaced by &lt; &gt; &#91; &#93; before the
# text is added.  Always returns 1.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    if ($gli) {
        print STDERR "<Processing n='$file' p='OAIPlugin'>\n";
    }
    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OAIPlugin: processing $file\n";
    }

    my $cursection = $doc_obj->get_top_section();

##    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection);

    # One pass is enough: none of the replacements contains a character
    # that is itself replaced.
    my %entity_for = (
        '<' => '&lt;',
        '>' => '&gt;',
        '[' => '&#91;',
        ']' => '&#93;',
    );
    $$textref =~ s/([<>\[\]])/$entity_for{$1}/g;

    # add text to document object
    $doc_obj->add_utf8_text($cursection, $$textref);

    return 1;
}
289
290
291# Improvement is to merge this with newer version in MetadataPass
292
# Start a new pretty-print metadata table: 'ppmd_table' is overwritten with
# the opening <table> tag.
sub open_prettyprint_metadata_table {
    my ($self) = @_;

    my @table_attributes = (
        "width=100% cellspacing=2",
        "style='border-bottom: 4px solid #000080'",
    );

    $self->{'ppmd_table'} = "\n<table " . join(' ', @table_attributes) . ">";
}
302
# Append one two-column row (metadata name, metadata value) to the
# pretty-print table held in 'ppmd_table'.  The value is first passed
# through util::hyperlink_text().
sub add_prettyprint_metadata_line {
    my ($self, $metaname, $metavalue_utf8) = @_;

    # Disabled host rewrite kept from the original:
###    $metavalue_utf8 =~ s/hdl\.handle\.net/mcgonagall.cs.waikato.ac.nz:8080\/dspace\/handle/;
    my $linked_value = util::hyperlink_text($metavalue_utf8);

    $self->{'ppmd_table'} .= join('',
        "  <tr bgcolor=#b5d3cd>\n",
        "    <td width=30%>\n",
        "      $metaname\n",
        "    </td>\n",
        "    <td>\n",
        "      $linked_value\n",
        "    </td>\n",
        "  </tr>\n",
    );
}
321
# Finish the pretty-print metadata table by appending the closing tag to
# 'ppmd_table'.
sub close_prettyprint_metadata_table {
    my ($self) = @_;

    $self->{'ppmd_table'} .= "</table>\n";
}
328
329
# Map a "dcterms.<name>" metadata name onto a target element name.
#
# $metaname is split at its first dot into prefix and name.  When the
# prefix is "dcterms" and the name has an entry in the table below, the
# result is "<mapped element>^<name>"; in every other case $metaname is
# returned unchanged.
#
# NOTE(review): the table keys start with a lower-case letter, while
# extract_oai_metadata() in this file upper-cases the first letter after the
# dot before calling this method -- confirm whether lookups from that caller
# are ever expected to match.
sub remap_dcterms_metadata {
    my ($self, $metaname) = @_;

    my %element_for = (
        "alternative"           => "dc.title",
        "tableOfContents"       => "dc.description",
        "abstract"              => "dc.description",
        "created"               => "dc.date",
        "valid"                 => "dc.date",
        "available"             => "dc.date",
        "issued"                => "dc.date",
        "modified"              => "dc.date",
        "dateAccepted"          => "dc.date",
        "dateCopyrighted"       => "dc.date",
        "dateSubmitted"         => "dc.date",
        "extent"                => "dc.format",
        "medium"                => "dc.format",
        "isVersionOf"           => "dc.relation",
        "hasVersion"            => "dc.relation",
        "isReplacedBy"          => "dc.relation",
        "replaces"              => "dc.relation",
        "isRequiredBy"          => "dc.relation",
        "requires"              => "dc.relation",
        "isPartOf"              => "dc.relation",
        "hasPart"               => "dc.relation",
        "isReferencedBy"        => "dc.relation",
        "references"            => "dc.relation",
        "isFormatOf"            => "dc.relation",
        "hasFormat"             => "dc.relation",
        "conformsTo"            => "dc.relation",
        "spatial"               => "dc.coverage",
        "temporal"              => "dc.coverage",
        "audience"              => "dc.any",
        "accrualMethod"         => "dc.any",
        "accrualPeriodicity"    => "dc.any",
        "accrualPolicy"         => "dc.any",
        "instructionalMethod"   => "dc.any",
        "provenance"            => "dc.any",
        "rightsHolder"          => "dc.any",
        "mediator"              => "audience",
        "educationLevel"        => "audience",
        "accessRights"          => "dc.rights",
        "license"               => "dc.rights",
        "bibliographicCitation" => "dc.identifier",
    );

    my ($prefix, $name) = ($metaname =~ m/^(.*?)\.(.*?)$/);

    # Anything that is not a recognised dcterms name is passed through.
    return $metaname unless defined $prefix && $prefix eq "dcterms";
    return $metaname unless defined $element_for{$name};

    return $element_for{$name} . "^" . $name;
}
391
392
# Pull metadata values out of the text of a <metadata> element.
#
# Parameters:
#   $textref  - reference to the XML text collected for the element
#   $metadata - hash ref; each value found is appended to the array stored
#               under its metadata name
#
# Each child element <prefix:tag>value</prefix:tag> of the wrapper element
# becomes metadata named "prefix.Tag" (first letter after the dot is
# upper-cased); a child without a prefix uses the wrapper's prefix.  The
# name is then passed through remap_dcterms_metadata(), square brackets in
# the value are turned into character references, and a row is added to the
# pretty-print table, which this method opens and closes.
#
# Only handles DC metadata; a warning is printed for other wrappers.
sub extract_oai_metadata {
    my ($self, $textref, $metadata) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->open_prettyprint_metadata_table();

    if ($$textref =~ m/<metadata\s*>(.*?)<\/metadata\s*>/s) {
        my $record_xml = $1;

        # locate and remove outermost tag (ignoring any attribute information in top-level tag)
        my ($wrapper_tag, $remaining_xml) = ($record_xml =~ m/<([^ >]+).*?>(.*?)<\/\1>/s);

        # split tag into namespace and tag name
        my ($namespace, $top_level_prefix) = ($wrapper_tag =~ m/^(.*?):(.*?)$/);

        # sometimes the dc namespace is not given as a prefix on each element
        # (like <dc:title>) but only by the wrapper element itself, as in
        # <dc><title></title><creator></creator></dc>.
        # In that case the wrapper element name serves as the prefix.
        if (!defined $top_level_prefix && defined $wrapper_tag && $wrapper_tag =~ m/dc$/) {
            $top_level_prefix = $wrapper_tag;
        }

        if ($top_level_prefix !~ m/dc$/) {
            print $outhandle "Warning: OAIPlugin currently only designed for Dublin Core (or variant) metadata\n";
            print $outhandle "         This recorded metadata section '$top_level_prefix' does not appear to match.\n";
            print $outhandle "         Metadata assumed to be in form: <prefix:tag>value</prefix:tag> and will be converted\n";
            print $outhandle "         into Greenstone metadata as prefix.tag = value\n";
        }

        # Consume one child element per iteration; whatever follows it
        # becomes the text examined on the next pass.
        while ($remaining_xml =~ m/<([^ >]+).*?>(.*?)<\/\1>(.*)/s) {
            my ($metaname, $metavalue) = ($1, $2);
            $remaining_xml = $3;

            # prefix:tag -> prefix.tag, borrowing the wrapper's prefix when
            # the element has none of its own
            $metaname =~ s/:/\./;
            $metaname = "$top_level_prefix.$metaname" unless $metaname =~ m/\./;
            $metaname =~ s/\.(.)/\.\u$1/;

            $metaname = $self->remap_dcterms_metadata($metaname);

            $metavalue =~ s/\[/&#91;/g;
            $metavalue =~ s/\]/&#93;/g;

            # creates the array on first use, appends afterwards
            push(@{ $metadata->{$metaname} }, $metavalue);

            $self->add_prettyprint_metadata_line($metaname, $metavalue);
        }
    }

    $self->close_prettyprint_metadata_table();
}
477
478## we know from the file extension, so doesn't need to check the doctype
# Always accept the document: per the original note, the file extension
# already identifies the type, so the doctype is not inspected.
sub check_doctype {
    return 1;
}
483
4841;
Note: See TracBrowser for help on using the browser.