root/gsdl/trunk/perllib/plugins/OAIPlugin.pm @ 17591

Revision 17591, 14.4 KB (checked in by kjdon, 11 years ago)

changed a comment

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# OAIPlugin.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package OAIPlugin;
28
29use unicode;
30use util;
31
32use strict;
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35use ReadXMLFile;
36use ReadTextFile; # needed for subroutine textcat_get_language_encoding
37use metadatautil;
38
# Establish the inheritance chain at compile time: OAIPlugin derives from
# ReadXMLFile first and ReadTextFile second.
BEGIN {
    @OAIPlugin::ISA = qw(ReadXMLFile ReadTextFile);
}
42
43
# Plugin-specific argument descriptions.  new() appends this list to the
# shared "ArgList".  The brace-wrapped 'desc' values look like lookup keys
# for description strings held elsewhere -- TODO confirm.
my $default_process_exp = get_default_process_exp();

my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => $default_process_exp,
    },
    {
        # Name of the metadata field that metadata_read() inspects for
        # references to an accompanying source document.
        'name' => "document_field",
        'desc' => "{OAIPlugin.document_field}",
        'type' => "metadata",
        'reqd' => "no",
        'deft' => "gi.Sourcedoc",
    },
];

# Plugin-level options record.  new() appends it to the shared "OptList".
my $options = {
    'name'     => "OAIPlugin",
    'desc'     => "{OAIPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'explodes' => "yes",
    'args'     => $arguments,
};
63
64
# Constructor.
#
# Parameters (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to the parent constructors
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to
#                      its "ArgList" entry and its options record to "OptList"
#
# Returns the object built by ReadXMLFile, re-blessed into $class.
sub new {
    my ($class) = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit Class->new(...) calls are used instead of the indirect-object
    # form "new Class(...)": the latter is parsed heuristically and is
    # fragile inside a package that defines its own new().
    #
    # The ReadTextFile result is deliberately discarded.  The meaning of the
    # trailing 1 is defined by ReadTextFile -- presumably it marks this as an
    # auxiliary construction; TODO confirm.
    ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
78
# Returns the default 'process_exp' pattern (as a string): it matches names
# ending in ".oai", case-insensitively.  Any arguments are ignored.
sub get_default_process_exp {
    return '(?i)(\.oai)$';
}
84
# Returns the document type name associated with this plugin: "OAI-PMH".
sub get_doctype {
    my ($self) = @_;

    return 'OAI-PMH';
}
90
# Resets the per-document parsing state held on $self: the raw XML buffer,
# the "inside <metadata>" flag and the hash of extracted metadata.
sub xml_start_document {
    my ($self) = @_;

    @{$self}{'rawxml', 'in_metadata_node'} = ("", 0);
    $self->{'saved_metadata'} = {};
}
97
# End-of-document hook.  Intentionally does nothing: metadata extraction is
# triggered from xml_end_tag() when a </metadata> tag is seen.
sub xml_end_document {
    return;
}
100
# Doctype hook.  The doctype itself is not validated (the check below is
# disabled); the hook only reports progress:
#   - to $self->{'outhandle'} when verbosity is above 1
#   - to STDERR, as a GLI <Processing> tag, when $self->{'gli'} is set
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    ##die "" if ($name !~ /^OAI-PMH$/);

    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OAIPlugin: processing $self->{'file'}\n";
    }

    if ($self->{'gli'}) {
        print STDERR "<Processing n='$self->{'file'}' p='OAIPlugin'>\n";
    }
}
113
114
# Start-tag hook: re-serialises the opening tag and appends it to the raw XML
# buffer and, while inside a <metadata> element, to the metadata buffer.
#
# Parameters:
#   $expat   - parser object (unused here)
#   $element - name of the element being opened
#
# The element's attributes are read from the global %_ hash -- presumably
# filled in by the XML parser before this hook runs; TODO confirm against
# ReadXMLFile.
#
# Attribute values are written double-quoted with &, < and " escaped, and
# attributes are emitted in sorted name order, so the rebuilt tag is
# well-formed and identical from run to run.
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my %attr_hash = %_;

    my $attr = "";
    foreach my $attr_name (sort keys %attr_hash) {
        my $attr_value = $attr_hash{$attr_name};
        $attr_value = "" unless defined $attr_value;

        # '&' must be escaped first so the entities added below survive.
        $attr_value =~ s/&/&amp;/g;
        $attr_value =~ s/</&lt;/g;
        $attr_value =~ s/"/&quot;/g;

        $attr .= " $attr_name=\"$attr_value\"";
    }

    my $start_tag = "<$element$attr>";

    $self->{'rawxml'} .= $start_tag;

    # Entering <metadata> starts a fresh metadata buffer.
    if ($element eq "metadata") {
        $self->{'in_metadata_node'} = 1;
        $self->{'metadata_xml'} = "";
    }

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $start_tag;
    }
}
135
# End-tag hook: appends the closing tag to the raw XML buffer and, while
# inside a <metadata> element, to the metadata buffer.  When the element
# being closed is <metadata> itself, the collected metadata XML is handed to
# extract_oai_metadata(), which stores its results in
# $self->{'saved_metadata'}, and the "inside <metadata>" flag is cleared.
sub xml_end_tag {
    my ($self, $expat, $element) = @_;

    my $closing_tag = "</$element>";
    $self->{'rawxml'} .= $closing_tag;
    $self->{'metadata_xml'} .= $closing_tag if $self->{'in_metadata_node'};

    if ($element eq "metadata") {
        $self->extract_oai_metadata(\$self->{'metadata_xml'}, $self->{'saved_metadata'});

        $self->{'in_metadata_node'} = 0;
    }
}
157
# Text hook: appends the current text chunk to the raw XML buffer and, while
# inside a <metadata> element, to the metadata buffer.  The chunk is read
# from the global $_ -- presumably set by the XML parser before this hook
# runs; TODO confirm against ReadXMLFile.
sub xml_text {
    my ($self, $expat) = @_;

    my $chunk = $_;
    $self->{'rawxml'} .= $chunk;

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $chunk;
    }
}
168
169
# Metadata pass over one candidate file.
#
# Parses the file, collects the metadata gathered by the XML hooks (plus the
# pretty-printed table under the 'prettymd' key) and hands it back to the
# caller through $extrametadata / $extrametakeys, keyed by a filename regex.
# If one of the values of the configured 'document_field' names a file that
# exists next to the OAI file, the metadata is keyed on that source document
# instead; otherwise the raw XML is kept so that read() can later build a
# document from it.
#
# Returns undef when the file is not ours or fails to parse, 1 otherwise.
sub metadata_read {
    my $self = shift(@_);

    my ($pluginfo, $base_dir, $file, $block_hash, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file($filename_full_path);

    # Whatever the outcome of the parse, take ownership of the collected
    # metadata and clear the slot on $self.
    my $parsed_ok = $self->parse_file($filename_full_path, $file, $gli);
    my $new_metadata = $self->{'saved_metadata'};
    $self->{'saved_metadata'} = undef;
    return undef unless $parsed_ok;

    # add the pretty metadata table as metadata
    $new_metadata->{'prettymd'} = $self->{'ppmd_table'};
    $self->{'ppmd_table'} = undef;

    my $url_array = $new_metadata->{ $self->{'document_field'} };
    my @candidate_urls = (defined $url_array) ? @$url_array : ();
    ##print STDERR scalar(@candidate_urls) . " urls for $file\n";

    my $filename_dir = &util::filename_head($filename_full_path);

    # filenames in extrametadata must be relative to current dir, as
    # DirectoryPlugin adds path info on itself
    my ($filename_for_metadata) = $file =~ /([^\\\/]+)$/; # this assumes there will only be one record per oai file - is this always the case??

    # The first non-remote reference that exists on disk wins.
    my $srcdoc_exists = 0;
    foreach my $url (@candidate_urls) {
        next if $url =~ m/^(https?|ftp):/;

        my $src_filename = &util::filename_cat($filename_dir, $url);
        next unless -e $src_filename;

        $srcdoc_exists = 1;
        $filename_for_metadata = $url;
        last;
    }

    my $file_record = $self->{'oai-files'}->{$file} ||= {};
    if ($srcdoc_exists) {
        $file_record->{'srcdoc_exists'} = 1;
    }
    else {
        # save the rawxml for the source document
        $file_record->{'srcdoc_exists'} = 0;
        $file_record->{'rawxml'} = $self->{'rawxml'};
        $self->{'rawxml'} = "";
    }

    # return all the metadata we have extracted to the caller.
    # Directory plug will pass it back in at read time, so we don't need to extract it again.
    # extrametadata keys should be regular expressions
    $filename_for_metadata = &util::filename_to_regex($filename_for_metadata);
    $extrametadata->{$filename_for_metadata} = $new_metadata;
    push(@$extrametakeys, $filename_for_metadata);

    return 1;
}
238
239
# Import pass over one file previously seen by metadata_read().
#
# Parameters (after $self):
#   $pluginfo, $base_dir, $file, $block_hash, $metadata, $processor,
#   $maxdocs, $total_count, $gli
#
# Returns:
#   undef - metadata_read() recorded nothing for $file
#   0     - the metadata was attached to an existing source document, so no
#           document is created here
#   -1    - process() failed
#   1     - a document was built from the saved raw XML and passed to
#           $processor
#
# The bookkeeping entry for $file is removed once it has been consumed.
sub read {
    my $self = shift(@_);

    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    if (!defined $self->{'oai-files'}->{$file}) {
        return undef;
    }

    my $srcdoc_exists = $self->{'oai-files'}->{$file}->{'srcdoc_exists'};
    if ($srcdoc_exists) {
        # do nothing more - all the metadata has been extracted and associated with the srcdoc
        # no more need to access details of this $file => tidy up as you go
        delete $self->{'oai-files'}->{$file};
        return 0; # not processed here, but don't pass on to rest of plugins
    }

    my $filename = $file;
    $filename = util::filename_cat($base_dir, $file) if $base_dir =~ /\w/;

    # Do encoding stuff on metadata
    my ($language, $encoding) = $self->textcat_get_language_encoding($filename);

    # create a new document
    my $doc_obj = doc->new($filename, "indexed_doc");
    my $top_section = $doc_obj->get_top_section;
    my $plugin_type = $self->{'plugin_type'};

    my ($filemeta) = $file =~ /([^\\\/]+)$/;
    $self->set_Source_metadata($doc_obj, $filemeta, $encoding);
    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
    $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
    $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    # this will include all the metadata from the oai file that we extracted
    # during metadata_read
    $self->extra_metadata($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    my $text = $self->{'oai-files'}->{$file}->{'rawxml'};
    delete $self->{'oai-files'}->{$file};

    # $gli is forwarded so that process() can emit its GLI progress message;
    # without it that message could never be printed.
    unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata($doc_obj);

    # add an OID
    $self->add_OID($doc_obj);

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'}++;

    return 1; # processed the file
}
304
305
# Plugin-specific processing of doc_obj: escapes the raw XML held in
# $$textref and adds it as the text of the document's top section.
#
# The characters <, >, [ and ] are replaced by &lt;, &gt;, &#91; and &#93;.
# The substitution is done in place, so the caller's string is modified.
# Progress is reported to STDERR when $gli is set and to
# $self->{'outhandle'} when verbosity is above 1.  Always returns 1.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    if ($gli) {
        print STDERR "<Processing n='$file' p='OAIPlugin'>\n";
    }
    if ($self->{'verbosity'} > 1) {
        print $outhandle "OAIPlugin: processing $file\n";
    }

    my $cursection = $doc_obj->get_top_section();

##    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection);

    # add text to document object

    # One pass is enough: none of the replacement entities contains a
    # character that is itself escaped.
    my %entity_for = (
        '<' => '&lt;',
        '>' => '&gt;',
        '[' => '&#91;',
        ']' => '&#93;',
    );
    $$textref =~ s/([<>\[\]])/$entity_for{$1}/g;

    $doc_obj->add_utf8_text($cursection, $$textref);

    return 1;
}
332
333
334# Improvement is to merge this with newer version in MetadataPass
335
# Starts a new pretty-printed metadata table: overwrites
# $self->{'ppmd_table'} with the opening <table> tag.
sub open_prettyprint_metadata_table
{
    my ($self) = @_;

    my @table_attributes = (
        "width=100% cellspacing=2",
        "style='border-bottom: 4px solid #000080'",
    );

    $self->{'ppmd_table'} = "\n<table " . join(" ", @table_attributes) . ">";
}
345
# Appends one two-cell row (metadata name, metadata value) to the
# pretty-printed table in $self->{'ppmd_table'}.  The value is first passed
# through util::hyperlink_text().
sub add_prettyprint_metadata_line
{
    my ($self, $metaname, $metavalue_utf8) = @_;

    $metavalue_utf8 = util::hyperlink_text($metavalue_utf8);

    my @row_lines = (
        "  <tr bgcolor=#b5d3cd>\n",
        "    <td width=30%>\n",
        "      $metaname\n",
        "    </td>\n",
        "    <td>\n",
        "      $metavalue_utf8\n",
        "    </td>\n",
        "  </tr>\n",
    );

    $self->{'ppmd_table'} .= join("", @row_lines);
}
363
# Finishes the pretty-printed metadata table by appending the closing
# </table> tag to $self->{'ppmd_table'}.
sub close_prettyprint_metadata_table
{
    my ($self) = @_;

    my $closing_markup = "</table>\n";
    $self->{'ppmd_table'} .= $closing_markup;
}
370
371
# Maps a "dcterms.<name>" metadata name onto "<target>^<name>", where
# <target> comes from the table below (mostly the Dublin Core element the
# term refines).  Any other name -- a different prefix, an unknown term, or
# no dot at all -- is returned unchanged.
#
# The lookup is case-sensitive and every key starts with a lower-case
# letter.
sub remap_dcterms_metadata
{
    my ($self, $metaname) = @_;

    my %target_for = (
        # titles and descriptions
        "alternative"           => "dc.title",
        "tableOfContents"       => "dc.description",
        "abstract"              => "dc.description",
        # dates
        "created"               => "dc.date",
        "valid"                 => "dc.date",
        "available"             => "dc.date",
        "issued"                => "dc.date",
        "modified"              => "dc.date",
        "dateAccepted"          => "dc.date",
        "dateCopyrighted"       => "dc.date",
        "dateSubmitted"         => "dc.date",
        # formats
        "extent"                => "dc.format",
        "medium"                => "dc.format",
        # relations
        "isVersionOf"           => "dc.relation",
        "hasVersion"            => "dc.relation",
        "isReplacedBy"          => "dc.relation",
        "replaces"              => "dc.relation",
        "isRequiredBy"          => "dc.relation",
        "requires"              => "dc.relation",
        "isPartOf"              => "dc.relation",
        "hasPart"               => "dc.relation",
        "isReferencedBy"        => "dc.relation",
        "references"            => "dc.relation",
        "isFormatOf"            => "dc.relation",
        "hasFormat"             => "dc.relation",
        "conformsTo"            => "dc.relation",
        # coverage
        "spatial"               => "dc.coverage",
        "temporal"              => "dc.coverage",
        # terms with no specific element
        "audience"              => "dc.any",
        "accrualMethod"         => "dc.any",
        "accrualPeriodicity"    => "dc.any",
        "accrualPolicy"         => "dc.any",
        "instructionalMethod"   => "dc.any",
        "provenance"            => "dc.any",
        "rightsHolder"          => "dc.any",
        # refinements of "audience" (note: target carries no "dc." prefix)
        "mediator"              => "audience",
        "educationLevel"        => "audience",
        # rights and identifiers
        "accessRights"          => "dc.rights",
        "license"               => "dc.rights",
        "bibliographicCitation" => "dc.identifier",
    );

    my ($prefix, $name) = ($metaname =~ m/^(.*?)\.(.*?)$/);

    # didn't get a match, return param passed in unchanged
    return $metaname unless defined $prefix && $prefix eq "dcterms";
    return $metaname unless defined $target_for{$name};

    return $target_for{$name}."^".$name;
}
433
434
# Extracts metadata fields from the serialised <metadata> element in
# $$textref and stores them in the $metadata hash as name => [values...].
# Each field is also added to the pretty-printed table on $self.
#
# The first element inside <metadata> is treated as a wrapper; every
# "<tag>value</tag>" found inside it becomes one metadata value.  Names are
# built as "prefix.Tag": a "prefix:tag" element keeps its own prefix, an
# unprefixed one takes the wrapper's.  '[' and ']' in values are replaced by
# numeric entities.
sub extract_oai_metadata {
    my ($self, $textref, $metadata) = @_;
    my $outhandle = $self->{'outhandle'};

    # Only handles DC metadata

    $self->open_prettyprint_metadata_table();

    if ($$textref =~ m/<metadata\s*>(.*?)<\/metadata\s*>/s) {
        my $record_xml = $1;

        # locate and remove outermost tag (ignoring any attribute information in top-level tag)
        my ($wrapper_tag, $unparsed_xml) = ($record_xml =~ m/<([^ >]+).*?>(.*?)<\/\1>/s);

        # split tag into namespace and tag name (only the tag name is used)
        my (undef, $top_level_prefix) = ($wrapper_tag =~ m/^(.*?):(.*?)$/);

        # sometimes, the dc namespace is not specified as the prefix in each element (like <dc:title>)
        # but is rather defined in the wrapper element containing the various dc meta elements,
        # like <dc><title></title><creator></creator></dc>.
        # In such a case, we use this wrapper element as the top_level_prefix
        if (!defined $top_level_prefix && defined $wrapper_tag && $wrapper_tag =~ m/dc$/) {
            $top_level_prefix = $wrapper_tag;
        }

        if ($top_level_prefix !~ m/dc$/) {
            print $outhandle
                "Warning: OAIPlugin currently only designed for Dublin Core (or variant) metadata\n",
                "         This recorded metadata section '$top_level_prefix' does not appear to match.\n",
                "         Metadata assumed to be in form: <prefix:tag>value</prefix:tag> and will be converted\n",
                "         into Greenstone metadata as prefix.tag = value\n";
        }

        # Peel one element off the front of the remaining text per iteration.
        while (my ($metaname, $metavalue, $rest) = ($unparsed_xml =~ m/<([^ >]+).*?>(.*?)<\/\1>(.*)/s)) {
            $unparsed_xml = $rest;

            # $metaname =~ s/^(dc:)?(.)/\u$2/; # strip of optional prefix and uppercase first letter
            $metaname =~ s/:/\./;
            if ($metaname !~ m/\./) {
                $metaname = "$top_level_prefix.$metaname";
            }
            $metaname =~ s/\.(.)/\.\u$1/;

            # NOTE(review): the first letter after the dot has just been
            # upper-cased, while the lookup keys in remap_dcterms_metadata()
            # all start lower-case, so this call appears never to remap
            # anything -- confirm whether that is intended before changing
            # the order.
            $metaname = $self->remap_dcterms_metadata($metaname);

            $metavalue =~ s/\[/&#91;/g;
            $metavalue =~ s/\]/&#93;/g;

            # Creates the value list on first use, appends afterwards.
            push(@{$metadata->{$metaname}}, $metavalue);

            $self->add_prettyprint_metadata_line($metaname, $metavalue);
        }
    }

    $self->close_prettyprint_metadata_table();
}
506
## The file extension already identifies the format, so the doctype is not
## inspected: every document is accepted.
sub check_doctype {
    my $doctype_accepted = 1;

    return $doctype_accepted;
}
512
5131;
Note: See TracBrowser for help on using the browser.