source: main/trunk/greenstone2/perllib/plugins/OAIPlugin.pm@ 24403

Last change on this file since 24403 was 24403, checked in by ak19, 13 years ago

Dr Bainbridge has fixed the conflict between OAIPlugin and EmbeddedMetadataPlugin which resulted in the oai tutorial (with the JCDL pictures) going wrong: meta was not attached to the images. Dr Bainbridge solved the problem by introducing a new method in BasePlugin: can_process_this_file_for_metadata, which by default returns undef so that things should work by default mostly. This method has been overridden in OAIPlugin and EmbeddedMetadataPlugin now to do the right thing there.

  • Property svn:keywords set to Author Date Id Revision
File size: 16.1 KB
RevLine 
[4726]1###########################################################################
2#
# OAIPlugin.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
[15872]27package OAIPlugin;
[4726]28
29use unicode;
30use util;
31
[10254]32use strict;
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
[15872]35use ReadXMLFile;
[17066]36use ReadTextFile; # needed for subroutine textcat_get_language_encoding
[17216]37use metadatautil;
[9958]38
# Set up inheritance at compile time. ReadTextFile is listed so that
# textcat_get_language_encoding can be called on this plugin.
BEGIN {
    @OAIPlugin::ISA = qw(ReadXMLFile ReadTextFile);
}
42
# Values accepted by the "metadata_set" option.
my $set_list = [
    {
        'name' => "auto",
        'desc' => "{OAIPlugin.metadata_set.auto}",
    },
    {
        'name' => "dc",
        'desc' => "{OAIPlugin.metadata_set.dc}",
    },
];

# Options this plugin adds on top of the ones it inherits.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
    {
        'name' => "metadata_set",
        'desc' => "{OAIPlugin.metadata_set}",
        'type' => "enumstring",
        'reqd' => "no",
        'list' => $set_list,
        'deft' => "dc",
    },
    {
        'name' => "document_field",
        'desc' => "{OAIPlugin.document_field}",
        'type' => "metadata",
        'reqd' => "no",
        'deft' => "gi.Sourcedoc",
    },
];

# Plugin description record; new() pushes it onto the shared "OptList".
my $options = {
    'name'     => "OAIPlugin",
    'desc'     => "{OAIPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'explodes' => "yes",
    'args'     => $arguments,
};
[4747]75
[10254]76
# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed straight through to the parent constructors
#   $hashArgOptLists - hash ref whose "ArgList"/"OptList" arrays receive this
#                      plugin's argument and option descriptions
# Returns the object built by ReadXMLFile, re-blessed into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    # The ReadTextFile object itself is discarded; the call is made only for
    # whatever it contributes to the shared lists.
    # NOTE(review): meaning of the trailing 1 is defined by ReadTextFile - confirm.
    ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # When only plugin information is wanted the options are left untouched;
    # otherwise trim a leading "ex." from the document field (if there).
    $self->{'document_field'} =~ s/^ex\.// unless $self->{'info_only'};

    return bless $self, $class;
}
96
# Default value for the "process_exp" option: a case-insensitive match on
# file names ending in ".oai". No argument is consulted, so this can be
# called either as a method or as a plain function.
sub get_default_process_exp {
    return '(?i)(\.oai)$';
}
102
# Name of the XML document type handled by this plugin.
sub get_doctype {
    my ($self) = @_;

    my $doctype = "OAI-PMH";
    return $doctype;
}
108
# Resets the per-document parsing state held on $self: we are not inside a
# <metadata> element, no raw XML has been collected, and the store of
# extracted metadata is a fresh empty hash.
sub xml_start_document {
    my ($self) = @_;

    @{$self}{ 'in_metadata_node', 'rawxml' } = (0, "");
    $self->{'saved_metadata'} = {};
}
[4726]115
# Nothing needs doing at the end of a document; present so the parser has a
# handler to call.
sub xml_end_document {
    return;
}
[4726]118
# Doctype handler. The doctype name is deliberately not validated (see the
# disabled check below); the handler only reports which file is being
# processed: to the plugin's output handle when verbosity > 1, and to STDERR
# in the tagged form when the 'gli' flag is set.
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    ##die "" if ($name !~ /^OAI-PMH$/);

    my $file = $self->{'file'};

    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OAIPlugin: processing $file\n";
    }

    print STDERR "<Processing n='$file' p='OAIPlugin'>\n" if $self->{'gli'};
}
[4726]131
[9958]132
# Start-tag handler: rebuilds the opening tag as text and appends it to the
# raw XML being collected (and to the <metadata> fragment while inside one).
#   $expat   - parser object (unused here)
#   $element - name of the element being opened
# The element's attributes are read from the global hash %_.
#
# Attribute values are written quoted and with & < > " escaped, in sorted
# name order, so that the rebuilt tag is well-formed, reproducible, and can
# never contain a stray '>' that would end the tag early for the regular
# expressions that later pick the fragment apart.
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my %attr_hash = %_;

    my $attr = "";
    foreach my $attr_name (sort keys %attr_hash) {
        my $attr_value = defined $attr_hash{$attr_name} ? $attr_hash{$attr_name} : "";
        $attr_value =~ s/&/&amp;/g;
        $attr_value =~ s/</&lt;/g;
        $attr_value =~ s/>/&gt;/g;
        $attr_value =~ s/"/&quot;/g;
        $attr .= " $attr_name=\"$attr_value\"";
    }

    $self->{'rawxml'} .= "<$element$attr>";

    # a <metadata> element starts a fresh fragment
    if ($element eq "metadata") {
        $self->{'in_metadata_node'} = 1;
        $self->{'metadata_xml'} = "";
    }

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= "<$element$attr>";
    }
}
[4726]153
# End-tag handler: appends the closing tag to the raw XML (and to the
# <metadata> fragment while inside one). When the element being closed is
# <metadata> itself, the completed fragment is handed to
# extract_oai_metadata, which fills $self->{'saved_metadata'}, and the
# "inside metadata" flag is cleared.
sub xml_end_tag {
    my ($self, $expat, $element) = @_;

    my $closing_tag = "</$element>";

    $self->{'rawxml'} .= $closing_tag;
    $self->{'metadata_xml'} .= $closing_tag if $self->{'in_metadata_node'};

    if ($element eq "metadata") {
        $self->extract_oai_metadata(\$self->{'metadata_xml'}, $self->{'saved_metadata'});
        $self->{'in_metadata_node'} = 0;
    }
}
[4726]175
# Text handler: the character data arrives in the global $_ and is appended
# to the raw XML, and also to the <metadata> fragment while inside one.
sub xml_text {
    my ($self, $expat) = @_;

    my $text = $_;

    $self->{'rawxml'} .= $text;
    $self->{'metadata_xml'} .= $text if $self->{'in_metadata_node'};
}
[4726]186
# For this plugin a file is eligible for the metadata pass exactly when it
# is eligible for normal processing, so the question is simply forwarded to
# can_process_this_file with the same arguments.
sub can_process_this_file_for_metadata {
    my ($self, @args) = @_;

    return $self->can_process_this_file(@args);
}
192
193
# Metadata pass over one OAI file.
#
# Parses the file, collects the metadata extracted while parsing, and hands
# it back through $extrametakeys / $extrametadata (and $extrametafile) so
# that the caller can attach it at read time. If one of the values of the
# configured document field names an existing local file (relative to the
# OAI file's directory), the metadata is keyed on that source document;
# otherwise it is keyed on the OAI file itself and the raw XML is kept on
# $self for read() to turn into a document.
#
# Returns undef when the file is not ours or fails to parse, 1 otherwise.
sub metadata_read {
    my ($self, $pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    if (!$self->can_process_this_file_for_metadata($filename_full_path)) {
        return undef;
    }

    # whatever the outcome of the parse, take the collected metadata off $self
    my $parsed_ok = $self->parse_file($filename_full_path, $file, $gli);
    my $new_metadata = $self->{'saved_metadata'};
    $self->{'saved_metadata'} = undef;
    return undef unless $parsed_ok;

    # add the pretty metadata table as metadata
    $new_metadata->{'prettymd'} = $self->{'ppmd_table'};
    $self->{'ppmd_table'} = undef;

    # values of the document field, looked up with and without the ex. prefix
    my $document_metadata_field = $self->{'document_field'};
    my $url_array = $new_metadata->{$document_metadata_field};
    $url_array = $new_metadata->{"ex.$document_metadata_field"} unless defined $url_array;

    my $filename_dir = &util::filename_head($filename_full_path);

    # filenames in extrametadata must be relative to current dir, as
    # DirectoryPlugin adds path info on itself
    # (this assumes there will only be one record per oai file)
    my ($filename_for_metadata) = $file =~ /([^\\\/]+)$/;

    # the first non-remote value that exists on disk is the source document
    my $srcdoc_exists = 0;
    foreach my $url (defined $url_array ? @$url_array : ()) {
        next if $url =~ m/^(https?|ftp):/;

        my $src_filename = &util::filename_cat($filename_dir, $url);
        next unless -e $src_filename;

        $srcdoc_exists = 1;
        # get the slashes the right way, use filename_cat
        $filename_for_metadata = &util::filename_cat($url);
        last;
    }

    my $oai_record = ($self->{'oai-files'}->{$file} ||= {});
    $oai_record->{'srcdoc_exists'} = $srcdoc_exists;
    if (!$srcdoc_exists) {
        # save the rawxml for the source document
        $oai_record->{'rawxml'} = $self->{'rawxml'};
        $self->{'rawxml'} = "";
    }

    # return all the metadata we have extracted to the caller.
    # Directory plug will pass it back in at read time, so we don't need to
    # extract it again. extrametadata keys should be regular expressions.
    $filename_for_metadata = &util::filename_to_regex($filename_for_metadata);
    $extrametadata->{$filename_for_metadata} = $new_metadata;
    push(@$extrametakeys, $filename_for_metadata);

    if ($srcdoc_exists) {
        # maps the file to full path
        $extrametafile->{$filename_for_metadata}->{$file} = $filename_full_path;
    }

    return 1;
}
274
275
# Document pass over one OAI file previously seen by metadata_read.
#
# Returns undef when metadata_read recorded nothing for $file, 0 when the
# record's metadata was attached to an existing source document (nothing to
# build here, but the file should not be offered to other plugins), -1 when
# process() fails, and 1 once a document built from the saved raw XML has
# been handed to $processor.
sub read {
    my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata,
        $processor, $maxdocs, $total_count, $gli) = @_;

    my $oai_record = $self->{'oai-files'}->{$file};
    return undef unless defined $oai_record;

    if ($oai_record->{'srcdoc_exists'}) {
        # do nothing more - all the metadata has been extracted and
        # associated with the srcdoc
        # no more need to access details of this $file => tidy up as you go
        delete $self->{'oai-files'}->{$file};
        return 0; # not processed here, but don't pass on to rest of plugins
    }

    my $filename = ($base_dir =~ /\w/) ? &util::filename_cat($base_dir, $file) : $file;

    # Do encoding stuff on metadata
    my ($language, $encoding) = $self->textcat_get_language_encoding($filename);

    # create a new document
    my $doc_obj = doc->new($filename, "indexed_doc", $self->{'file_rename_method'});
    my $top_section = $doc_obj->get_top_section;
    my $plugin_type = $self->{'plugin_type'};

    my $plugin_filename_encoding = $self->{'filename_encoding'};
    my $filename_encoding = $self->deduce_filename_encoding($file, $metadata, $plugin_filename_encoding);
    $self->set_Source_metadata($doc_obj, $filename, $filename_encoding);

    foreach my $name_value (["Language", $language],
                            ["Encoding", $encoding],
                            ["Plugin",   $plugin_type]) {
        $doc_obj->add_utf8_metadata($top_section, @$name_value);
    }
    $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
    $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    # this will include all the metadata from the oai file that we extracted
    # during metadata_read
    $self->extra_metadata($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj, using the raw XML saved
    # during metadata_read; the saved record is no longer needed afterwards
    my $text = $self->{'oai-files'}->{$file}->{'rawxml'};
    delete $self->{'oai-files'}->{$file};

    my $process_status = $self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
    if (!defined $process_status) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata($doc_obj);

    # add an OID
    $self->add_OID($doc_obj);

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'}++;

    return 1; # processed the file
}
343
344
# Plugin specific processing of doc_obj: the raw XML referenced by $textref
# is made displayable by replacing < > [ ] with character entities (the
# string is modified in place) and is then added as the text of the
# document's top section. Always returns 1.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    print STDERR "<Processing n='$file' p='OAIPlugin'>\n" if ($gli);
    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OAIPlugin: processing $file\n";
    }

    my $cursection = $doc_obj->get_top_section();

    # None of the replacement entities contains one of the four characters
    # being replaced, so a single pass is equivalent to replacing them one
    # character at a time.
    my %entity_for = (
        '<' => '&lt;',
        '>' => '&gt;',
        '[' => '&#91;',
        ']' => '&#93;',
    );
    $$textref =~ s/([<>\[\]])/$entity_for{$1}/g;

    # add text to document object
    $doc_obj->add_utf8_text($cursection, $$textref);

    return 1;
}
371
372
[9958]373# Improvement is to merge this with newer version in MetadataPass
[4726]374
# Starts a new pretty-print metadata table on $self->{'ppmd_table'},
# discarding anything that was there before.
sub open_prettyprint_metadata_table
{
    my ($self) = @_;

    my @table_attributes = (
        "width=100% cellspacing=2",
        "style='border-bottom: 4px solid #000080'",
    );

    $self->{'ppmd_table'} = "\n<table " . join(" ", @table_attributes) . ">";
}
384
# Appends one name/value row to the pretty-print metadata table. The value
# is first passed through util::hyperlink_text.
sub add_prettyprint_metadata_line
{
    my ($self, $metaname, $metavalue_utf8) = @_;

    $metavalue_utf8 = &util::hyperlink_text($metavalue_utf8);

    $self->{'ppmd_table'} .= join("",
        " <tr bgcolor=#b5d3cd>\n",
        " <td width=30%>\n",
        " $metaname\n",
        " </td>\n",
        " <td>\n",
        " $metavalue_utf8\n",
        " </td>\n",
        " </tr>\n",
    );
}
402
# Terminates the pretty-print metadata table held on $self->{'ppmd_table'}.
sub close_prettyprint_metadata_table
{
    my ($self) = @_;

    my $table_end = "</table>\n";
    $self->{'ppmd_table'} .= $table_end;
}
409
# Qualified Dublin Core terms and the dc element each one is stored under.
# remap_dc_metadata turns e.g. "spatial" into "dc.coverage^spatial".
my $qualified_dc_mapping = {
    "alternative"           => "dc.title",
    "tableOfContents"       => "dc.description",
    "abstract"              => "dc.description",
    "created"               => "dc.date",
    "valid"                 => "dc.date",
    "available"             => "dc.date",
    "issued"                => "dc.date",
    "modified"              => "dc.date",
    "dateAccepted"          => "dc.date",
    "dateCopyrighted"       => "dc.date",
    "dateSubmitted"         => "dc.date",
    "extent"                => "dc.format",
    "medium"                => "dc.format",
    "isVersionOf"           => "dc.relation",
    "hasVersion"            => "dc.relation",
    "isReplacedBy"          => "dc.relation",
    "replaces"              => "dc.relation",
    "isRequiredBy"          => "dc.relation",
    "requires"              => "dc.relation",
    "isPartOf"              => "dc.relation",
    "hasPart"               => "dc.relation",
    "isReferencedBy"        => "dc.relation",
    "references"            => "dc.relation",
    "isFormatOf"            => "dc.relation",
    "hasFormat"             => "dc.relation",
    "conformsTo"            => "dc.relation",
    "spatial"               => "dc.coverage",
    "temporal"              => "dc.coverage",
# these are now top level elements in our qualified dc metadata set
#   "audience"              => "dc.any",
#   "accrualMethod"         => "dc.any",
#   "accrualPeriodicity"    => "dc.any",
#   "accrualPolicy"         => "dc.any",
#   "instructionalMethod"   => "dc.any",
#   "provenance"            => "dc.any",
#   "rightsHolder"          => "dc.any",
    "mediator"              => "dc.audience",
    "educationLevel"        => "dc.audience",
    "accessRights"          => "dc.rights",
    "license"               => "dc.rights",
    "bibliographicCitation" => "dc.identifier"
};

# Converts a qualified dc term to its element^qualifier form.
#   $metaname - metadata name of the form "prefix.name"
# Returns "<mapped element>^<name>" when <name> (everything after the first
# '.') is a key of $qualified_dc_mapping; otherwise returns $metaname
# unchanged. A name without any '.' is also returned unchanged.
sub remap_dc_metadata
{
    my $self = shift(@_);

    my ($metaname) = @_;

    # only the part after the first '.' is looked up; the prefix is not used
    my ($name) = ($metaname =~ m/^.*?\.(.*?)$/);

    # $name is undefined when $metaname contains no '.', so check it before
    # using it as a hash key
    if (defined $name && defined $qualified_dc_mapping->{$name}) {
        return $qualified_dc_mapping->{$name}."^".$name;
    }

    return $metaname; # didn't get a match, return param passed in unchanged
}
470
471
# Extracts the metadata elements from a rebuilt <metadata>...</metadata>
# fragment.
#   $textref  - reference to the XML text of the fragment
#   $metadata - hash ref that receives the values: each key is "ex." plus
#               the metadata name, each value an array ref of the values
#               found for that name
# The same name/value pairs (without the "ex." prefix) are also written to
# the pretty-print table on $self, which is opened and closed here even when
# nothing is found.
#
# The opening <metadata> tag may carry attributes, and a fragment with no
# wrapper element inside it is simply skipped.
sub extract_oai_metadata {
    my $self = shift (@_);
    my ($textref, $metadata) = @_;

    $self->open_prettyprint_metadata_table();

    if ($$textref =~ m/<metadata(?:\s[^>]*)?>(.*?)<\/metadata\s*>/s)
    {
        my $metadata_text = $1;

        # locate and remove outermost tag (ignoring any attribute information in top-level tag)
        my ($outer_tagname,$inner_metadata_text) = ($metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/s);

        # nothing to do when <metadata> holds no wrapper element at all
        if (defined $outer_tagname)
        {
            # split tag into namespace and tag name; only the tag name is needed
            my ($top_level_prefix) = ($outer_tagname =~ m/^.*?:(.*?)$/);

            # sometimes, the dc namespace is not specified as the prefix in each element (like <dc:title>)
            # but is rather defined in the wrapper element containing the various dc meta elements,
            # like <dc><title></title><creator></creator></dc>.
            # In such a case, we use this wrapper element as the top_level_prefix

            # if there was no prefix, then the tag itself becomes the top_level_prefix
            if (!defined $top_level_prefix) {
                $top_level_prefix = $outer_tagname;
            }

            # process each element one by one, consuming the text as we go
            while ($inner_metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>(.*)/s)
            {
                my $metaname = $1;
                my $metavalue = $2;
                $inner_metadata_text = $3;

                # greenstone uses . for namespace, while oai uses :
                $metaname =~ s/:/\./;
                # if there is no namespace, then we use the outer tag name or
                # namespace for this element
                if ($metaname !~ m/\./)
                {
                    $metaname = "$top_level_prefix.$metaname";
                }

                # if metadata set is auto, leave as is, otherwise convert to
                # specified namespace
                if ($self->{'metadata_set'} ne "auto") {
                    if ($metaname !~ /^gi\./) { # hack to not overwrite gi metadata
                        $metaname =~ s/^([^\.]*)\./$self->{'metadata_set'}\./;
                        if ($self->{'metadata_set'} eq "dc") {
                            # convert qualified dc terms to gs version, e.g.
                            # spatial becomes coverage^spatial
                            $metaname = $self->remap_dc_metadata($metaname);
                        }
                    }
                }

                # uppercase the first char of the name
                $metaname =~ s/\.(.)/\.\u$1/;
                $metavalue =~ s/\[/&#91;/g;
                $metavalue =~ s/\]/&#93;/g;

                # so that GLI can see this metadata, store here as ex.dc.Title etc
                my $ex_metaname = "ex.$metaname";
                push(@{$metadata->{$ex_metaname}}, $metavalue);

                # but don't add ex to the pretty print line
                $self->add_prettyprint_metadata_line($metaname, $metavalue);
            }
        }
    }

    $self->close_prettyprint_metadata_table();
}
553
[13886]554## we know from the file extension, so doesn't need to check the doctype
555sub check_doctype {
556
557 return 1;
558}
559
[4726]5601;
Note: See TracBrowser for help on using the repository browser.