source: main/trunk/greenstone2/perllib/plugins/OAIPlugin.pm@ 24403

Last change on this file since 24403 was 24403, checked in by ak19, 11 years ago

Dr Bainbridge has fixed the conflict between OAIPlugin and EmbeddedMetadataPlugin which resulted in the oai tutorial (with the JCDL pictures) going wrong: meta was not attached to the images. Dr Bainbridge solved the problem by introducing a new method in BasePlugin: can_process_this_file_for_metadata, which by default returns undef so that things should work by default mostly. This method has been overridden in OAIPlugin and EmbeddedMetadataPlugin now to do the right thing there.

  • Property svn:keywords set to Author Date Id Revision
File size: 16.1 KB
Line 
1###########################################################################
2#
3# OAIPlugin.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package OAIPlugin;
28
29use unicode;
30use util;
31
32use strict;
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35use ReadXMLFile;
36use ReadTextFile; # needed for subroutine textcat_get_language_encoding
37use metadatautil;
38
# Set up inheritance at compile time. ReadXMLFile is listed first, so its
# methods take precedence over ReadTextFile's during method lookup.
BEGIN {
    @OAIPlugin::ISA = qw(ReadXMLFile ReadTextFile);
}
42
# Choices offered for the "metadata_set" option.
my $set_list = [
    { 'name' => 'auto', 'desc' => '{OAIPlugin.metadata_set.auto}' },
    { 'name' => 'dc',   'desc' => '{OAIPlugin.metadata_set.dc}' },
];

# Option descriptions specific to this plugin. new() appends them to the
# shared "ArgList".
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name' => 'metadata_set',
        'desc' => '{OAIPlugin.metadata_set}',
        'type' => 'enumstring',
        'reqd' => 'no',
        'list' => $set_list,
        'deft' => 'dc',
    },
    {
        'name' => 'document_field',
        'desc' => '{OAIPlugin.document_field}',
        'type' => 'metadata',
        'reqd' => 'no',
        'deft' => 'gi.Sourcedoc',
    },
];

# Plugin-level description. new() appends it to the shared "OptList".
my $options = {
    'name'     => 'OAIPlugin',
    'desc'     => '{OAIPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
75
76
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - handed on unchanged to the parent constructors
#   $hashArgOptLists - hash ref; this plugin's argument and option
#                      descriptions are appended to its "ArgList" and
#                      "OptList" entries
#
# Returns the object created by ReadXMLFile's constructor, blessed into
# $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    # ReadTextFile's constructor is called with an extra trailing 1 and its
    # result is discarded; the object actually used comes from ReadXMLFile.
    # NOTE(review): presumably the call is only for its side effects on the
    # shared argument lists - confirm against ReadTextFile.
    ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # In info-only mode the options are left untouched; otherwise a leading
    # "ex." is trimmed from the document field (if there).
    unless ($self->{'info_only'}) {
        $self->{'document_field'} =~ s/^ex\.//;
    }

    return bless $self, $class;
}
96
# Default value of the "process_exp" option: a case-insensitive regular
# expression matching names that end in ".oai".
sub get_default_process_exp {
    my ($self) = @_;

    return q{(?i)(\.oai)$};
}
102
# Name of the document type this plugin handles.
sub get_doctype {
    my ($self) = @_;

    return "OAI-PMH";
}
108
# Called at the start of a document: resets the per-document parsing state
# (raw XML buffer, "inside <metadata>" flag and collected metadata).
sub xml_start_document {
    my ($self) = @_;

    $self->{'rawxml'}           = "";
    $self->{'in_metadata_node'} = 0;
    $self->{'saved_metadata'}   = {};
}
115
# Called at the end of a document: nothing to do for this plugin.
sub xml_end_document {
    return;
}
118
# Doctype handler: only reports progress, the doctype itself is not checked.
#
# Writes "OAIPlugin: processing <file>" to the plugin's output handle when
# verbosity is above 1, and a <Processing .../> style line to STDERR when the
# 'gli' flag is set. The parser arguments are accepted but not used.
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    ##die "" if ($name !~ /^OAI-PMH$/);

    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OAIPlugin: processing $self->{'file'}\n";
    }
    print STDERR "<Processing n='$self->{'file'}' p='OAIPlugin'>\n" if $self->{'gli'};

}
131
132
# Start-tag handler: re-serialises the opening tag and appends it to the raw
# XML buffer and, while inside a <metadata> element, to the metadata buffer.
#
# Arguments: the parser object (unused) and the element name. The element's
# attributes are read from the global %_ (XML::Parser's Stream style passes
# them this way).
#
# Attributes are written in sorted name order so that the rebuilt text is the
# same on every run; hash key order is otherwise not guaranteed by Perl.
# NOTE(review): attribute values are emitted unquoted, exactly as before.
sub xml_start_tag {
    my ($self, $expat, $element) = @_;

    my %attr_hash = %_;

    my $attr = join("", map { " $_=$attr_hash{$_}" } sort keys %attr_hash);
    my $start_tag = "<$element$attr>";

    $self->{'rawxml'} .= $start_tag;

    # A <metadata> element starts a fresh metadata buffer.
    if ($element eq "metadata") {
        $self->{'in_metadata_node'} = 1;
        $self->{'metadata_xml'} = "";
    }

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $start_tag;
    }
}
153
# End-tag handler: appends the closing tag to the raw XML buffer and, while
# inside a <metadata> element, to the metadata buffer. When the closing tag
# is </metadata>, the buffered metadata XML is handed to
# extract_oai_metadata(), which fills $self->{'saved_metadata'}, and the
# "inside <metadata>" flag is cleared.
#
# Arguments: the parser object (unused) and the element name.
sub xml_end_tag {
    my ($self, $expat, $element) = @_;

    my $end_tag = "</$element>";

    $self->{'rawxml'} .= $end_tag;
    $self->{'metadata_xml'} .= $end_tag if $self->{'in_metadata_node'};

    if ($element eq "metadata") {
        # results go into saved_metadata rather than $self->{'metadata'}
        $self->extract_oai_metadata(\$self->{'metadata_xml'}, $self->{'saved_metadata'});

        $self->{'in_metadata_node'} = 0;
    }
}
175
# Text handler: the text of the current node is read from the global $_ and
# appended to the raw XML buffer and, while inside a <metadata> element, to
# the metadata buffer. The parser argument is not used.
sub xml_text {
    my ($self, $expat) = @_;

    my $text = $_;

    $self->{'rawxml'} .= $text;

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $text;
    }
}
186
# For this plugin, a file qualifies for the metadata pass exactly when
# can_process_this_file() accepts it; all arguments are passed straight on.
sub can_process_this_file_for_metadata {
    my ($self, @args) = @_;

    return $self->can_process_this_file(@args);
}
192
193
# Metadata pass: parses an OAI file and hands the metadata found in it back
# to the caller through the $extrameta* structures.
#
# Arguments used here:
#   $base_dir, $file - location of the OAI file
#   $extrametakeys   - array ref; the (regex) key for this record is pushed
#   $extrametadata   - hash ref; key => extracted metadata
#   $extrametafile   - hash ref; when the source document exists,
#                      key => { $file => full path of the OAI file }
#   $gli             - passed on to parse_file()
# $pluginfo, $block_hash, $processor and $aux are accepted but not used.
#
# Returns undef when the file is not handled by this plugin or fails to
# parse, 1 otherwise.
#
# Side effect: records in $self->{'oai-files'}->{$file} whether a local
# source document was found and, if not, keeps the raw XML so that read()
# can build a document from it.
sub metadata_read {
    my ($self, $pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # can we process this file??
    my ($full_path) = util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file_for_metadata($full_path);

    # Whatever the outcome of the parse, the collected metadata is detached
    # from the plugin object.
    my $parsed_ok = $self->parse_file($full_path, $file, $gli);
    my $record_metadata = $self->{'saved_metadata'};
    $self->{'saved_metadata'} = undef;
    return undef unless $parsed_ok;

    # add the pretty metadata table as metadata
    $record_metadata->{'prettymd'} = $self->{'ppmd_table'};
    $self->{'ppmd_table'} = undef;

    # Values of the document field, looked up with and without the "ex."
    # prefix.
    my $doc_field = $self->{'document_field'};
    my $url_array = $record_metadata->{$doc_field};
    $url_array = $record_metadata->{"ex.$doc_field"} unless defined $url_array;

    my $srcdoc_exists = 0;
    my $oai_dir = util::filename_head($full_path);

    # filenames in extrametadata must be relative to current dir, as
    # DirectoryPlugin adds path info on itself.
    # The default key is the OAI file's own name; this assumes there will
    # only be one record per oai file - is this always the case??
    my ($metadata_key) = $file =~ /([^\\\/]+)$/;

    if (defined $url_array) {
        foreach my $url (@{$url_array}) {
            # remote documents cannot be the local source document
            next if $url =~ m/^(https?|ftp):/;

            my $candidate = util::filename_cat($oai_dir, $url);
            next unless -e $candidate;

            # First value that names an existing file wins.
            $srcdoc_exists = 1;
            # get the slashes the right way, use filename_cat
            $metadata_key = util::filename_cat($url);
            last;
        }
    }

    $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = $srcdoc_exists;
    unless ($srcdoc_exists) {
        # save the rawxml for the source document
        $self->{'oai-files'}->{$file}->{'rawxml'} = $self->{'rawxml'};
        $self->{'rawxml'} = "";
    }

    # return all the metadata we have extracted to the caller.
    # Directory plug will pass it back in at read time, so we don't need to
    # extract it again.
    # extrametadata keys should be regular expressions
    $metadata_key = util::filename_to_regex($metadata_key);
    $extrametadata->{$metadata_key} = $record_metadata;
    push @{$extrametakeys}, $metadata_key;

    if ($srcdoc_exists) {
        $extrametafile->{$metadata_key} = {}
            unless defined $extrametafile->{$metadata_key};
        # maps the file to full path
        $extrametafile->{$metadata_key}->{$file} = $full_path;
    }

    return 1;
}
274
275
# Import pass for an OAI file previously seen by metadata_read().
#
# Returns:
#   undef - the file was not registered by metadata_read()
#   0     - a source document exists; nothing is built here, but the file is
#           not passed on to the rest of the plugins
#   -1    - process() failed
#   1     - a document was built from the raw XML and handed to $processor
#
# $metadata is the metadata passed in from earlier plugins (it includes what
# metadata_read() extracted). $block_hash, $maxdocs and $total_count are
# accepted but not used.
sub read {
    my ($self, $pluginfo, $base_dir, $file, $block_hash, $metadata,
        $processor, $maxdocs, $total_count, $gli) = @_;

    return undef unless defined $self->{'oai-files'}->{$file};

    if ($self->{'oai-files'}->{$file}->{'srcdoc_exists'}) {
        # do nothing more - all the metadata has been extracted and
        # associated with the srcdoc.
        # no more need to access details of this $file => tidy up as you go
        delete $self->{'oai-files'}->{$file};
        return 0; # not processed here, but don't pass on to rest of plugins
    }

    my $filename = ($base_dir =~ /\w/)
        ? util::filename_cat($base_dir, $file)
        : $file;

    # Do encoding stuff on metadata
    my ($language, $encoding) = $self->textcat_get_language_encoding($filename);

    # create a new document
    my $doc_obj = doc->new($filename, "indexed_doc", $self->{'file_rename_method'});
    my $top_section = $doc_obj->get_top_section;
    my $plugin_type = $self->{'plugin_type'};

    my $plugin_filename_encoding = $self->{'filename_encoding'};
    my $filename_encoding =
        $self->deduce_filename_encoding($file, $metadata, $plugin_filename_encoding);
    $self->set_Source_metadata($doc_obj, $filename, $filename_encoding);

    foreach my $pair (["Language", $language],
                      ["Encoding", $encoding],
                      ["Plugin",   $plugin_type]) {
        $doc_obj->add_utf8_metadata($top_section, $pair->[0], $pair->[1]);
    }
    $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
    $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    # this will include all the metadata from the oai file that we extracted
    # during metadata_read
    $self->extra_metadata($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj, using the raw XML saved by
    # metadata_read(); the bookkeeping entry is no longer needed afterwards
    my $text = $self->{'oai-files'}->{$file}->{'rawxml'};
    delete $self->{'oai-files'}->{$file};

    if (!defined($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata($doc_obj);

    # add an OID
    $self->add_OID($doc_obj);

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'}++;

    return 1; # processed the file
}
343
344
# do plugin specific processing of doc_obj
#
# Escapes the markup characters in the text referenced by $textref (the
# referenced string is modified in place) and adds the result as the text of
# the document's top section. Reports progress to STDERR when $gli is set
# and to the plugin's output handle when verbosity is above 1.
#
# $pluginfo, $base_dir and $metadata are accepted but not used.
# Always returns 1.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    print STDERR "<Processing n='$file' p='OAIPlugin'>\n" if ($gli);
    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OAIPlugin: processing $file\n";
    }

    my $cursection = $doc_obj->get_top_section();

    # add text to document object

    # Angle and square brackets are replaced by entities. No replacement
    # contains any of the four characters, so one pass gives the same result
    # as replacing them one after another.
    my %entity_for = (
        '<' => '&lt;',
        '>' => '&gt;',
        '[' => '&#91;',
        ']' => '&#93;',
    );
    ${$textref} =~ s/([<>\[\]])/$entity_for{$1}/g;

    $doc_obj->add_utf8_text($cursection, ${$textref});

    return 1;
}
371
372
373# Improvement is to merge this with newer version in MetadataPass
374
# Starts a new pretty-print metadata table: $self->{'ppmd_table'} is
# overwritten with a newline followed by the opening <table> tag.
sub open_prettyprint_metadata_table
{
    my ($self) = @_;

    my @table_attributes = (
        "width=100% cellspacing=2",
        "style='border-bottom: 4px solid #000080'",
    );

    $self->{'ppmd_table'} = "\n<table " . join(" ", @table_attributes) . ">";
}
384
# Appends one row to the pretty-print metadata table: the metadata name in
# the first cell and the value, after passing through util::hyperlink_text(),
# in the second.
sub add_prettyprint_metadata_line
{
    my ($self, $metaname, $metavalue_utf8) = @_;

    my $linked_value = util::hyperlink_text($metavalue_utf8);

    $self->{'ppmd_table'} .= join("",
        " <tr bgcolor=#b5d3cd>\n",
        " <td width=30%>\n",
        " $metaname\n",
        " </td>\n",
        " <td>\n",
        " $linked_value\n",
        " </td>\n",
        " </tr>\n",
    );
}
402
# Finishes the pretty-print metadata table by appending the closing tag.
sub close_prettyprint_metadata_table
{
    my ($self) = @_;

    my $closing_tag = "</table>\n";
    $self->{'ppmd_table'} .= $closing_tag;
}
409
# Qualified Dublin Core terms and the dc element each one refines, grouped
# by target element.
#
# These terms are now top level elements in our qualified dc metadata set
# and are therefore deliberately not mapped: audience, accrualMethod,
# accrualPeriodicity, accrualPolicy, instructionalMethod, provenance,
# rightsHolder.
my $qualified_dc_mapping = {
    "alternative" => "dc.title",
    (map { $_ => "dc.description" } qw(tableOfContents abstract)),
    (map { $_ => "dc.date" } qw(
        created valid available issued modified
        dateAccepted dateCopyrighted dateSubmitted
    )),
    (map { $_ => "dc.format" } qw(extent medium)),
    (map { $_ => "dc.relation" } qw(
        isVersionOf hasVersion isReplacedBy replaces
        isRequiredBy requires isPartOf hasPart
        isReferencedBy references isFormatOf hasFormat
        conformsTo
    )),
    (map { $_ => "dc.coverage" } qw(spatial temporal)),
    (map { $_ => "dc.audience" } qw(mediator educationLevel)),
    (map { $_ => "dc.rights" } qw(accessRights license)),
    "bibliographicCitation" => "dc.identifier",
};

# Converts a qualified dc term to its Greenstone form.
#
# $metaname is "<prefix>.<name>". When <name> is one of the qualified terms
# above, the result is "<dc element>^<name>" (e.g. "dc.spatial" becomes
# "dc.coverage^spatial"); otherwise $metaname is returned unchanged.
sub remap_dc_metadata
{
    my ($self, $metaname) = @_;

    # Everything after the first "." is the term; the prefix is ignored.
    my (undef, $name) = ($metaname =~ m/^(.*?)\.(.*?)$/);

    my $dc_element = $qualified_dc_mapping->{$name};

    return defined $dc_element
        ? $dc_element . "^" . $name
        : $metaname; # didn't get a match, return param passed in unchanged
}
470
471
# Extracts the metadata elements from the XML text of a <metadata> node.
#
# Arguments:
#   $textref  - reference to the XML text, including the <metadata> tags
#   $metadata - hash ref that receives the values; each element is stored
#               under "ex.<set>.<Name>" as an array ref of values
#
# As a side effect the pretty-print table in $self->{'ppmd_table'} is
# rebuilt with one line per extracted value.
#
# If the <metadata> node contains no wrapper element, nothing is extracted
# and the table is left empty; the wrapper is checked explicitly so that no
# pattern match is attempted on an undefined value.
sub extract_oai_metadata {
    my ($self, $textref, $metadata) = @_;

    $self->open_prettyprint_metadata_table();

    if (${$textref} =~ m/<metadata\s*>(.*?)<\/metadata\s*>/s)
    {
        my $metadata_text = $1;

        # locate and remove outermost tag (ignoring any attribute information
        # in top-level tag)
        my ($outer_tagname, $inner_metadata_text) =
            ($metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/s);

        if (defined $outer_tagname)
        {
            # split tag into namespace and tag name; only the tag name is
            # needed.
            # sometimes, the dc namespace is not specified as the prefix in
            # each element (like <dc:title>) but is rather defined in the
            # wrapper element containing the various dc meta elements, like
            # <dc><title></title><creator></creator></dc>.
            # In such a case, we use this wrapper element as the
            # top_level_prefix
            my (undef, $top_level_prefix) = ($outer_tagname =~ m/^(.*?):(.*?)$/);

            # if there was no prefix, then the tag itself becomes the
            # top_level_prefix
            $top_level_prefix = $outer_tagname unless defined $top_level_prefix;

            # process each element one by one; the third capture is the text
            # that remains to be scanned
            while ($inner_metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>(.*)/s)
            {
                my $metaname = $1;
                my $metavalue = $2;
                $inner_metadata_text = $3;

                # greenstone uses . for namespace, while oai uses :
                $metaname =~ s/:/\./;
                # if there is no namespace, then we use the outer tag name or
                # namespace for this element
                if ($metaname !~ m/\./)
                {
                    $metaname = "$top_level_prefix.$metaname";
                }

                # if metadata set is auto, leave as is, otherwise convert to
                # specified namespace
                if ($self->{'metadata_set'} ne "auto") {
                    if ($metaname !~ /^gi\./) { # hack to not overwrite gi metadata
                        $metaname =~ s/^([^\.]*)\./$self->{'metadata_set'}\./;
                        if ($self->{'metadata_set'} eq "dc") {
                            # convert qualified dc terms to gs version, e.g.
                            # spatial becomes coverage^spatial
                            $metaname = $self->remap_dc_metadata($metaname);
                        }
                    }
                }

                # uppercase the first char of the name
                $metaname =~ s/\.(.)/\.\u$1/;
                $metavalue =~ s/\[/&#91;/g;
                $metavalue =~ s/\]/&#93;/g;

                # so that GLI can see this metadata, store here as
                # ex.dc.Title etc; repeated elements accumulate in one list
                my $ex_metaname = "ex.$metaname";
                push @{ $metadata->{$ex_metaname} }, $metavalue;

                # but don't add ex to the pretty print line
                $self->add_prettyprint_metadata_line($metaname, $metavalue);
            }
        }
    }

    $self->close_prettyprint_metadata_table();
}
553
## The file extension already tells us this is an OAI file, so the doctype
## does not need to be checked: every document is accepted.
sub check_doctype {
    return 1;
}
559
5601;
Note: See TracBrowser for help on using the repository browser.