source: main/trunk/greenstone2/perllib/plugins/OAIPlugin.pm@ 27283

Last change on this file since 27283 was 27283, checked in by ak19, 11 years ago
  1. Fixed an encoding bug that Diego helpfully discovered. Metadata extracted from downloaded .oai files were double encoded. This was because we forgot to add a call to decode() after reading in the contents of the oai file, something we had remembered to do in many other plugins. 2. John Thompson recommended the very helpful Dump(dollar-var) function of Devel::Peek which dumps information about any var to stderr, including encoding information. For strings marked utf8 it also prints out hex and octal values for chars outside of Basic Latin.
  • Property svn:keywords set to Author Date Id Revision
File size: 18.0 KB
Line 
1###########################################################################
2#
3# OAIPlugin.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package OAIPlugin;
28
29# Devel::Peek's Dump($var) function is useful for debugging encoding issues.
30#use Devel::Peek;
31use Encode;
32use extrametautil;
33use unicode;
34use util;
35
36use strict;
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
39use ReadXMLFile;
40use ReadTextFile; # needed for subroutine textcat_get_language_encoding
41use metadatautil;
42use MetadataRead;
43use util;
44
# Inheritance is set up at compile time. The order of the parent classes
# matters: when several parents provide a method with the same signature,
# the one listed first wins.
BEGIN {
    @OAIPlugin::ISA = qw(MetadataRead ReadXMLFile ReadTextFile);
}
49
# Values accepted by the "metadata_set" option.
my $set_list = [
    { 'name' => 'auto', 'desc' => '{OAIPlugin.metadata_set.auto}' },
    { 'name' => 'dc',   'desc' => '{OAIPlugin.metadata_set.dc}' },
];

# Options this plugin adds on top of those of its parent classes.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name' => 'metadata_set',
        'desc' => '{OAIPlugin.metadata_set}',
        'type' => 'enumstring',
        'reqd' => 'no',
        'list' => $set_list,
        'deft' => 'dc',
    },
    {
        'name' => 'document_field',
        'desc' => '{OAIPlugin.document_field}',
        'type' => 'metadata',
        'reqd' => 'no',
        'deft' => 'gi.Sourcedoc',
    },
];

# Plugin description record; new() appends it to the caller's "OptList".
my $options = {
    'name'     => 'OAIPlugin',
    'desc'     => '{OAIPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
82
83
# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the parent constructors
#   $hashArgOptLists - hash ref; this plugin's argument definitions are
#                      appended to its "ArgList" and its option record to
#                      its "OptList"
#
# Returns the object built by ReadXMLFile, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit Class->new() calls instead of the indirect-object form
    # ("new ReadTextFile(...)"): inside a package that defines its own new(),
    # the indirect form is only parsed as a method call when the class has
    # already been loaded, which makes it fragile.
    #
    # The ReadTextFile result is deliberately discarded.
    # NOTE(review): the trailing 1 presumably tells ReadTextFile it is being
    # called only to register its arguments - confirm against ReadTextFile.
    ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # When only plugin information is wanted there is no need to adjust
    # the option values.
    if (!$self->{'info_only'} && defined $self->{'document_field'}) {
        # trim any ex. from document field iff it's the only metadata
        # namespace prefix
        $self->{'document_field'} =~ s/^ex\.([^.]+)$/$1/;
    }

    return bless $self, $class;
}
103
# Returns the default regular expression (as a string) selecting the files
# this plugin processes: names ending in ".oai", matched case-insensitively.
sub get_default_process_exp {
    my ($self) = @_;

    return '(?i)(\.oai)$';
}
109
# Returns the document type name associated with this plugin.
sub get_doctype {
    my ($self) = @_;

    return 'OAI-PMH';
}
115
# Start-of-document handler: resets the per-file parse state kept on $self
# (raw XML accumulator, "inside <metadata>" flag and extracted metadata).
sub xml_start_document {
    my ($self) = @_;

    $self->{'rawxml'}           = "";
    $self->{'in_metadata_node'} = 0;
    $self->{'saved_metadata'}   = {};
}
122
# End-of-document handler: nothing to do for this plugin.
sub xml_end_document {
    return;
}
125
# Doctype handler: only reports progress for the file currently being parsed.
# The doctype name itself is not validated (a former check for "OAI-PMH" is
# disabled in the original code).
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    my $current_file = $self->{'file'};

    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OAIPlugin: processing $current_file\n";
    }

    # tagged progress line, emitted only when the 'gli' flag is set
    print STDERR "<Processing n='$current_file' p='OAIPlugin'>\n" if $self->{'gli'};
}
138
139
# Start-tag handler: re-serialises the opening tag and appends it to the raw
# XML of the record and, while inside a <metadata> element, to the metadata
# fragment as well.
#
# The element's attributes are read from the global %_.
# NOTE(review): %_ is presumably filled in by the XML parser for the current
# element - confirm against ReadXMLFile. Attribute values are written back
# unquoted (name=value), exactly as the original code did.
sub xml_start_tag {
    my ($self, $expat, $element) = @_;

    my %attr_hash = %_;
    my $attr = join("", map { " $_=$attr_hash{$_}" } keys %attr_hash);
    my $open_tag = "<$element$attr>";

    $self->{'rawxml'} .= $open_tag;

    # a <metadata> element starts a fresh metadata fragment
    if ($element eq "metadata") {
        $self->{'in_metadata_node'} = 1;
        $self->{'metadata_xml'} = "";
    }

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $open_tag;
    }
}
160
# End-tag handler: appends the closing tag to the raw XML (and to the
# metadata fragment while inside <metadata>). When the <metadata> element
# itself closes, the collected fragment is handed to extract_oai_metadata(),
# which stores its results in $self->{'saved_metadata'}.
sub xml_end_tag {
    my ($self, $expat, $element) = @_;

    my $close_tag = "</$element>";

    $self->{'rawxml'} .= $close_tag;
    $self->{'metadata_xml'} .= $close_tag if $self->{'in_metadata_node'};

    if ($element eq "metadata") {
        # the fragment is passed by reference; results accumulate in
        # saved_metadata rather than in $self->{'metadata'}
        $self->extract_oai_metadata(\$self->{'metadata_xml'}, $self->{'saved_metadata'});

        $self->{'in_metadata_node'} = 0;
    }
}
182
# Character-data handler: the text of the current node is taken from $_ and
# appended to the raw XML, and to the metadata fragment while inside
# <metadata>.
sub xml_text {
    my ($self, $expat) = @_;

    my $chunk = $_;

    $self->{'rawxml'} .= $chunk;

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $chunk;
    }
}
193
194
# Parses an .oai file and registers the metadata found in it in the "extra
# metadata" structures supplied by the caller.
#
# If one of the values of the configured document field names a file that
# exists next to the .oai file, the metadata is attached to that source
# document; otherwise it is attached to the .oai file itself and the raw XML
# is kept on $self for read().
#
# Parameters (after $self):
#   $pluginfo, $base_dir, $file, $block_hash - $base_dir/$file locate the
#       file; $pluginfo and $block_hash are not used here
#   $extrametakeys, $extrametadata, $extrametafile - structures updated
#       through the extrametautil helpers
#   $processor, $gli, $aux - only $gli is used (progress output, and passed
#       on to parse_file)
#
# Returns undef if the file is not handled by this plugin or cannot be
# parsed, 1 otherwise.
sub metadata_read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file_for_metadata($filename_full_path);

    print STDERR "\n<Processing n='$file' p='OAIPlugin'>\n" if ($gli);
    print STDERR "OAIPlugin: processing $file\n" if $self->{'verbosity'} > 1;

    if (!$self->parse_file($filename_full_path, $file, $gli)) {
        $self->{'saved_metadata'} = undef;
        return undef;
    }

    my $verbosity = $self->{'verbosity'};
    my $new_metadata = $self->{'saved_metadata'};
    $self->{'saved_metadata'} = undef;

    # add the pretty metadata table as metadata
    # (note: this value is a plain string, not an array reference)
    $new_metadata->{'prettymd'} = $self->{'ppmd_table'};
    $self->{'ppmd_table'} = undef;

    my $document_metadata_field = $self->{'document_field'};
    my $url_array = $new_metadata->{$document_metadata_field};
    if (!defined $url_array) {
        # try ex.
        $url_array = $new_metadata->{"ex.$document_metadata_field"};
    }

    my $srcdoc_exists = 0;
    my $filename_dir = &util::filename_head($filename_full_path);

    # filenames in extrametadata must be relative to current dir, as
    # DirectoryPlugin adds path info on itself
    # (this assumes there will only be one record per oai file - is this
    # always the case??)
    my ($filename_for_metadata) = $file =~ /([^\\\/]+)$/;

    # Look for the first document-field value that is not a remote URL and
    # that names an existing file in the same directory as the .oai file.
    foreach my $candidate (defined $url_array ? @$url_array : ()) {
        next if $candidate =~ m/^(https?|ftp):/;

        my $src_filename = &util::filename_cat($filename_dir, $candidate);
        next unless -e $src_filename;

        $srcdoc_exists = 1;
        # get the slashes the right way, use filename_cat
        $filename_for_metadata = &util::filename_cat($candidate);
        last;
    }

    if ($srcdoc_exists) {
        $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 1;
    }
    else {
        # save the rawxml for the source document
        $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 0;
        $self->{'oai-files'}->{$file}->{'rawxml'} = $self->{'rawxml'};
        $self->{'rawxml'} = "";
    }

    # return all the metadata we have extracted to the caller.
    # Directory plug will pass it back in at read time, so we don't need to
    # extract it again.

    # Extrametadata keys should be regular expressions.
    # Indexing into the extrameta data structures requires the filename's
    # style of slashes to be in URL format. Then need to convert the filename
    # to a regex, no longer to protect windows directory chars \, but for
    # protecting special characters like brackets in the filepath such as
    # "C:\Program Files (x86)\Greenstone".
    $filename_for_metadata = &util::filepath_to_url_format($filename_for_metadata);
    $filename_for_metadata = &util::filename_to_regex($filename_for_metadata);

    # Check that we haven't already got some metadata
    my $file_metadata_table = &extrametautil::getmetadata($extrametadata, $filename_for_metadata);
    if (defined $file_metadata_table) {
        print STDERR "\n**** OAIPlugin: Need to merge new metadata with existing stored metadata: file = $filename_for_metadata\n" if $verbosity > 3;

        foreach my $metaname (keys %{$new_metadata}) {
            my $new_values = $new_metadata->{$metaname};
            next unless defined $new_values;

            # Values are normally array references, but 'prettymd' is a
            # plain string. Blindly dereferencing a string as an array
            # (which "no strict 'refs'" permits) silently loses the value,
            # so normalise both sides before merging.
            my $existing = $file_metadata_table->{$metaname};
            if (defined $existing && ref($existing) ne 'ARRAY') {
                $file_metadata_table->{$metaname} = [ $existing ];
            }

            # will create new entry if one does not already exist
            push(@{$file_metadata_table->{$metaname}},
                 (ref($new_values) eq 'ARRAY') ? @$new_values : $new_values);
        }
    }
    else {
        &extrametautil::setmetadata($extrametadata, $filename_for_metadata, $new_metadata);
        &extrametautil::addmetakey($extrametakeys, $filename_for_metadata);
    }

    if ($srcdoc_exists) {
        my $metafile_entry = &extrametautil::getmetafile($extrametafile, $filename_for_metadata);
        if (!defined $metafile_entry) {
            &extrametautil::setmetafile($extrametafile, $filename_for_metadata, {});
        }
        # maps the file to full path
        &extrametautil::setmetafile_for_named_file($extrametafile, $filename_for_metadata, $file, $filename_full_path);
    }

    return 1;
}
300
301
# Builds a document for an .oai file that metadata_read() found to have no
# accompanying source document. The raw XML saved by metadata_read() becomes
# the document text.
#
# Parameters (after $self):
#   $pluginfo, $base_dir, $file, $block_hash, $metadata, $processor,
#   $maxdocs, $total_count, $gli
#
# Returns:
#   undef - $file was not registered by metadata_read()
#   0     - the metadata was attached to a source document, so there is
#           nothing to build here (and no other plugin should try)
#   -1    - process() failed
#   1     - a document was created and passed to $processor
sub read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $oai_info = $self->{'oai-files'}->{$file};
    if (!defined $oai_info) {
        return undef;
    }

    if ($oai_info->{'srcdoc_exists'}) {
        # do nothing more - all the metadata has been extracted and
        # associated with the srcdoc
        # no more need to access details of this $file => tidy up as you go
        delete $self->{'oai-files'}->{$file};
        return 0; # not processed here, but don't pass on to rest of plugins
    }

    my $filename = $file;
    $filename = &util::filename_cat($base_dir, $file) if $base_dir =~ /\w/;

    # Do encoding stuff on metadata
    my ($language, $encoding) = $self->textcat_get_language_encoding($filename);

    # create a new document
    # Explicit doc->new() instead of the indirect-object form "new doc (...)":
    # because this package defines its own new(), the indirect form is only
    # parsed as a method call if package doc happens to be loaded already.
    my $doc_obj = doc->new($filename, "indexed_doc", $self->{'file_rename_method'});
    my $top_section = $doc_obj->get_top_section;
    my $plugin_type = $self->{'plugin_type'};

    my $plugin_filename_encoding = $self->{'filename_encoding'};
    my $filename_encoding = $self->deduce_filename_encoding($file, $metadata, $plugin_filename_encoding);
    $self->set_Source_metadata($doc_obj, $filename, $filename_encoding);

    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
    $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
    $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    # this will include all the metadata from the oai file that we extracted
    # during metadata_read
    $self->extra_metadata($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    my $text = $oai_info->{'rawxml'};
    delete $self->{'oai-files'}->{$file};

    unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata($doc_obj);

    # add an OID
    $self->add_OID($doc_obj);

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'} ++;

    return 1; # processed the file
}
369
370
# Plugin specific processing of doc_obj: the text referenced by $textref is
# escaped in place (angle and square brackets become entities) and added as
# the text of the document's top section.
#
# Always returns 1.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    if ($gli) {
        print STDERR "<Processing n='$file' p='OAIPlugin'>\n";
    }
    if ($self->{'verbosity'} > 1) {
        print $outhandle "OAIPlugin: processing $file\n";
    }

    my $cursection = $doc_obj->get_top_section();

    # None of the replacement strings contains a character that is itself
    # escaped, so a single pass gives the same result as escaping each
    # character in turn.
    my %entity_for = (
        '<' => '&lt;',
        '>' => '&gt;',
        '[' => '&#91;',
        ']' => '&#93;',
    );
    $$textref =~ s/([<>\[\]])/$entity_for{$1}/g;

    # add text to document object
    $doc_obj->add_utf8_text($cursection, $$textref);

    return 1;
}
397
398
399# Improvement is to merge this with newer version in MetadataPass
400
# Starts a fresh "pretty print" HTML metadata table in $self->{'ppmd_table'},
# discarding whatever was there before.
sub open_prettyprint_metadata_table
{
    my ($self) = @_;

    my $table_attributes = "width=100% cellspacing=2";
    my $table_style      = "style='border-bottom: 4px solid #000080'";

    $self->{'ppmd_table'} = sprintf("\n<table %s %s>", $table_attributes, $table_style);
}
410
# Appends one row (metadata name, metadata value) to the pretty print table.
# The value is first passed through util::hyperlink_text().
sub add_prettyprint_metadata_line
{
    my ($self, $metaname, $metavalue_utf8) = @_;

    my $linked_value = &util::hyperlink_text($metavalue_utf8);

    $self->{'ppmd_table'} .= join("",
        " <tr bgcolor=#b5d3cd>\n",
        " <td width=30%>\n",
        " $metaname\n",
        " </td>\n",
        " <td>\n",
        " $linked_value\n",
        " </td>\n",
        " </tr>\n",
    );
}
428
# Terminates the pretty print table by appending its closing tag.
sub close_prettyprint_metadata_table
{
    my ($self) = @_;

    $self->{'ppmd_table'} .= "</table>\n";
}
435
# Maps a qualified Dublin Core term to the dc element it refines.
# The table is built from a list of refinements per element.
#
# The following terms are deliberately absent because they are now top level
# elements in our qualified dc metadata set: audience, accrualMethod,
# accrualPeriodicity, accrualPolicy, instructionalMethod, provenance,
# rightsHolder.
my $qualified_dc_mapping = {};
{
    my %refinements_of = (
        "dc.title"       => [ qw(alternative) ],
        "dc.description" => [ qw(tableOfContents abstract) ],
        "dc.date"        => [ qw(created valid available issued modified
                                 dateAccepted dateCopyrighted dateSubmitted) ],
        "dc.format"      => [ qw(extent medium) ],
        "dc.relation"    => [ qw(isVersionOf hasVersion isReplacedBy replaces
                                 isRequiredBy requires isPartOf hasPart
                                 isReferencedBy references isFormatOf
                                 hasFormat conformsTo) ],
        "dc.coverage"    => [ qw(spatial temporal) ],
        "dc.audience"    => [ qw(mediator educationLevel) ],
        "dc.rights"      => [ qw(accessRights license) ],
        "dc.identifier"  => [ qw(bibliographicCitation) ],
    );

    foreach my $dc_element (keys %refinements_of) {
        foreach my $term (@{$refinements_of{$dc_element}}) {
            $qualified_dc_mapping->{$term} = $dc_element;
        }
    }
}

# Converts a qualified dc metadata name to the form "dc.element^qualifier",
# e.g. "dc.spatial" becomes "dc.coverage^spatial".
#
# The name is split at its first "."; if the part after it is a known
# qualified term the remapped name is returned, otherwise the name passed in
# is returned unchanged.
sub remap_dc_metadata
{
    my ($self, $metaname) = @_;

    my ($prefix, $name) = ($metaname =~ m/^(.*?)\.(.*?)$/);

    my $dc_element = $qualified_dc_mapping->{$name};

    return defined($dc_element) ? $dc_element."^".$name : $metaname;
}
496
497
# Extracts the metadata elements from the XML fragment of one <metadata>
# node.
#
# Parameters (after $self):
#   $textref  - reference to the fragment; it is decoded from UTF-8 in place
#   $metadata - hash ref; every value found is pushed onto the array stored
#               under "ex.<namespace>.<Name>"
#
# As a side effect the pretty print table in $self->{'ppmd_table'} is rebuilt
# from scratch with one row per value.
sub extract_oai_metadata {
    my ($self, $textref, $metadata) = @_;

    $self->open_prettyprint_metadata_table();

    # need to decode the string, else it will be double-encoded at this point
    # (Devel::Peek's Dump($$textref) is handy for debugging encoding issues)
    $$textref = decode("utf-8", $$textref);

    if ($$textref =~ m/<metadata\s*>(.*?)<\/metadata\s*>/s)
    {
        my $record_body = $1;

        # locate and remove outermost tag (ignoring any attribute
        # information in top-level tag)
        my ($outer_tagname, $remaining_text) = ($record_body =~ m/<([^ >]+).*?>(.*?)<\/\1>/s);

        # split tag into namespace and tag name
        my ($namespace, $top_level_prefix) = ($outer_tagname =~ m/^(.*?):(.*?)$/);

        # Sometimes the dc namespace is not given as a prefix on each element
        # (like <dc:title>) but only through the wrapper element, like
        # <dc><title></title><creator></creator></dc>. So if the wrapper has
        # no prefix, the wrapper tag itself becomes the top_level_prefix.
        $top_level_prefix = $outer_tagname
            if (defined $outer_tagname && !defined $top_level_prefix);

        my $target_set = $self->{'metadata_set'};

        # process each element one by one; each pass consumes the first
        # element and continues with whatever follows it
        while ($remaining_text =~ m/<([^ >]+).*?>(.*?)<\/\1>(.*)/s)
        {
            my ($metaname, $metavalue) = ($1, $2);
            $remaining_text = $3;

            # greenstone uses . for namespace, while oai uses :
            $metaname =~ s/:/\./;

            # if there is no namespace, then we use the outer tag name or
            # namespace for this element
            $metaname = join(".", $top_level_prefix, $metaname)
                unless $metaname =~ m/\./;

            # if metadata set is auto, leave as is, otherwise convert to
            # specified namespace (hack: gi metadata is never overwritten)
            if ($target_set ne "auto" && $metaname !~ /^gi\./) {
                $metaname =~ s/^([^\.]*)\./${target_set}\./;

                # convert qualified dc terms to gs version, e.g.
                # spatial becomes coverage^spatial
                $metaname = $self->remap_dc_metadata($metaname)
                    if $target_set eq "dc";
            }

            # uppercase the first char of the name
            $metaname =~ s/\.(.)/\.\u$1/;
            $metavalue =~ s/\[/&#91;/g;
            $metavalue =~ s/\]/&#93;/g;

            # so that GLI can see this metadata, store here as ex.dc.Title
            # etc: strip any pre-existing ex. prefix, then add exactly one
            (my $ex_metaname = $metaname) =~ s/^ex\.//;
            $ex_metaname = "ex.$ex_metaname";

            # creates the array on first use
            push(@{$metadata->{$ex_metaname}}, $metavalue);

            # but don't add ex to the pretty print line
            $self->add_prettyprint_metadata_line($metaname, $metavalue);
        }
    }

    $self->close_prettyprint_metadata_table();
}
590
## The file extension already tells us the file is ours, so the doctype
## is not checked: every document is accepted.
sub check_doctype {
    return 1;
}
596
5971;
Note: See TracBrowser for help on using the repository browser.