source: main/trunk/greenstone2/perllib/plugins/OAIPlugin.pm@ 24547

Last change on this file since 24547 was 24547, checked in by ak19, 13 years ago

Added new abstract plugin MetadataRead that defines can_process_this_file_for_metadata that MetadataPlugin subclasses can inherit (if MetadataRead is listed first in the ISA inheritance list) and which will then override the one defined in BasePlugin. For now committing MARC, ISIS and OAIPlugins which now additionally inherit from MetadataRead. Other metadataPlugins also need to be committed.

  • Property svn:keywords set to Author Date Id Revision
File size: 16.3 KB
Line 
1###########################################################################
2#
3# OAIPlug.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package OAIPlugin;
28
29use unicode;
30use util;
31
32use strict;
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35use ReadXMLFile;
36use ReadTextFile; # needed for subroutine textcat_get_language_encoding
37use metadatautil;
38use MetadataRead;
39
# Inheritance is set up at compile time.  When the same method exists in
# more than one parent, the parent listed first in @ISA wins.
BEGIN {
    @OAIPlugin::ISA = qw(MetadataRead ReadXMLFile ReadTextFile);
}
44
# Values accepted by the "metadata_set" option below.
my $set_list = [
    {   'name' => "auto",
        'desc' => "{OAIPlugin.metadata_set.auto}",
    },
    {   'name' => "dc",
        'desc' => "{OAIPlugin.metadata_set.dc}",
    },
];

# Plugin-specific arguments; new() appends these to the shared "ArgList".
my $arguments = [
    {   'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => &get_default_process_exp(),
    },
    {   'name' => "metadata_set",
        'desc' => "{OAIPlugin.metadata_set}",
        'type' => "enumstring",
        'reqd' => "no",
        'list' => $set_list,
        'deft' => "dc",
    },
    {   'name' => "document_field",
        'desc' => "{OAIPlugin.document_field}",
        'type' => "metadata",
        'reqd' => "no",
        'deft' => "gi.Sourcedoc",
    },
];

# Plugin description record; new() appends it to the shared "OptList".
my $options = {
    'name'     => "OAIPlugin",
    'desc'     => "{OAIPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'explodes' => "yes",
    'args'     => $arguments,
};
77
78
# Constructor.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to the parent constructors
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object created by ReadXMLFile's constructor, re-blessed into
# $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit Class->new(...) calls: the indirect-object form
    # ("new ReadTextFile(...)") depends on parser heuristics, which is
    # fragile in a package that itself defines a sub called "new".
    # The ReadTextFile object is discarded; only the constructor's side
    # effects are wanted.
    # NOTE(review): the meaning of the trailing 1 is defined by
    # ReadTextFile::new, which is not visible here - confirm.
    ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # When only plugin information is requested, the options are left
    # untouched.
    unless ($self->{'info_only'}) {
        # trim any ex. from document field iff it's the only metadata
        # namespace prefix
        $self->{'document_field'} =~ s/^ex\.([^.]+)$/$1/;
    }

    return bless $self, $class;
}
98
# Returns the default "process_exp" regular expression (as a string):
# a case-insensitive match on file names ending in ".oai".
# No instance state is consulted, so this may be called as a plain function.
sub get_default_process_exp {
    return '(?i)(\.oai)$';
}
104
# Returns the document type name this plugin reports: "OAI-PMH".
sub get_doctype {
    my ($self) = @_;

    return 'OAI-PMH';
}
110
# Start-of-document handler: resets the per-file parse state (the
# accumulated raw XML, the "inside <metadata>" flag and the table of
# metadata collected so far).
sub xml_start_document {
    my ($self) = @_;

    $self->{'rawxml'}           = "";
    $self->{'in_metadata_node'} = 0;
    $self->{'saved_metadata'}   = {};
}
117
# End-of-document handler: no end-of-document work is performed.
sub xml_end_document {
    return;
}
120
# DOCTYPE handler: only reports progress.  The doctype name itself is not
# validated (the check below is disabled).
#
# Writes a progress line to the plugin's output handle when verbosity is
# above 1, and a <Processing> line to STDERR when the 'gli' flag is set.
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    ##die "" if ($name !~ /^OAI-PMH$/);

    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OAIPlugin: processing $self->{'file'}\n";
    }

    print STDERR "<Processing n='$self->{'file'}' p='OAIPlugin'>\n" if $self->{'gli'};
}
133
134
# Start-tag handler: re-serialises the opening tag and appends it to the
# raw XML buffer and, while inside a <metadata> element, to the metadata
# buffer as well.
#
# Parameters:
#   $expat   - parser object (unused)
#   $element - name of the element being opened
#
# The element's attributes are read from the global hash %_.
# NOTE(review): presumably ReadXMLFile drives XML::Parser in "Stream"
# style, which is what populates %_ - confirm.
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my %attr_hash = %_;

    # Build the attribute string with join rather than a void-context map,
    # and in sorted key order so that the stored text is the same on every
    # run (hash order is randomised).  Values are emitted unquoted, as
    # before.
    my $attr = join("", map { " $_=$attr_hash{$_}" } sort keys %attr_hash);
    my $open_tag = "<$element$attr>";

    $self->{'rawxml'} .= $open_tag;

    # Entering <metadata> starts a fresh metadata buffer.
    if ($element eq "metadata") {
        $self->{'in_metadata_node'} = 1;
        $self->{'metadata_xml'} = "";
    }

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $open_tag;
    }
}
155
# End-tag handler: appends the closing tag to the raw XML buffer (and to
# the metadata buffer while inside <metadata>).  When the element being
# closed is <metadata> itself, the buffered metadata XML is handed to
# extract_oai_metadata(), which fills $self->{'saved_metadata'}, and the
# "inside <metadata>" flag is cleared.
sub xml_end_tag {
    my ($self, $expat, $element) = @_;

    my $close_tag = "</$element>";

    $self->{'rawxml'} .= $close_tag;
    $self->{'metadata_xml'} .= $close_tag if $self->{'in_metadata_node'};

    if ($element eq "metadata") {
        # results go into saved_metadata rather than $self->{'metadata'}
        $self->extract_oai_metadata(\$self->{'metadata_xml'},
                                    $self->{'saved_metadata'});

        $self->{'in_metadata_node'} = 0;
    }
}
177
# Character-data handler: the text is taken from $_ and appended to the raw
# XML buffer, and to the metadata buffer while inside <metadata>.
sub xml_text {
    my ($self, $expat) = @_;

    # Copy $_ straight away so nothing below can clobber it.
    my $text = $_;

    $self->{'rawxml'} .= $text;

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $text;
    }
}
188
189
# Metadata pass: parses an OAI file and publishes the metadata found in it
# through the caller-supplied "extra metadata" structures.
#
# Parameters (those not listed are accepted but unused here):
#   $base_dir, $file - location of the OAI file
#   $extrametakeys   - array ref; the filename regex key is pushed onto it
#   $extrametadata   - hash ref; regex key => extracted metadata
#   $extrametafile   - hash ref; when a source document exists on disk,
#                      regex key => { $file => full path of the OAI file }
#   $gli             - forwarded to parse_file()
#
# Returns undef when the file is not handled by this plugin or cannot be
# parsed, and 1 otherwise.
#
# Side effect: records under $self->{'oai-files'}->{$file} whether a source
# document exists; when none exists the raw XML is stored there too, for
# read() to use later.
sub metadata_read {
    my ($self, $pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $gli, $aux) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    if (!$self->can_process_this_file_for_metadata($filename_full_path)) {
        return undef;
    }

    unless ($self->parse_file($filename_full_path, $file, $gli)) {
        $self->{'saved_metadata'} = undef;
        return undef;
    }

    # Take ownership of what the parse handlers collected.
    my $new_metadata = $self->{'saved_metadata'};
    $self->{'saved_metadata'} = undef;

    # add the pretty metadata table as metadata
    $new_metadata->{'prettymd'} = $self->{'ppmd_table'};
    $self->{'ppmd_table'} = undef;

    # Candidate source-document references: look under the configured
    # field name first, then under the same name with an "ex." prefix.
    my $document_metadata_field = $self->{'document_field'};
    my $url_array = $new_metadata->{$document_metadata_field};
    unless (defined $url_array) {
        $url_array = $new_metadata->{"ex.$document_metadata_field"};
    }
    my @candidates = (defined $url_array) ? @$url_array : ();

    my $filename_dir = &util::filename_head($filename_full_path);

    # filenames in extrametadata must be relative to current dir, as
    # DirectoryPlugin adds path info on itself
    # (this assumes there will only be one record per oai file - is this
    # always the case??)
    my ($filename_for_metadata) = $file =~ /([^\\\/]+)$/;

    # The first non-remote reference that exists next to the OAI file is
    # taken as the source document.
    my $srcdoc_exists = 0;
    foreach my $candidate (@candidates) {
        next if $candidate =~ m/^(https?|ftp):/;

        my $src_filename = &util::filename_cat($filename_dir, $candidate);
        next unless -e $src_filename;

        $srcdoc_exists = 1;
        # get the slashes the right way, use filename_cat
        $filename_for_metadata = &util::filename_cat($candidate);
        last;
    }

    my $file_record = ($self->{'oai-files'}->{$file} ||= {});
    $file_record->{'srcdoc_exists'} = $srcdoc_exists;
    if (!$srcdoc_exists) {
        # save the rawxml for the source document
        $file_record->{'rawxml'} = $self->{'rawxml'};
        $self->{'rawxml'} = "";
    }

    # return all the metadata we have extracted to the caller.
    # Directory plug will pass it back in at read time, so we don't need to
    # extract it again.
    # extrametadata keys should be regular expressions
    $filename_for_metadata = &util::filename_to_regex($filename_for_metadata);
    $extrametadata->{$filename_for_metadata} = $new_metadata;
    push(@$extrametakeys, $filename_for_metadata);

    if ($srcdoc_exists) {
        $extrametafile->{$filename_for_metadata} = {}
            if !defined $extrametafile->{$filename_for_metadata};
        # maps the file to full path
        $extrametafile->{$filename_for_metadata}->{$file} = $filename_full_path;
    }

    return 1;
}
270
271
# Document pass: builds a document from an OAI file that metadata_read()
# has already parsed.
#
# Return values:
#   undef - metadata_read() recorded nothing for $file
#   0     - a source document exists on disk; the metadata has already been
#           attached to it, so no document is created here
#   -1    - process() failed
#   1     - a document was created and handed to $processor
sub read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    if (!defined $self->{'oai-files'}->{$file}) {
        return undef;
    }

    my $srcdoc_exists = $self->{'oai-files'}->{$file}->{'srcdoc_exists'};
    if ($srcdoc_exists) {
        # do nothing more - all the metadata has been extracted and
        # associated with the srcdoc
        # no more need to access details of this $file => tidy up as you go
        delete $self->{'oai-files'}->{$file};
        return 0; # not processed here, but don't pass on to rest of plugins
    }

    my $filename = $file;
    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;

    # Do encoding stuff on metadata
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);

    # create a new document
    # (explicit Class->new instead of the indirect-object form "new doc")
    my $doc_obj = doc->new($filename, "indexed_doc", $self->{'file_rename_method'});
    my $top_section = $doc_obj->get_top_section;
    my $plugin_type = $self->{'plugin_type'};

    my $plugin_filename_encoding = $self->{'filename_encoding'};
    my $filename_encoding = $self->deduce_filename_encoding($file,$metadata,$plugin_filename_encoding);
    $self->set_Source_metadata($doc_obj, $filename, $filename_encoding);

    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
    $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
    $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    # this will include all the metadata from the oai file that we extracted
    # during metadata_read
    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    my $text = $self->{'oai-files'}->{$file}->{'rawxml'};
    delete $self->{'oai-files'}->{$file};

    # $gli is forwarded: process() takes it as its last parameter and uses
    # it to decide whether to emit its <Processing> line, which could never
    # happen while the argument was left out of this call.
    unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # add an OID
    $self->add_OID($doc_obj);

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'} ++;

    return 1; # processed the file
}
339
340
341# do plugin specific processing of doc_obj
# Plugin-specific processing of doc_obj: escapes the raw XML in $$textref
# (modifying it in place) and adds it as the text of the document's top
# section.
#
# "<", ">", "[" and "]" are replaced by "&lt;", "&gt;", "&#91;" and "&#93;".
# Progress is reported on STDERR when $gli is true and on the plugin's
# output handle when verbosity is above 1.  Always returns 1.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    if ($gli) {
        print STDERR "<Processing n='$file' p='OAIPlugin'>\n";
    }
    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OAIPlugin: processing $file\n";
    }

    my $cursection = $doc_obj->get_top_section();

##    $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection);

    # One pass over the text; none of the replacement strings contains a
    # character that is itself escaped, so this gives the same result as
    # four consecutive substitutions.
    my %entity_for = (
        '<' => '&lt;',
        '>' => '&gt;',
        '[' => '&#91;',
        ']' => '&#93;',
    );
    $$textref =~ s/([<>\[\]])/$entity_for{$1}/g;

    # add text to document object
    $doc_obj->add_utf8_text($cursection, $$textref);

    return 1;
}
367
368
369# Improvement is to merge this with newer version in MetadataPass
370
# Starts a new pretty-print metadata table: $self->{'ppmd_table'} is
# replaced by the opening <table> tag.
sub open_prettyprint_metadata_table
{
    my ($self) = @_;

    my @table_attributes = (
        "width=100% cellspacing=2",
        "style='border-bottom: 4px solid #000080'",
    );

    $self->{'ppmd_table'} = "\n<table " . join(" ", @table_attributes) . ">";
}
380
# Appends one table row to $self->{'ppmd_table'}: the metadata name in the
# first cell and the value, after being passed through
# util::hyperlink_text(), in the second.
sub add_prettyprint_metadata_line
{
    my ($self, $metaname, $metavalue_utf8) = @_;

    my $display_value = &util::hyperlink_text($metavalue_utf8);

    $self->{'ppmd_table'} .= join("",
        " <tr bgcolor=#b5d3cd>\n",
        " <td width=30%>\n",
        " $metaname\n",
        " </td>\n",
        " <td>\n",
        " $display_value\n",
        " </td>\n",
        " </tr>\n",
    );
}
398
# Finishes the pretty-print metadata table by appending the closing
# </table> tag to $self->{'ppmd_table'}.
sub close_prettyprint_metadata_table
{
    my ($self) = @_;

    my $closing_markup = "</table>\n";
    $self->{'ppmd_table'} .= $closing_markup;
}
405
# Qualified Dublin Core term => the dc element it is stored under.
# Used by remap_dc_metadata() below.
my $qualified_dc_mapping = {
    "alternative" => "dc.title",
    "tableOfContents" => "dc.description",
    "abstract" => "dc.description",
    "created" => "dc.date",
    "valid" => "dc.date",
    "available" => "dc.date",
    "issued" => "dc.date",
    "modified" => "dc.date",
    "dateAccepted" => "dc.date",
    "dateCopyrighted" => "dc.date",
    "dateSubmitted" => "dc.date",
    "extent" => "dc.format",
    "medium" => "dc.format",
    "isVersionOf" => "dc.relation",
    "hasVersion" => "dc.relation",
    "isReplacedBy" => "dc.relation",
    "replaces" => "dc.relation",
    "isRequiredBy" => "dc.relation",
    "requires" => "dc.relation",
    "isPartOf" => "dc.relation",
    "hasPart" => "dc.relation",
    "isReferencedBy" => "dc.relation",
    "references" => "dc.relation",
    "isFormatOf" => "dc.relation",
    "hasFormat" => "dc.relation",
    "conformsTo" => "dc.relation",
    "spatial" => "dc.coverage",
    "temporal" => "dc.coverage",
# these are now top level elements in our qualified dc metadata set
#    "audience" => "dc.any",
#    "accrualMethod" => "dc.any",
#    "accrualPeriodicity" => "dc.any",
#    "accrualPolicy" => "dc.any",
#    "instructionalMethod" => "dc.any",
#    "provenance" => "dc.any",
#    "rightsHolder" => "dc.any",
    "mediator" => "dc.audience",
    "educationLevel" => "dc.audience",
    "accessRights" => "dc.rights",
    "license" => "dc.rights",
    "bibliographicCitation" => "dc.identifier"
};

# Converts a qualified Dublin Core name to the form "element^qualifier".
#
# Parameter:
#   $metaname - a name of the form "prefix.term", e.g. "dc.spatial"
#
# Returns "dc.coverage^spatial"-style names for terms listed in
# $qualified_dc_mapping.  Any other input, including a name with no "."
# or an undefined value, is returned unchanged.
sub remap_dc_metadata
{
    my $self = shift(@_);

    my ($metaname) = @_;

    return $metaname unless defined $metaname;

    # Only the part after the first "." is looked up; the prefix is
    # irrelevant.  $name stays undefined when there is no "." at all, so
    # it must be checked before being used as a hash key.
    my (undef, $name) = ($metaname =~ m/^(.*?)\.(.*?)$/);

    if (defined $name && defined $qualified_dc_mapping->{$name}) {
        return $qualified_dc_mapping->{$name} . "^" . $name;
    }

    return $metaname; # didn't get a match, return param passed in unchanged
}
466
467
# Extracts metadata from the text of a <metadata>...</metadata> element.
#
# Parameters:
#   $textref  - reference to the XML text to scan
#   $metadata - hash ref to fill; each value found is pushed onto the array
#               stored under "ex.<namespace>.<Name>"
#
# Every value found is also added, without the "ex." prefix, as a row of
# the pretty-print table in $self->{'ppmd_table'}.  The table is opened and
# closed here even when nothing is found.
sub extract_oai_metadata {
    my $self = shift (@_);
    my ($textref, $metadata) = @_;

    $self->open_prettyprint_metadata_table();

    if ($$textref =~ m/<metadata\s*>(.*?)<\/metadata\s*>/s)
    {
        my $metadata_text = $1;

        # locate and remove outermost tag (ignoring any attribute
        # information in top-level tag)
        my ($outer_tagname, $inner_metadata_text) = ($metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/s);

        # Nothing to extract when there is no wrapper element.
        if (defined $outer_tagname) {

            # split tag into namespace and tag name
            # sometimes, the dc namespace is not specified as the prefix in
            # each element (like <dc:title>) but is rather defined in the
            # wrapper element containing the various dc meta elements, like
            # <dc><title></title><creator></creator></dc>.
            # In such a case, we use this wrapper element as the
            # top_level_prefix
            my (undef, $top_level_prefix) = ($outer_tagname =~ m/^(.*?):(.*?)$/);

            # if there was no prefix, then the tag itself becomes the
            # top_level_prefix
            $top_level_prefix = $outer_tagname if !defined $top_level_prefix;

            my $metadata_set = $self->{'metadata_set'};

            # process each element one by one.  The /g scan resumes where
            # the previous element ended, which avoids re-copying the rest
            # of the text on every iteration.
            while ($inner_metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/sg)
            {
                my $metaname  = $1;
                my $metavalue = $2;

                # greenstone uses . for namespace, while oai uses :
                $metaname =~ s/:/\./;
                # if there is no namespace, then we use the outer tag name
                # or namespace for this element
                if ($metaname !~ m/\./)
                {
                    $metaname = "$top_level_prefix.$metaname";
                }

                # if metadata set is auto, leave as is, otherwise convert to
                # specified namespace
                if ($metadata_set ne "auto") {
                    if ($metaname !~ /^gi\./) { # hack to not overwrite gi metadata
                        $metaname =~ s/^([^\.]*)\./${metadata_set}\./;
                        if ($metadata_set eq "dc") {
                            # convert qualified dc terms to gs version, e.g.
                            # spatial becomes coverage^spatial
                            $metaname = $self->remap_dc_metadata($metaname);
                        }
                    }
                }

                # uppercase the first char of the name
                $metaname =~ s/\.(.)/\.\u$1/;
                $metavalue =~ s/\[/&#91;/g;
                $metavalue =~ s/\]/&#93;/g;

                # so that GLI can see this metadata, store here as
                # ex.dc.Title etc
                my $ex_metaname = $metaname;
                $ex_metaname =~ s/^ex\.//;        # remove any pre-existing ex. prefix
                $ex_metaname = "ex.$ex_metaname"; # at last can prefix ex.

                # push creates the array on first use
                push(@{$metadata->{$ex_metaname}}, $metavalue);

                # but don't add ex to the pretty print line
                $self->add_prettyprint_metadata_line($metaname, $metavalue);
            }
        }
    }

    $self->close_prettyprint_metadata_table();
}
551
## The file type is already known from the file extension, so the doctype
## is not inspected: every document is accepted.
sub check_doctype {
    my ($self) = @_;

    return 1;
}
557
5581;
Note: See TracBrowser for help on using the repository browser.