source: gsdl/trunk/perllib/plugins/OAIPlugin.pm@ 19213

Last change on this file since 19213 was 19213, checked in by kjdon, 15 years ago

Added the metadata_set option. If set to auto, metadata is extracted from the record and keeps the namespace it had originally. If set to anything else, all elements are given that namespace. If set to dc, qualified DC terms are additionally mapped to their Greenstone form, e.g. spatial becomes dc.Coverage^spatial.

  • Property svn:keywords set to Author Date Id Revision
File size: 14.9 KB
Line 
1###########################################################################
2#
3# OAIPlugin.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package OAIPlugin;
28
29use unicode;
30use util;
31
32use strict;
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35use ReadXMLFile;
36use ReadTextFile; # needed for subroutine textcat_get_language_encoding
37use metadatautil;
38
# Set up the inheritance chain at compile time. ReadXMLFile comes first;
# ReadTextFile is inherited as well because this plugin calls
# textcat_get_language_encoding (see the 'use ReadTextFile' line above).
sub BEGIN {
    @OAIPlugin::ISA = qw(ReadXMLFile ReadTextFile);
}
42
# Values accepted by the metadata_set option.
my $set_list = [
    { 'name' => "auto", 'desc' => "{OAIPlugin.metadata_set.auto}" },
    { 'name' => "dc",   'desc' => "{OAIPlugin.metadata_set.dc}" },
];

# Arguments contributed by this plugin (appended to "ArgList" in new()).
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
    {
        'name' => "metadata_set",
        'desc' => "{OAIPlugin.metadata_set}",
        'type' => "enumstring",
        'reqd' => "no",
        'list' => $set_list,
        'deft' => "dc",
    },
    {
        'name' => "document_field",
        'desc' => "{OAIPlugin.document_field}",
        'type' => "metadata",
        'reqd' => "no",
        'deft' => "gi.Sourcedoc",
    },
];

# Plugin description record (appended to "OptList" in new()).
my $options = {
    'name'     => "OAIPlugin",
    'desc'     => "{OAIPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'explodes' => "yes",
    'args'     => $arguments,
};
75
76
# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the parent constructors
#   $hashArgOptLists - hash ref; this plugin's arguments are appended to its
#                      "ArgList" entry and its options to its "OptList" entry
#
# Returns the object built by ReadXMLFile, re-blessed into $class.
sub new {
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{ $hashArgOptLists->{"ArgList"} }, @$arguments);
    push(@{ $hashArgOptLists->{"OptList"} }, $options);

    # The ReadTextFile constructor is called only for its effect on the
    # shared lists; its return value is discarded.
    # NOTE(review): the meaning of the trailing 1 is defined by ReadTextFile,
    # which is not visible here - confirm before changing it.
    ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless($self, $class);
}
90
# Returns the default process_exp: a case-insensitive pattern matching
# file names that end in ".oai".
sub get_default_process_exp {
    my $self = shift(@_);

    my $default_exp = '(?i)(\.oai)$';
    return $default_exp;
}
96
# Returns the document type name this plugin reports: "OAI-PMH".
sub get_doctype {
    my ($self) = @_;

    return "OAI-PMH";
}
102
# Resets the per-file parse state: not inside a <metadata> element, no raw
# XML collected yet, and an empty table of extracted metadata.
sub xml_start_document {
    my ($self) = @_;

    $self->{'in_metadata_node'} = 0;
    $self->{'rawxml'}           = "";
    $self->{'saved_metadata'}   = {};
}
109
# Nothing needs to be done at the end of the document.
sub xml_end_document {
    return;
}
112
# Doctype handler. The doctype itself is not validated (the check below is
# deliberately disabled); the handler only reports progress:
#   - to $self->{'outhandle'} when verbosity is greater than 1
#   - to STDERR, in the tagged form, when $self->{'gli'} is true
sub xml_doctype {
    my $self = shift(@_);
    my ($expat, $name, $sysid, $pubid, $internal) = @_;

    ##die "" if ($name !~ /^OAI-PMH$/);

    my $outhandle = $self->{'outhandle'};

    if ($self->{'verbosity'} > 1) {
        print $outhandle "OAIPlugin: processing $self->{'file'}\n";
    }
    if ($self->{'gli'}) {
        print STDERR "<Processing n='$self->{'file'}' p='OAIPlugin'>\n";
    }
}
125
126
# Start-tag handler. Rebuilds the opening tag as text and appends it to the
# raw XML buffer; while inside a <metadata> element (including the
# <metadata> tag itself) it is also appended to the metadata buffer.
#
# The element's attributes are read from the global hash %_.
# NOTE(review): presumably the XML parser fills %_ before calling this
# handler - confirm against ReadXMLFile.
# Attribute values are written back without quotes, so the rebuilt text is
# not necessarily well-formed XML.
sub xml_start_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my %attr_hash = %_;

    my $attr = "";
    foreach my $attr_name (keys %attr_hash) {
        $attr .= " $attr_name=$attr_hash{$attr_name}";
    }

    my $open_tag = "<$element$attr>";
    $self->{'rawxml'} .= $open_tag;

    # entering <metadata> starts a fresh metadata buffer
    if ($element eq "metadata") {
        $self->{'in_metadata_node'} = 1;
        $self->{'metadata_xml'} = "";
    }

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $open_tag;
    }
}
147
# End-tag handler. Appends the closing tag to the raw XML buffer and, while
# inside a <metadata> element, to the metadata buffer. When </metadata> is
# reached, the collected metadata XML is passed to extract_oai_metadata,
# which fills $self->{'saved_metadata'}, and the in-metadata flag is cleared.
sub xml_end_tag {
    my $self = shift(@_);
    my ($expat, $element) = @_;

    my $close_tag = "</$element>";
    $self->{'rawxml'} .= $close_tag;

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $close_tag;
    }

    if ($element eq "metadata") {
        # metadata is accumulated in saved_metadata (not $self->{'metadata'})
        $self->extract_oai_metadata(\$self->{'metadata_xml'}, $self->{'saved_metadata'});
        $self->{'in_metadata_node'} = 0;
    }
}
169
# Text handler. The text is read from the global $_ and appended to the raw
# XML buffer and, while inside a <metadata> element, to the metadata buffer.
# NOTE(review): presumably the XML parser places the character data in $_
# before calling this handler - confirm against ReadXMLFile.
sub xml_text {
    my $self = shift(@_);
    my ($expat) = @_;

    my $text = $_;

    $self->{'rawxml'} .= $text;

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $text;
    }
}
180
181
# Metadata pass: parses an OAI file and hands the extracted metadata back to
# the caller through $extrametakeys / $extrametadata.
#
# Parameters (only those used here are described):
#   $base_dir, $file - location of the OAI file
#   $extrametakeys   - array ref; the regex key for the target file is pushed
#   $extrametadata   - hash ref; maps that key to the extracted metadata
#   $gli             - passed through to parse_file
#
# If one of the values of the 'document_field' metadata is a non-URL path
# that exists relative to the OAI file's directory, the metadata is attached
# to that source document. Otherwise it is attached to the OAI file itself
# and the raw XML is kept in $self->{'oai-files'} for read() to use.
#
# Returns 1 on success, undef if the file is not ours or fails to parse.
# (Explicit 'return undef' is kept so list-context callers see what they
# always have.)
sub metadata_read {
    my $self = shift(@_);

    my ($pluginfo, $base_dir, $file, $block_hash, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;

    # can we process this file??
    my ($filename_full_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file($filename_full_path);

    if (!$self->parse_file($filename_full_path, $file, $gli)) {
        # drop everything collected from the partially parsed file, including
        # the pretty-print table, so it cannot leak into the next file
        $self->{'saved_metadata'} = undef;
        $self->{'ppmd_table'} = undef;
        return undef;
    }

    my $new_metadata = $self->{'saved_metadata'};
    $new_metadata = {} unless defined $new_metadata;
    $self->{'saved_metadata'} = undef;

    # add the pretty metadata table as metadata - but only if one was built.
    # A record without a <metadata> element never builds a table, and
    # storing an undefined value would hand undef metadata to later stages.
    my $ppmd_table = $self->{'ppmd_table'};
    $self->{'ppmd_table'} = undef;
    $new_metadata->{'prettymd'} = $ppmd_table if defined $ppmd_table;

    my $document_metadata_field = $self->{'document_field'};
    my $url_array = $new_metadata->{$document_metadata_field};
    my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0;
    ##print STDERR "$num_urls urls for $file\n";
    my $srcdoc_exists = 0;
    my $filename_dir = &util::filename_head($filename_full_path);

    # filenames in extrametadata must be relative to current dir, as
    # DirectoryPlugin adds path info on itself
    # this assumes there will only be one record per oai file - is this always the case??
    my ($filename_for_metadata) = $file =~ /([^\\\/]+)$/;

    # look for the first local (non http/https/ftp) document that exists
    for (my $i = 0; $i < $num_urls; $i++) {
        next if $url_array->[$i] =~ m/^(https?|ftp):/;

        my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]);
        if (-e $src_filename) {
            $srcdoc_exists = 1;
            $filename_for_metadata = $url_array->[$i];
            last;
        }
    }

    if ($srcdoc_exists) {
        $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 1;
    }
    else {
        # save the rawxml for the source document
        $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 0;
        $self->{'oai-files'}->{$file}->{'rawxml'} = $self->{'rawxml'};
        $self->{'rawxml'} = "";
    }

    # return all the metadata we have extracted to the caller.
    # Directory plug will pass it back in at read time, so we don't need to extract it again.
    # extrametadata keys should be regular expressions
    $filename_for_metadata = &util::filename_to_regex($filename_for_metadata);
    $extrametadata->{$filename_for_metadata} = $new_metadata;
    push(@$extrametakeys, $filename_for_metadata);

    return 1;
}
250
251
# Import pass for an OAI file that was seen earlier by metadata_read.
#
# Returns:
#   undef - the file was not recorded by metadata_read (not ours)
#   0     - its metadata was attached to an existing source document, so
#           there is nothing to build here, but other plugins should not
#           process the file either
#   -1    - process() failed
#   1     - a document was built from the saved raw XML and handed to
#           $processor
sub read {
    my $self = shift(@_);

    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $oai_info = $self->{'oai-files'}->{$file};
    if (!defined $oai_info) {
        return undef;
    }

    if ($oai_info->{'srcdoc_exists'}) {
        # do nothing more - all the metadata has been extracted and associated with the srcdoc
        # no more need to access details of this $file => tidy up as you go
        delete $self->{'oai-files'}->{$file};
        return 0; # not processed here, but don't pass on to rest of plugins
    }

    my $filename = ($base_dir =~ /\w/)
        ? &util::filename_cat($base_dir, $file)
        : $file;

    # Do encoding stuff on metadata
    my ($language, $encoding) = $self->textcat_get_language_encoding($filename);

    # create a new document
    my $doc_obj = doc->new($filename, "indexed_doc", $self->{'file_rename_method'});
    my $top_section = $doc_obj->get_top_section;
    my $plugin_type = $self->{'plugin_type'};

    # standard metadata describing the file itself
    my ($filemeta) = $file =~ /([^\\\/]+)$/;
    $self->set_Source_metadata($doc_obj, $filemeta, $encoding);
    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
    $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
    $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    # this will include all the metadata from the oai file that we extracted
    # during metadata_read
    $self->extra_metadata($doc_obj, $doc_obj->get_top_section(), $metadata);

    # the raw XML saved by metadata_read becomes the document text;
    # the bookkeeping entry is no longer needed after this point
    my $text = $oai_info->{'rawxml'};
    delete $self->{'oai-files'}->{$file};

    # do plugin specific processing of doc_obj
    my $process_status = $self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj);
    if (!defined $process_status) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata($doc_obj);

    # add an OID
    $self->add_OID($doc_obj);

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'}++;

    return 1; # processed the file
}
316
317
# Plugin specific processing of doc_obj: the text referenced by $textref is
# escaped in place ('<', '>', '[' and ']' become '&lt;', '&gt;', '&#91;' and
# '&#93;') and added as the text of the document's top section.
#
# Progress is reported to STDERR (tagged form) when $gli is true, and to
# $self->{'outhandle'} when verbosity is greater than 1.
#
# Always returns 1.
sub process {
    my $self = shift(@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    if ($gli) {
        print STDERR "<Processing n='$file' p='OAIPlugin'>\n";
    }
    if ($self->{'verbosity'} > 1) {
        print $outhandle "OAIPlugin: processing $file\n";
    }

    my $cursection = $doc_obj->get_top_section();

    # One pass is enough: none of the replacement strings contains a
    # character that is itself escaped.
    my %escaped_form = (
        '<' => '&lt;',
        '>' => '&gt;',
        '[' => '&#91;',
        ']' => '&#93;',
    );
    $$textref =~ s/([<>\[\]])/$escaped_form{$1}/g;

    # add text to document object
    $doc_obj->add_utf8_text($cursection, $$textref);

    return 1;
}
344
345
# TODO: merge the pretty-print table helpers below with the newer version in MetadataPass
347
# Starts a new pretty-print metadata table in $self->{'ppmd_table'},
# replacing whatever was there before.
sub open_prettyprint_metadata_table
{
    my ($self) = @_;

    my @table_attributes = (
        'width=100% cellspacing=2',
        "style='border-bottom: 4px solid #000080'",
    );

    $self->{'ppmd_table'} = "\n<table " . join(" ", @table_attributes) . ">";
}
357
# Appends one two-cell row (metadata name, metadata value) to the
# pretty-print table. The value is first passed through
# util::hyperlink_text; the name is used as given.
sub add_prettyprint_metadata_line
{
    my $self = shift(@_);
    my ($metaname, $metavalue_utf8) = @_;

    my $linked_value = &util::hyperlink_text($metavalue_utf8);

    $self->{'ppmd_table'} .= join("",
        " <tr bgcolor=#b5d3cd>\n",
        " <td width=30%>\n",
        " $metaname\n",
        " </td>\n",
        " <td>\n",
        " $linked_value\n",
        " </td>\n",
        " </tr>\n",
    );
}
375
# Appends the closing </table> tag to the pretty-print table.
sub close_prettyprint_metadata_table
{
    my ($self) = @_;

    $self->{'ppmd_table'} .= "</table>\n";
}
382
# Maps each qualified Dublin Core term to the dc element it refines.
# The table is written per element and flattened into term => element form.
#
# These are now top level elements in our qualified dc metadata set and are
# therefore deliberately not mapped: audience, accrualMethod,
# accrualPeriodicity, accrualPolicy, instructionalMethod, provenance,
# rightsHolder.
my $qualified_dc_mapping = {};
{
    my %refinements_of = (
        "dc.title"       => [qw(alternative)],
        "dc.description" => [qw(tableOfContents abstract)],
        "dc.date"        => [qw(created valid available issued modified
                                dateAccepted dateCopyrighted dateSubmitted)],
        "dc.format"      => [qw(extent medium)],
        "dc.relation"    => [qw(isVersionOf hasVersion isReplacedBy replaces
                                isRequiredBy requires isPartOf hasPart
                                isReferencedBy references isFormatOf hasFormat
                                conformsTo)],
        "dc.coverage"    => [qw(spatial temporal)],
        "dc.audience"    => [qw(mediator educationLevel)],
        "dc.rights"      => [qw(accessRights license)],
        "dc.identifier"  => [qw(bibliographicCitation)],
    );

    foreach my $dc_element (keys %refinements_of) {
        foreach my $term (@{ $refinements_of{$dc_element} }) {
            $qualified_dc_mapping->{$term} = $dc_element;
        }
    }
}

# Converts a qualified dc term to its Greenstone form.
#
# $metaname is expected in "prefix.name" form. If the name part is a known
# qualified term, "<dc element>^<name>" is returned (for example
# "dc.spatial" gives "dc.coverage^spatial"); the original prefix is not
# used. Any other name is returned unchanged.
sub remap_dc_metadata
{
    my $self = shift(@_);
    my ($metaname) = @_;

    my ($prefix, $name) = ($metaname =~ m/^(.*?)\.(.*?)$/);

    my $dc_element = $qualified_dc_mapping->{$name};
    if (!defined $dc_element) {
        return $metaname; # didn't get a match, return param passed in unchanged
    }

    return $dc_element . "^" . $name;
}
443
444
# Extracts metadata from the text of a <metadata>...</metadata> element.
#
# Parameters:
#   $textref  - reference to the collected metadata XML
#   $metadata - hash ref; each value found is pushed onto the array stored
#               under its (converted) metadata name
#
# Every value is also added as a row of the pretty-print table, which is
# opened at the start of this call and closed at the end.
#
# Name conversion, in order:
#   - the first ':' becomes '.' (greenstone uses . for namespace, oai uses :)
#   - a name without a namespace gets the wrapper element's local name
#   - unless metadata_set is "auto" the namespace is replaced by
#     metadata_set, and for "dc" qualified terms are remapped
#   - the first character after the first '.' is uppercased
# '[' and ']' in values are replaced by '&#91;' and '&#93;'.
sub extract_oai_metadata {
    my $self = shift(@_);
    my ($textref, $metadata) = @_;

    $self->open_prettyprint_metadata_table();

    if ($$textref =~ m/<metadata\s*>(.*?)<\/metadata\s*>/s) {
        my $metadata_text = $1;

        # locate and remove outermost tag (ignoring any attribute information in top-level tag)
        my ($outer_tagname, $inner_metadata_text)
            = ($metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/s);

        # split tag into namespace and tag name
        my ($namespace, $top_level_prefix) = ($outer_tagname =~ m/^(.*?):(.*?)$/);

        # sometimes, the dc namespace is not specified as the prefix in each
        # element (like <dc:title>) but is rather defined in the wrapper
        # element containing the various dc meta elements, like
        # <dc><title></title><creator></creator></dc>.
        # If the wrapper had no prefix, the tag itself becomes the
        # top_level_prefix.
        if (defined $outer_tagname && !defined $top_level_prefix) {
            $top_level_prefix = $outer_tagname;
        }

        my $metadata_set = $self->{'metadata_set'};

        # process each element one by one; the match also captures the text
        # following the element, which is what the next iteration works on
        while (my ($metaname, $metavalue, $remaining_text)
                   = ($inner_metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>(.*)/s))
        {
            $inner_metadata_text = $remaining_text;

            # greenstone uses . for namespace, while oai uses :
            $metaname =~ s/:/\./;

            # if there is no namespace, then we use the outer tag name or
            # namespace for this element
            $metaname = "$top_level_prefix.$metaname" if $metaname !~ m/\./;

            # if metadata set is auto, leave as is, otherwise convert to
            # specified namespace
            if ($metadata_set ne "auto") {
                $metaname =~ s/^([^\.]*)\./$metadata_set\./;

                # convert qualified dc terms to gs version, e.g.
                # spatial becomes coverage^spatial
                $metaname = $self->remap_dc_metadata($metaname) if $metadata_set eq "dc";
            }

            # uppercase the first char of the name
            $metaname =~ s/\.(.)/\.\u$1/;
            $metavalue =~ s/\[/&#91;/g;
            $metavalue =~ s/\]/&#93;/g;

            # the array for a new name is created on first use
            push(@{ $metadata->{$metaname} }, $metavalue);

            $self->add_prettyprint_metadata_line($metaname, $metavalue);
        }
    }

    $self->close_prettyprint_metadata_table();
}
520
# The file extension already tells us this is an OAI file, so the doctype
# is not checked: every doctype is accepted.
sub check_doctype {
    my $doctype_is_acceptable = 1;
    return $doctype_is_acceptable;
}
526
5271;
Note: See TracBrowser for help on using the repository browser.