source: main/trunk/greenstone2/perllib/plugins/OAIPlugin.pm@ 22597

Last change on this file since 22597 was 22316, checked in by kjdon, 14 years ago

Store extracted namespaced metadata as ex.<namespace>.<name> (e.g. ex.dc.Title) so that it shows up in GLI.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.8 KB
Line 
1###########################################################################
2#
3# OAIPlug.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package OAIPlugin;
28
29use unicode;
30use util;
31
32use strict;
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35use ReadXMLFile;
36use ReadTextFile; # needed for subroutine textcat_get_language_encoding
37use metadatautil;
38
# Establish the inheritance chain at compile time. ReadTextFile is included
# for its textcat_get_language_encoding subroutine.
BEGIN {
    @OAIPlugin::ISA = qw(ReadXMLFile ReadTextFile);
}
42
# Values accepted by the "metadata_set" option.
my $set_list = [
    { 'name' => 'auto', 'desc' => '{OAIPlugin.metadata_set.auto}' },
    { 'name' => 'dc',   'desc' => '{OAIPlugin.metadata_set.dc}' },
];

# Option descriptions contributed by this plugin; new() appends them to
# the shared "ArgList".
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name' => 'metadata_set',
        'desc' => '{OAIPlugin.metadata_set}',
        'type' => 'enumstring',
        'reqd' => 'no',
        'list' => $set_list,
        'deft' => 'dc',
    },
    {
        'name' => 'document_field',
        'desc' => '{OAIPlugin.document_field}',
        'type' => 'metadata',
        'reqd' => 'no',
        'deft' => 'gi.Sourcedoc',
    },
];

# Plugin description; new() appends it to the shared "OptList".
my $options = {
    'name'     => 'OAIPlugin',
    'desc'     => '{OAIPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
75
76
# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the parent constructors
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument and option descriptions
#
# Returns the new object blessed into $class.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    # register this plugin together with its arguments and options
    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    # The ReadTextFile constructor's return value is deliberately discarded;
    # the object itself is built by ReadXMLFile.
    ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # trim ex. from document field (if there); when only plugin information
    # is wanted the options are left untouched
    unless ($self->{'info_only'}) {
        $self->{'document_field'} =~ s/^ex\.//;
    }

    return bless $self, $class;
}
96
# Returns the default regular expression (as a string) for the
# "process_exp" option: case-insensitive match on a trailing ".oai".
sub get_default_process_exp {
    return '(?i)(\.oai)$';
}
102
# Returns the document type name associated with this plugin.
sub get_doctype {
    my ($self) = @_;
    return 'OAI-PMH';
}
108
# Start-of-document handler: resets the per-document parsing state
# (raw XML buffer, "inside <metadata>" flag, and the collected metadata).
sub xml_start_document {
    my ($self) = @_;

    $self->{'rawxml'}           = "";
    $self->{'in_metadata_node'} = 0;
    $self->{'saved_metadata'}   = {};
}
115
# End-of-document handler: intentionally does nothing.
sub xml_end_document {
    return;
}
118
# Doctype handler: only reports progress, the doctype name is not checked.
#
# Writes "OAIPlugin: processing <file>" to the plugin's output handle when
# verbosity is greater than 1, and a <Processing> line to STDERR when the
# 'gli' flag is set on the object.
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    ##die "" if ($name !~ /^OAI-PMH$/);

    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OAIPlugin: processing $self->{'file'}\n";
    }

    if ($self->{'gli'}) {
        print STDERR "<Processing n='$self->{'file'}' p='OAIPlugin'>\n";
    }
}
131
132
# Start-tag handler: re-serialises the opening tag and appends it to the
# raw XML buffer and, while inside a <metadata> element, to the metadata
# buffer as well. A <metadata> start tag begins a fresh metadata buffer.
#
# The element's attributes are read from the global %_ hash
# (presumably filled in by the XML parser before this handler runs --
# confirm against ReadXMLFile). Attribute values are written unquoted.
sub xml_start_tag {
    my ($self, $expat, $element) = @_;

    my %attr_hash = %_;
    my $attr      = join("", map { " $_=$attr_hash{$_}" } keys %attr_hash);
    my $open_tag  = "<$element$attr>";

    $self->{'rawxml'} .= $open_tag;

    if ($element eq "metadata") {
        $self->{'in_metadata_node'} = 1;
        $self->{'metadata_xml'}     = "";
    }

    $self->{'metadata_xml'} .= $open_tag if $self->{'in_metadata_node'};
}
153
# End-tag handler: appends the closing tag to the raw XML buffer and, while
# inside a <metadata> element, to the metadata buffer. When </metadata>
# itself is reached, the buffered metadata XML is handed to
# extract_oai_metadata (which fills $self->{'saved_metadata'}) and the
# "inside <metadata>" flag is cleared.
sub xml_end_tag {
    my ($self, $expat, $element) = @_;

    my $close_tag = "</$element>";

    $self->{'rawxml'} .= $close_tag;
    $self->{'metadata_xml'} .= $close_tag if $self->{'in_metadata_node'};

    if ($element eq "metadata") {
        # results are collected in 'saved_metadata' (not 'metadata')
        $self->extract_oai_metadata(\$self->{'metadata_xml'},
                                    $self->{'saved_metadata'});
        $self->{'in_metadata_node'} = 0;
    }
}
175
# Text handler: appends the current text (taken from the global $_) to the
# raw XML buffer and, while inside a <metadata> element, to the metadata
# buffer.
sub xml_text {
    my ($self, $expat) = @_;

    my $text = $_;

    $self->{'rawxml'} .= $text;
    $self->{'metadata_xml'} .= $text if $self->{'in_metadata_node'};
}
186
187
# Metadata pass over an OAI file.
#
# Parses the file, collects the metadata extracted during parsing (plus the
# pretty-printed table, stored under 'prettymd') and hands it back to the
# caller through $extrametakeys / $extrametadata / $extrametafile.
#
# If one of the values of the configured document field (with or without
# the "ex." prefix) is not an http/https/ftp URL and names a file that
# exists relative to the OAI file's directory, the metadata is keyed on
# that source document. Otherwise it is keyed on the OAI file's own name
# and the raw XML is kept in $self->{'oai-files'} for read() to use.
#
# Returns undef when the file is not handled by this plugin or cannot be
# parsed, 1 otherwise.
sub metadata_read {
    my $self = shift;
    my ($pluginfo, $base_dir, $file, $block_hash,
        $extrametakeys, $extrametadata, $extrametafile,
        $processor, $maxdocs, $gli) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path)
        = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file($filename_full_path);

    unless ($self->parse_file($filename_full_path, $file, $gli)) {
        $self->{'saved_metadata'} = undef;
        return undef;
    }

    # take ownership of what the XML handlers collected
    my $new_metadata = $self->{'saved_metadata'};
    $self->{'saved_metadata'} = undef;

    # add the pretty metadata table as metadata
    $new_metadata->{'prettymd'} = $self->{'ppmd_table'};
    $self->{'ppmd_table'} = undef;

    # values of the document field, trying the ex. variant as a fallback
    my $document_metadata_field = $self->{'document_field'};
    my $url_array = $new_metadata->{$document_metadata_field};
    $url_array = $new_metadata->{"ex.$document_metadata_field"}
        unless defined $url_array;
    ##print STDERR "urls for $file\n";

    my $filename_dir = &util::filename_head($filename_full_path);

    # filenames in extrametadata must be relative to current dir, as
    # DirectoryPlugin adds path info on itself
    # (this assumes there will only be one record per oai file - is this
    # always the case??)
    my ($filename_for_metadata) = $file =~ /([^\\\/]+)$/;

    # look for the first non-URL value that names an existing local file
    my $srcdoc_exists = 0;
    if (defined $url_array) {
        foreach my $url (@$url_array) {
            next if $url =~ m/^(https?|ftp):/;
            next unless -e &util::filename_cat($filename_dir, $url);

            $srcdoc_exists = 1;
            # get the slashes the right way, use filename_cat
            $filename_for_metadata = &util::filename_cat($url);
            last;
        }
    }

    # remember the outcome for read(); without a source document the raw
    # XML is saved so that it can become the document text
    my $oai_file_info = ($self->{'oai-files'}->{$file} ||= {});
    $oai_file_info->{'srcdoc_exists'} = $srcdoc_exists;
    unless ($srcdoc_exists) {
        $oai_file_info->{'rawxml'} = $self->{'rawxml'};
        $self->{'rawxml'} = "";
    }

    # return all the metadata we have extracted to the caller.
    # Directory plug will pass it back in at read time, so we don't need to
    # extract it again.
    # extrametadata keys should be regular expressions
    $filename_for_metadata = &util::filename_to_regex($filename_for_metadata);
    $extrametadata->{$filename_for_metadata} = $new_metadata;
    push @{$extrametakeys}, $filename_for_metadata;

    if ($srcdoc_exists) {
        $extrametafile->{$filename_for_metadata} = {}
            unless defined $extrametafile->{$filename_for_metadata};
        # maps the file to full path
        $extrametafile->{$filename_for_metadata}->{$file} = $filename_full_path;
    }

    return 1;
}
268
269
# Read pass over an OAI file that metadata_read() has already seen.
#
# Returns:
#   undef - metadata_read() recorded nothing for $file
#   0     - the metadata was attached to an existing source document, so
#           nothing is built here (and the file is not passed on)
#   -1    - process() failed
#   1     - a document was built from the saved raw XML and handed to
#           $processor
sub read {
    my $self = shift;
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata,
        $processor, $maxdocs, $total_count, $gli) = @_;

    my $file_info = $self->{'oai-files'}->{$file};
    return undef if !defined $file_info;

    if ($file_info->{'srcdoc_exists'}) {
        # do nothing more - all the metadata has been extracted and
        # associated with the srcdoc
        # no more need to access details of this $file => tidy up as you go
        delete $self->{'oai-files'}->{$file};
        return 0; # not processed here, but don't pass on to rest of plugins
    }

    my $filename = ($base_dir =~ /\w/)
        ? &util::filename_cat($base_dir, $file)
        : $file;

    # Do encoding stuff on metadata
    my ($language, $encoding) = $self->textcat_get_language_encoding($filename);

    # create a new document
    my $doc_obj     = doc->new($filename, "indexed_doc", $self->{'file_rename_method'});
    my $top_section = $doc_obj->get_top_section;
    my $plugin_type = $self->{'plugin_type'};

    my ($filemeta) = $file =~ /([^\\\/]+)$/;
    $self->set_Source_metadata($doc_obj, $filemeta, $encoding);

    foreach my $pair (["Language", $language],
                      ["Encoding", $encoding],
                      ["Plugin",   $plugin_type]) {
        $doc_obj->add_utf8_metadata($top_section, $pair->[0], $pair->[1]);
    }
    $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
    $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    # this will include all the metadata from the oai file that we extracted
    # during metadata_read
    $self->extra_metadata($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj, using the raw XML saved by
    # metadata_read as the document text
    my $text = $self->{'oai-files'}->{$file}->{'rawxml'};
    delete $self->{'oai-files'}->{$file};

    if (!defined $self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj)) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata($doc_obj);

    # add an OID
    $self->add_OID($doc_obj);

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'}++;

    return 1; # processed the file
}
334
335
# do plugin specific processing of doc_obj
#
# Escapes the characters < > [ ] in the text referenced by $textref (the
# referenced string is modified in place) and adds the result as the text
# of the document's top section. Progress is reported on STDERR when $gli
# is true and on the plugin's output handle when verbosity is above 1.
#
# Always returns 1.
sub process {
    my $self = shift;
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    if ($gli) {
        print STDERR "<Processing n='$file' p='OAIPlugin'>\n";
    }
    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OAIPlugin: processing $file\n";
    }

    my $cursection = $doc_obj->get_top_section();

    # None of the replacement strings contains a character that is itself
    # escaped, so one pass gives the same result as four separate ones.
    my %escaped = (
        '<' => '&lt;',
        '>' => '&gt;',
        '[' => '&#91;',
        ']' => '&#93;',
    );
    $$textref =~ s/([<>\[\]])/$escaped{$1}/g;

    # add text to document object
    $doc_obj->add_utf8_text($cursection, $$textref);

    return 1;
}
362
363
# Improvement is to merge this with newer version in MetadataPass

# Starts a fresh pretty-print metadata table: $self->{'ppmd_table'} is
# overwritten with the opening <table> tag.
sub open_prettyprint_metadata_table
{
    my ($self) = @_;

    my @table_attrs = (
        "width=100%",
        "cellspacing=2",
        "style='border-bottom: 4px solid #000080'",
    );

    $self->{'ppmd_table'} = "\n<table " . join(" ", @table_attrs) . ">";
}
375
# Appends one row (metadata name, metadata value) to the pretty-print
# table in $self->{'ppmd_table'}. The value is first passed through
# util::hyperlink_text.
sub add_prettyprint_metadata_line
{
    my ($self, $metaname, $metavalue_utf8) = @_;

    $metavalue_utf8 = &util::hyperlink_text($metavalue_utf8);

    $self->{'ppmd_table'} .= join("",
        " <tr bgcolor=#b5d3cd>\n",
        " <td width=30%>\n",
        " $metaname\n",
        " </td>\n",
        " <td>\n",
        " $metavalue_utf8\n",
        " </td>\n",
        " </tr>\n",
    );
}
393
# Finishes the pretty-print table by appending the closing </table> tag
# to $self->{'ppmd_table'}.
sub close_prettyprint_metadata_table
{
    my ($self) = @_;

    $self->{'ppmd_table'} .= "</table>\n";
}
400
# Lookup table from a qualified Dublin Core term to the dc element it is
# stored under, e.g. "spatial" => "dc.coverage". It is built from one list
# of qualifiers per element.
#
# The following are now top level elements in our qualified dc metadata
# set and are therefore not remapped: audience, accrualMethod,
# accrualPeriodicity, accrualPolicy, instructionalMethod, provenance,
# rightsHolder.
my $qualified_dc_mapping = {};
{
    my @qualifier_groups = (
        [ "dc.title"       => qw(alternative) ],
        [ "dc.description" => qw(tableOfContents abstract) ],
        [ "dc.date"        => qw(created valid available issued modified
                                 dateAccepted dateCopyrighted dateSubmitted) ],
        [ "dc.format"      => qw(extent medium) ],
        [ "dc.relation"    => qw(isVersionOf hasVersion isReplacedBy replaces
                                 isRequiredBy requires isPartOf hasPart
                                 isReferencedBy references isFormatOf
                                 hasFormat conformsTo) ],
        [ "dc.coverage"    => qw(spatial temporal) ],
        [ "dc.audience"    => qw(mediator educationLevel) ],
        [ "dc.rights"      => qw(accessRights license) ],
        [ "dc.identifier"  => qw(bibliographicCitation) ],
    );

    foreach my $group (@qualifier_groups) {
        my ($element, @qualifiers) = @{$group};
        foreach my $qualifier (@qualifiers) {
            $qualified_dc_mapping->{$qualifier} = $element;
        }
    }
}

# Converts a qualified dc metadata name to its Greenstone form.
#
# $metaname is expected to look like "prefix.name". When the part after the
# first "." is a known qualifier, "<element>^<name>" is returned (e.g.
# "dc.spatial" becomes "dc.coverage^spatial"); otherwise $metaname is
# returned unchanged.
sub remap_dc_metadata
{
    my ($self, $metaname) = @_;

    # only the part after the first "." is looked up
    my (undef, $name) = ($metaname =~ m/^(.*?)\.(.*?)$/);

    my $element = $qualified_dc_mapping->{$name};
    return $element . "^" . $name if defined $element;

    return $metaname; # didn't get a match, return param passed in unchanged
}
461
462
# Extracts the metadata elements found inside a <metadata>...</metadata>
# fragment.
#
# Parameters:
#   $textref  - reference to the text of the <metadata> element, in the
#               form rebuilt by the xml_* handlers
#   $metadata - hash ref that receives the values; each key "ex.<name>"
#               maps to an array ref of values
#
# Every extracted name/value pair is also added (without the "ex." prefix)
# to the pretty-print table, which is opened and closed here even when
# nothing is extracted.
#
# The <metadata> start tag may carry attributes (for example namespace
# declarations); they are ignored.
sub extract_oai_metadata {
    my $self = shift (@_);
    my ($textref, $metadata) = @_;

    $self->open_prettyprint_metadata_table();

    if ($$textref =~ m/<metadata(?:\s[^>]*)?>(.*?)<\/metadata\s*>/s)
    {
        my $metadata_text = $1;

        # locate and remove outermost tag (ignoring any attribute
        # information in top-level tag)
        my ($outer_tagname, $inner_metadata_text)
            = ($metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/s);

        # nothing to extract unless a wrapper element was found
        if (defined $outer_tagname)
        {
            # split tag into namespace and tag name; only the tag name is
            # needed
            my (undef, $top_level_prefix) = ($outer_tagname =~ m/^(.*?):(.*?)$/);

            # sometimes, the dc namespace is not specified as the prefix in
            # each element (like <dc:title>) but is rather defined in the
            # wrapper element containing the various dc meta elements, like
            # <dc><title></title><creator></creator></dc>.
            # if there was no prefix, then the tag itself becomes the
            # top_level_prefix
            $top_level_prefix = $outer_tagname unless defined $top_level_prefix;

            # process each element one by one
            while ($inner_metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>(.*)/s)
            {
                my ($metaname, $metavalue) = ($1, $2);
                $inner_metadata_text = $3;

                # greenstone uses . for namespace, while oai uses :
                $metaname =~ s/:/\./;

                # if there is no namespace, then we use the outer tag name
                # or namespace for this element
                if ($metaname !~ m/\./)
                {
                    $metaname = "$top_level_prefix.$metaname";
                }

                # if metadata set is auto, leave as is, otherwise convert to
                # specified namespace
                if ($self->{'metadata_set'} ne "auto") {
                    if ($metaname !~ /^gi\./) { # hack to not overwrite gi metadata
                        $metaname =~ s/^([^\.]*)\./$self->{'metadata_set'}\./;
                        if ($self->{'metadata_set'} eq "dc") {
                            # convert qualified dc terms to gs version, e.g.
                            # spatial becomes coverage^spatial
                            $metaname = $self->remap_dc_metadata($metaname);
                        }
                    }
                }

                # uppercase the first char of the name
                $metaname =~ s/\.(.)/\.\u$1/;
                $metavalue =~ s/\[/&#91;/g;
                $metavalue =~ s/\]/&#93;/g;

                # so that GLI can see this metadata, store here as
                # ex.dc.Title etc; repeated names accumulate their values
                my $ex_metaname = "ex.$metaname";
                push @{ $metadata->{$ex_metaname} }, $metavalue;

                # but don't add ex to the pretty print line
                $self->add_prettyprint_metadata_line($metaname, $metavalue);
            }
        }
    }

    $self->close_prettyprint_metadata_table();
}
544
## The file extension already tells us this is an OAI file, so the doctype
## is not inspected: every document is accepted.
sub check_doctype {
    return 1;
}
550
5511;
Note: See TracBrowser for help on using the repository browser.