source: gsdl/trunk/perllib/plugins/OAIPlugin.pm@ 17590

Last change on this file since 17590 was 17590, checked in by kjdon, 15 years ago

commit 17320 means that DirectoryPlugin now assumes that filepaths in extrametadata are relative to current directory, not import dir, so need to remove path info for the oai file

  • Property svn:keywords set to Author Date Id Revision
File size: 14.4 KB
Line 
1###########################################################################
2#
3# OAIPlugin.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package OAIPlugin;
28
29use unicode;
30use util;
31
32use strict;
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35use ReadXMLFile;
36use ReadTextFile; # needed for subroutine textcat_get_language_encoding
37use metadatautil;
38
# Set up inheritance at compile time. ReadXMLFile is listed first, so its
# methods take precedence; ReadTextFile is included for
# textcat_get_language_encoding.
BEGIN {
    @OAIPlugin::ISA = qw(ReadXMLFile ReadTextFile);
}
42
43
# Argument descriptions specific to this plugin. new() appends them to the
# shared "ArgList".
#   process_exp    - regular expression selecting the files this plugin handles
#   document_field - metadata field whose values metadata_read() inspects when
#                    looking for an accompanying source document
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name' => 'document_field',
        'desc' => '{OAIPlugin.document_field}',
        'type' => 'metadata',
        'reqd' => 'no',
        'deft' => 'gi.Sourcedoc',
    },
];

# Option block describing the plugin itself. new() appends it to the shared
# "OptList".
my $options = {
    'name'     => 'OAIPlugin',
    'desc'     => '{OAIPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
63
64
# Constructor.
#
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed unchanged to both parent constructors
#   $hashArgOptLists - hash ref; this plugin's argument and option
#                      descriptions are appended to its "ArgList" and
#                      "OptList" arrays
#
# Returns the object built by ReadXMLFile, re-blessed into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit Class->new(...) calls are used instead of the indirect-object
    # form "new Class(...)": this package defines its own new(), which makes
    # the indirect form ambiguous to the parser.
    #
    # The ReadTextFile object is deliberately discarded; only the ReadXMLFile
    # object becomes $self.
    # NOTE(review): the trailing 1 presumably marks this as an auxiliary
    # construction -- confirm against ReadTextFile::new.
    ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);
    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
78
# Default value for the "process_exp" argument: a case-insensitive pattern
# matching names that end in ".oai".
sub get_default_process_exp {
    my ($self) = @_;

    return '(?i)(\.oai)$';
}
84
# Name of the document type this plugin is written for.
sub get_doctype {
    my ($self) = @_;

    return 'OAI-PMH';
}
90
# Start-of-document callback: reset the per-file parsing state.
#   in_metadata_node - flag, true while inside a <metadata> element
#   rawxml           - text of the document as rebuilt by the tag/text callbacks
#   saved_metadata   - hash filled in by extract_oai_metadata()
sub xml_start_document {
    my ($self) = @_;

    @{$self}{'in_metadata_node', 'rawxml'} = (0, "");
    $self->{'saved_metadata'} = {};
}
97
# End-of-document callback: nothing to do for this plugin.
sub xml_end_document {
    return;
}
100
# Doctype callback. Called with ($expat, $name, $sysid, $pubid, $internal)
# after $self; none of those are used. It only reports that the file is
# being processed:
#   - to the plugin's output handle when verbosity is above 1
#   - to STDERR, in tagged form, when the 'gli' flag is set
sub xml_doctype {
    my ($self) = @_;

    ##die "" if ($name !~ /^OAI-PMH$/);

    my $file = $self->{'file'};

    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OAIPlugin: processing $file\n";
    }
    print STDERR "<Processing n='$file' p='OAIPlugin'>\n" if $self->{'gli'};
}
113
114
# Start-tag callback: rebuild the opening tag as text and append it to the
# raw XML and, while inside a <metadata> element, to the metadata buffer.
#
# The element's attributes are read from the global hash %_.
# NOTE(review): %_ is presumably filled in by the XML parser's stream-style
# callbacks before this is called -- confirm in ReadXMLFile.
# Attribute values are written as name=value without quotes, so the rebuilt
# text is not well-formed XML.
sub xml_start_tag {
    my ($self, $expat, $element) = @_;

    # Copy first so the attribute order used below comes from a single hash.
    my %attr_hash = %_;
    my $open_tag = "<$element"
        . join('', map { " $_=$attr_hash{$_}" } keys %attr_hash)
        . ">";

    $self->{'rawxml'} .= $open_tag;

    # Entering <metadata>: start a fresh buffer that includes this tag.
    if ($element eq "metadata") {
        $self->{'in_metadata_node'} = 1;
        $self->{'metadata_xml'} = "";
    }

    $self->{'metadata_xml'} .= $open_tag if $self->{'in_metadata_node'};
}
135
# End-tag callback: append the closing tag to the raw XML and, while inside
# a <metadata> element, to the metadata buffer. When the element being closed
# is <metadata> itself, pass the buffered text to extract_oai_metadata(),
# which stores its results in $self->{'saved_metadata'}.
sub xml_end_tag {
    my ($self, $expat, $element) = @_;

    my $close_tag = "</$element>";

    $self->{'rawxml'} .= $close_tag;
    $self->{'metadata_xml'} .= $close_tag if $self->{'in_metadata_node'};

    if ($element eq "metadata") {
        $self->extract_oai_metadata(\$self->{'metadata_xml'}, $self->{'saved_metadata'});

        # We have left the metadata section.
        $self->{'in_metadata_node'} = 0;
    }
}
157
# Character-data callback. The text is read from the global $_ and appended
# to the raw XML and, while inside a <metadata> element, to the metadata
# buffer.
sub xml_text {
    my ($self, $expat) = @_;

    # Take a named copy of $_ straight away.
    my $chunk = $_;

    $self->{'rawxml'} .= $chunk;
    $self->{'metadata_xml'} .= $chunk if $self->{'in_metadata_node'};
}
168
169
# Metadata pass over one file.
#
# Parses $file, collects the metadata found by the XML callbacks, and
# registers it in $extrametadata / $extrametakeys under a file-name regular
# expression. It also records in $self->{'oai-files'}->{$file} whether one of
# the values of the configured document field names an existing local file:
#   - if so, the metadata is keyed on that file and read() will later skip
#     the OAI file
#   - otherwise the raw XML is kept so that read() can build a document
#     from the OAI file itself
#
# Returns undef if this plugin cannot process the file or parsing fails,
# and 1 otherwise.
sub metadata_read {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $extrametakeys,
        $extrametadata, $processor, $maxdocs, $gli) = @_;

    # can we process this file??
    my ($filename_full_path) = util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file($filename_full_path);

    unless ($self->parse_file($filename_full_path, $file, $gli)) {
        $self->{'saved_metadata'} = undef;
        return undef;
    }

    # Take over what the XML callbacks collected and clear the slots on $self.
    my $new_metadata = $self->{'saved_metadata'};
    $self->{'saved_metadata'} = undef;

    # add the pretty metadata table as metadata
    $new_metadata->{'prettymd'} = $self->{'ppmd_table'};
    $self->{'ppmd_table'} = undef;

    my $url_array = $new_metadata->{ $self->{'document_field'} };
    my @candidates = defined $url_array ? @$url_array : ();
    my $filename_dir = util::filename_head($filename_full_path);

    # filenames in extrametadata must have no path info, as DirectoryPlugin
    # adds path info on itself
    # (this assumes there will only be one record per oai file - is this
    # always the case??)
    my ($filename_for_metadata) = $file =~ /([^\\\/]+)$/;

    # Use the first non-remote value that names a file existing next to the
    # OAI file.
    my $srcdoc_exists = 0;
    foreach my $candidate (@candidates) {
        next if $candidate =~ m/^(https?|ftp):/;

        my $src_filename = util::filename_cat($filename_dir, $candidate);
        if (-e $src_filename) {
            $srcdoc_exists = 1;
            $filename_for_metadata = $candidate;
            last;
        }
    }

    my $file_info = ($self->{'oai-files'}->{$file} ||= {});
    $file_info->{'srcdoc_exists'} = $srcdoc_exists;
    unless ($srcdoc_exists) {
        # save the rawxml for the source document
        $file_info->{'rawxml'} = $self->{'rawxml'};
        $self->{'rawxml'} = "";
    }

    # return all the metadata we have extracted to the caller.
    # Directory plug will pass it back in at read time, so we don't need to
    # extract it again.
    # extrametadata keys should be regular expressions
    my $metadata_key = util::filename_to_regex($filename_for_metadata);
    $extrametadata->{$metadata_key} = $new_metadata;
    push(@$extrametakeys, $metadata_key);

    return 1;
}
238
239
# Import pass over one file.
#
# Only acts on files that metadata_read() registered in $self->{'oai-files'}.
# If an accompanying source document was found there, nothing is built here.
# Otherwise a document is created from the raw XML saved by metadata_read()
# and handed to $processor.
#
# Returns:
#   undef - the file was not registered by metadata_read()
#   0     - a source document exists; not processed here, but the file is
#           not to be passed on to other plugins
#   -1    - process() returned undef
#   1     - a document was built and passed to $processor
sub read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    if (!defined $self->{'oai-files'}->{$file}) {
        return undef;
    }

    my $srcdoc_exists = $self->{'oai-files'}->{$file}->{'srcdoc_exists'};
    if ($srcdoc_exists) {
        # do nothing more - all the metadata has been extracted and associated with the srcdoc
        # no more need to access details of this $file => tidy up as you go
        delete $self->{'oai-files'}->{$file};
        return 0; # not processed here, but don't pass on to rest of plugins
    }

    my $filename = $file;
    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;

    # Do encoding stuff on metadata
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);

    # create a new document
    # (explicit doc->new rather than the indirect-object form "new doc (...)")
    my $doc_obj = doc->new($filename, "indexed_doc");
    my $top_section = $doc_obj->get_top_section;
    my $plugin_type = $self->{'plugin_type'};

    # Source metadata uses the file name without any directory part.
    my ($filemeta) = $file =~ /([^\\\/]+)$/;
    $self->set_Source_metadata($doc_obj, $filemeta, $encoding);
    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
    $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
    $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    # this will include all the metadata from the oai file that we extracted
    # during metadata_read
    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    my $text = $self->{'oai-files'}->{$file}->{'rawxml'};
    delete $self->{'oai-files'}->{$file};

    # $gli is forwarded so that process() can emit its GLI "<Processing ...>"
    # line; process() takes it as its last parameter, but it was never
    # supplied from here before.
    unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # add an OID
    $self->add_OID($doc_obj);

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'} ++;

    return 1; # processed the file
}
304
305
# Plugin specific processing of doc_obj.
#
# Escapes the raw XML in $$textref in place ("<", ">", "[" and "]" become
# "&lt;", "&gt;", "&#91;" and "&#93;") and adds it as the text of the
# document's top section.
#
#   $textref - reference to the raw XML text; modified in place
#   $file    - file name, used only in progress messages
#   $doc_obj - document object that receives the text
#   $gli     - when true, a tagged progress line is written to STDERR
#   ($pluginfo, $base_dir and $metadata are accepted but not used)
#
# Always returns 1.
sub process {
    my $self = shift(@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    print STDERR "<Processing n='$file' p='OAIPlugin'>\n" if ($gli);
    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OAIPlugin: processing $file\n";
    }

    my $cursection = $doc_obj->get_top_section();

##  $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection);

    # add text to document object

    # A single pass is enough: none of the replacement strings contains a
    # character that is itself escaped.
    my %entity_for = (
        '<' => '&lt;',
        '>' => '&gt;',
        '[' => '&#91;',
        ']' => '&#93;',
    );
    $$textref =~ s/([<>\[\]])/$entity_for{$1}/g;

    $doc_obj->add_utf8_text($cursection, $$textref);

    return 1;
}
332
333
334# Improvement is to merge this with newer version in MetadataPass
335
# Start a new pretty-print metadata table in $self->{'ppmd_table'},
# replacing whatever was stored there before.
sub open_prettyprint_metadata_table
{
    my ($self) = @_;

    my @table_attributes = (
        "width=100% cellspacing=2",
        "style='border-bottom: 4px solid #000080'",
    );

    $self->{'ppmd_table'} = "\n<table " . join(" ", @table_attributes) . ">";
}
345
# Append one two-cell row (metadata name, metadata value) to the
# pretty-print table. The value is first passed through
# util::hyperlink_text(); the name is inserted as given.
sub add_prettyprint_metadata_line
{
    my ($self, $metaname, $metavalue_utf8) = @_;

    my $linked_value = util::hyperlink_text($metavalue_utf8);

    $self->{'ppmd_table'} .= join("",
        " <tr bgcolor=#b5d3cd>\n",
        " <td width=30%>\n",
        " $metaname\n",
        " </td>\n",
        " <td>\n",
        " $linked_value\n",
        " </td>\n",
        " </tr>\n",
    );
}
363
# Finish the pretty-print table by appending its closing tag.
sub close_prettyprint_metadata_table
{
    my ($self) = @_;

    my $closing_tag = "</table>\n";
    $self->{'ppmd_table'} .= $closing_tag;
}
370
371
# Map a "dcterms.<term>" metadata name onto the table entry it refines.
#
#   $metaname - metadata name in "prefix.name" form
#
# Returns "<mapped name>^<term>" (e.g. "dc.description^abstract") when the
# prefix is "dcterms" and <term> is a key of the table below. Any other
# name, including one without a dot, is returned unchanged.
#
# NOTE(review): every key below starts with a lower-case letter, and the
# lookup is case-sensitive. extract_oai_metadata() upper-cases the first
# character after the dot before calling this, so names arriving from there
# (e.g. "dcterms.Abstract") never match. Confirm which side is intended
# before changing either.
sub remap_dcterms_metadata
{
    my $self = shift(@_);

    my ($metaname) = @_;

    my $dcterm_mapping = {
        "alternative"           => "dc.title",
        "tableOfContents"       => "dc.description",
        "abstract"              => "dc.description",
        "created"               => "dc.date",
        "valid"                 => "dc.date",
        "available"             => "dc.date",
        "issued"                => "dc.date",
        "modified"              => "dc.date",
        "dateAccepted"          => "dc.date",
        "dateCopyrighted"       => "dc.date",
        "dateSubmitted"         => "dc.date",
        "extent"                => "dc.format",
        "medium"                => "dc.format",
        "isVersionOf"           => "dc.relation",
        "hasVersion"            => "dc.relation",
        "isReplacedBy"          => "dc.relation",
        "replaces"              => "dc.relation",
        "isRequiredBy"          => "dc.relation",
        "requires"              => "dc.relation",
        "isPartOf"              => "dc.relation",
        "hasPart"               => "dc.relation",
        "isReferencedBy"        => "dc.relation",
        "references"            => "dc.relation",
        "isFormatOf"            => "dc.relation",
        "hasFormat"             => "dc.relation",
        "conformsTo"            => "dc.relation",
        "spatial"               => "dc.coverage",
        "temporal"              => "dc.coverage",
        "audience"              => "dc.any",
        "accrualMethod"         => "dc.any",
        "accrualPeriodicity"    => "dc.any",
        "accrualPolicy"         => "dc.any",
        "instructionalMethod"   => "dc.any",
        "provenance"            => "dc.any",
        "rightsHolder"          => "dc.any",
        "mediator"              => "audience",
        "educationLevel"        => "audience",
        "accessRights"          => "dc.rights",
        "license"               => "dc.rights",
        "bibliographicCitation" => "dc.identifier"
    };

    # Split on the first dot. Both captures are undef when there is no dot.
    my ($prefix, $name) = ($metaname =~ m/^(.*?)\.(.*?)$/);

    # The defined() guard keeps an undef $prefix out of the string
    # comparison, which would otherwise warn under -w.
    if (defined $prefix
        && $prefix eq "dcterms"
        && defined $dcterm_mapping->{$name})
    {
        return $dcterm_mapping->{$name}."^".$name;
    }

    return $metaname; # didn't get a match, return param passed in unchanged
}
433
434
# Pull metadata out of the text of one <metadata>...</metadata> section.
#
#   $textref  - reference to the XML text of the section
#   $metadata - hash ref; each metadata name maps to an array ref of values,
#               and new values are appended to it
#
# Only handles DC metadata. Each <prefix:tag>value</prefix:tag> element
# inside the section's wrapper element becomes "prefix.Tag" => value. A tag
# without a prefix takes the wrapper's prefix. "[" and "]" in values are
# replaced by "&#91;" and "&#93;". Every name/value pair is also added to the
# pretty-print table, which is started afresh on each call.
sub extract_oai_metadata {
    my ($self, $textref, $metadata) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->open_prettyprint_metadata_table();

    if (my ($metadata_text) = ($$textref =~ m/<metadata\s*>(.*?)<\/metadata\s*>/s))
    {
        # locate and remove outermost tag (ignoring any attribute information in top-level tag)
        my ($wrapper_metadata_xml, $inner_metadata_text)
            = ($metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/s);

        # split tag into namespace and tag name; only the tag name is used
        my (undef, $top_level_prefix) = ($wrapper_metadata_xml =~ m/^(.*?):(.*?)$/);

        # sometimes, the dc namespace is not specified as the prefix in each element (like <dc:title>)
        # but is rather defined in the wrapper element containing the various dc meta elements,
        # like <dc><title></title><creator></creator></dc>.
        # In such a case, we use this wrapper element as the top_level_prefix
        if (!defined $top_level_prefix
            && defined $wrapper_metadata_xml
            && $wrapper_metadata_xml =~ m/dc$/)
        {
            $top_level_prefix = $wrapper_metadata_xml;
        }

        if ($top_level_prefix !~ m/dc$/) {
            my @warning_lines = (
                "Warning: OAIPlugin currently only designed for Dublin Core (or variant) metadata\n",
                " This recorded metadata section '$top_level_prefix' does not appear to match.\n",
                " Metadata assumed to be in form: <prefix:tag>value</prefix:tag> and will be converted\n",
                " into Greenstone metadata as prefix.tag = value\n",
            );
            foreach my $warning_line (@warning_lines) {
                print $outhandle $warning_line;
            }
        }

        # Take one element at a time off the front of the remaining text.
        while (my ($tag, $metavalue, $remainder)
                   = ($inner_metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>(.*)/s))
        {
            $inner_metadata_text = $remainder;

            # prefix:tag -> prefix.Tag, supplying the wrapper's prefix when
            # the tag has none
            my $metaname = $tag;
            $metaname =~ s/:/\./;
            $metaname = "$top_level_prefix.$metaname" if $metaname !~ m/\./;
            $metaname =~ s/\.(.)/\.\u$1/;

            $metaname = $self->remap_dcterms_metadata($metaname);

            $metavalue =~ s/\[/&#91;/g;
            $metavalue =~ s/\]/&#93;/g;

            # The list for a name seen for the first time is created on demand.
            push(@{ $metadata->{$metaname} }, $metavalue);

            $self->add_prettyprint_metadata_line($metaname, $metavalue);
        }
    }

    $self->close_prettyprint_metadata_table();
}
506
## The file extension already tells us this is an OAI file, so the doctype
## does not need to be checked: always report success.
sub check_doctype {
    return 1;
}
512
5131;
Note: See TracBrowser for help on using the repository browser.