source: gsdl/trunk/perllib/plugins/OAIPlugin.pm@ 17197

Last change on this file since 17197 was 17197, checked in by kjdon, 16 years ago

previous metadata changes meant that there was no longer URL metadata (used to find the source doc). Now it uses dc.Identifier

  • Property svn:keywords set to Author Date Id Revision
File size: 13.1 KB
RevLine 
[4726]1###########################################################################
2#
3# OAIPlug.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
[15872]27package OAIPlugin;
[4726]28
29use unicode;
30use util;
31
[10254]32use strict;
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
[15872]35use ReadXMLFile;
[17066]36use ReadTextFile; # needed for subroutine textcat_get_language_encoding
[9958]37
[4726]38sub BEGIN {
[17066]39 @OAIPlugin::ISA = ('ReadXMLFile', 'ReadTextFile');
[4726]40}
41
[9958]42
[6408]43my $arguments =
44 [ { 'name' => "process_exp",
[16013]45 'desc' => "{BasePlugin.process_exp}",
[6408]46 'type' => "regexp",
47 'reqd' => "no",
[17126]48 'deft' => &get_default_process_exp() }
[6408]49 ];
50
[15872]51my $options = { 'name' => "OAIPlugin",
52 'desc' => "{OAIPlugin.desc}",
[6408]53 'abstract' => "no",
54 'inherits' => "yes",
[17103]55 'explodes' => "yes",
[6408]56 'args' => $arguments };
[4747]57
[10254]58
[4726]59sub new {
[10218]60 my ($class) = shift (@_);
61 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
62 push(@$pluginlist, $class);
[4873]63
[15872]64 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
65 push(@{$hashArgOptLists->{"OptList"}},$options);
[4726]66
[17126]67 new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists,1);
[15872]68 my $self = new ReadXMLFile($pluginlist, $inputargs, $hashArgOptLists);
[4726]69
70 return bless $self, $class;
71}
72
73sub get_default_process_exp {
74 my $self = shift (@_);
75
76 return q^(?i)(\.oai)$^;
77}
78
[13222]79sub get_doctype {
80 my $self = shift(@_);
81
82 return "OAI-PMH";
83}
84
[9958]85sub xml_start_document {
[10254]86 my $self = shift (@_);
[9958]87 $self->{'in_metadata_node'} = 0;
88 $self->{'rawxml'} = "";
89}
[4726]90
[9958]91sub xml_end_document {
92}
[4726]93
[9958]94sub xml_doctype {
95 my $self = shift(@_);
96
97 my ($expat, $name, $sysid, $pubid, $internal) = @_;
98
[13886]99 ##die "" if ($name !~ /^OAI-PMH$/);
[9958]100
[4726]101 my $outhandle = $self->{'outhandle'};
[15872]102 print $outhandle "OAIPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
103 print STDERR "<Processing n='$self->{'file'}' p='OAIPlugin'>\n" if $self->{'gli'};
[4726]104
[9958]105}
[4726]106
[9958]107
108sub xml_start_tag {
109 my $self = shift(@_);
110 my ($expat,$element) = @_;
111
112 my %attr_hash = %_;
113
114 my $attr = "";
115 map { $attr .= " $_=$attr_hash{$_}"; } keys %attr_hash;
116
117 $self->{'rawxml'} .= "<$element$attr>";
118
119 if ($element eq "metadata") {
120 $self->{'in_metadata_node'} = 1;
121 $self->{'metadata_xml'} = "";
[4726]122 }
[9958]123
124 if ($self->{'in_metadata_node'}) {
125 $self->{'metadata_xml'} .= "<$element$attr>";
[4726]126 }
[9958]127}
[4726]128
[9958]129sub xml_end_tag {
130 my $self = shift(@_);
131 my ($expat, $element) = @_;
[4726]132
[9958]133 $self->{'rawxml'} .= "</$element>";
[4726]134
[9958]135 if ($self->{'in_metadata_node'}) {
136 $self->{'metadata_xml'} .= "</$element>";
[4726]137 }
138
[9958]139 if ($element eq "metadata") {
140 my $textref = \$self->{'metadata_xml'};
141 my $metadata = $self->{'metadata'};
142 $self->extract_oai_metadata($textref,$metadata);
[4726]143
[9958]144 $self->{'in_metadata_node'} = 0;
145 }
[4726]146
147
[9958]148}
[4726]149
[9958]150sub xml_text {
151 my $self = shift(@_);
152 my ($expat) = @_;
[8684]153
[9958]154 $self->{'rawxml'} .= $_;
[4726]155
[9958]156 if ($self->{'in_metadata_node'}) {
157 $self->{'metadata_xml'} .= $_;
[4726]158 }
[9958]159}
[4726]160
[8121]161
[4726]162
[5919]163
[9958]164sub read {
165 my $self = shift (@_);
166
[16392]167 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[4726]168
[9958]169 my $outhandle = $self->{'outhandle'};
[4726]170
[9958]171 my $filename = $file;
172 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
[4726]173
[17197]174 # block the srcdocs dir - we will process files in them when we find an OAI record for them
[9958]175 return 0 if ((-d $filename) && ($filename =~ m/srcdocs$/));
176 if ($self->SUPER::read(@_)) {
177 # Do encoding stuff
178 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
179
[17197]180 my $url_array = $metadata->{'dc.Identifier'};
[10254]181 my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0;
[9958]182
183 my $srcdoc_exists = 0;
184 my $srcdoc_pos = 0;
185 my $filename_dir = &util::filename_head($filename);
186
187 for (my $i=0; $i<$num_urls; $i++) {
188 if ($url_array->[$i] !~ m/^(http|ftp):/) {
189
190 my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]);
191 if (-e $src_filename) {
192 $srcdoc_pos = $i;
193 $srcdoc_exists = 1;
194 }
195 }
196 }
197
198 if ($srcdoc_exists)
199 {
[15872]200 print $outhandle "OAIPlugin: passing metadata on to $url_array->[0]\n"
[9958]201 if ($self->{'verbosity'}>1);
202
203
204 # Make pretty print metadata table stick with src filename
205 my $ppmd_table = $self->{'ppmd_table'};
206 $metadata->{'prettymd'} = [ $ppmd_table ];
207 $self->{'ppmd_table'} = undef;
208
209 return &plugin::read ($pluginfo, $filename_dir, $url_array->[0],
[16392]210 $block_hash, $metadata, $processor, $maxdocs,
211 $total_count, $gli);
[9958]212 }
213 else
214 {
215 # create a new document
216 my $doc_obj = new doc ($filename, "indexed_doc");
217 my $top_section = $doc_obj->get_top_section;
218 my $plugin_type = $self->{'plugin_type'};
219
220 $doc_obj->add_utf8_metadata($top_section, "Language", $language);
221 $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
222 $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
223 $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
224 $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));
225
226 # include any metadata passed in from previous plugins
227 # note that this metadata is associated with the top level section
228 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
229
230 # do plugin specific processing of doc_obj
231 my $textref = \$self->{'rawxml'};
232 unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
233 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
234 return -1;
235 }
236
237 # do any automatic metadata extraction
238 $self->auto_extract_metadata ($doc_obj);
239
240 # add an OID
[17026]241 $self->add_OID($doc_obj);
[9958]242
243 my $ppmd_table = $self->{'ppmd_table'};
244 $doc_obj->add_utf8_metadata($top_section,"prettymd",$ppmd_table);
245 $self->{'ppmd_table'} = undef;
246
247 # process the document
248 $processor->process($doc_obj);
249
250 $self->{'num_processed'} ++;
251
252 return 1; # processed the file
253 }
[4726]254 }
[9958]255 else {
256 return undef;
257 }
[4726]258}
259
260
261# do plugin specific processing of doc_obj
262sub process {
263 my $self = shift (@_);
[6332]264 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[4726]265 my $outhandle = $self->{'outhandle'};
266
[15872]267 print STDERR "<Processing n='$file' p='OAIPlugin'>\n" if ($gli);
268 print $outhandle "OAIPlugin: processing $file\n"
[4726]269 if $self->{'verbosity'} > 1;
270
271 my $cursection = $doc_obj->get_top_section();
272
273## $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection);
274
275 # add text to document object
276
277# $$textref =~ s/<(.*?)>/$1 /g;
278 $$textref =~ s/</&lt;/g;
279 $$textref =~ s/>/&gt;/g;
[14963]280 $$textref =~ s/\[/&#91;/g;
281 $$textref =~ s/\]/&#93;/g;
[4726]282
283## print STDERR "*** adding text: $$textref\n";
284
285 $doc_obj->add_utf8_text($cursection, $$textref);
286
287 return 1;
288}
289
290
[9958]291# Improvement is to merge this with newer version in MetadataPass
[4726]292
[9958]293sub open_prettyprint_metadata_table
294{
295 my $self = shift(@_);
296
297 my $att = "width=100% cellspacing=2";
298 my $style = "style=\'border-bottom: 4px solid #000080\'";
299
300 $self->{'ppmd_table'} = "\n<table $att $style>";
301}
302
303sub add_prettyprint_metadata_line
304{
305 my $self = shift(@_);
306 my ($metaname, $metavalue_utf8) = @_;
307
[14949]308### $metavalue_utf8 =~ s/hdl\.handle\.net/mcgonagall.cs.waikato.ac.nz:8080\/dspace\/handle/;
[9958]309 $metavalue_utf8 = &util::hyperlink_text($metavalue_utf8);
310
311 $self->{'ppmd_table'} .= " <tr bgcolor=#b5d3cd>\n";
312 $self->{'ppmd_table'} .= " <td width=30%>\n";
313 $self->{'ppmd_table'} .= " $metaname\n";
314 $self->{'ppmd_table'} .= " </td>\n";
315 $self->{'ppmd_table'} .= " <td>\n";
316 $self->{'ppmd_table'} .= " $metavalue_utf8\n";
317 $self->{'ppmd_table'} .= " </td>\n";
318 $self->{'ppmd_table'} .= " </tr>\n";
319
320}
321
322sub close_prettyprint_metadata_table
323{
324 my $self = shift(@_);
325
326 $self->{'ppmd_table'} .= "</table>\n";
327}
328
329
[14940]330sub remap_dcterms_metadata
331{
332 my $self = shift(@_);
[9958]333
[14940]334 my ($metaname) = @_;
[9958]335
[14940]336 my $dcterm_mapping = {
337 "alternative" => "dc.title",
338 "tableOfContents" => "dc.description",
339 "abstract" => "dc.description",
340 "created" => "dc.date",
341 "valid" => "dc.date",
342 "available" => "dc.date",
343 "issued" => "dc.date",
344 "modified" => "dc.date",
345 "dateAccepted" => "dc.date",
346 "dateCopyrighted" => "dc.date",
347 "dateSubmitted" => "dc.date",
348 "extent" => "dc.format",
349 "medium" => "dc.format",
350 "isVersionOf" => "dc.relation",
351 "hasVersion" => "dc.relation",
352 "isReplacedBy" => "dc.relation",
353 "replaces" => "dc.relation",
354 "isRequiredBy" => "dc.relation",
355 "requires" => "dc.relation",
356 "isPartOf" => "dc.relation",
357 "hasPart" => "dc.relation",
358 "isReferencedBy" => "dc.relation",
359 "references" => "dc.relation",
360 "isFormatOf" => "dc.relation",
361 "hasFormat" => "dc.relation",
362 "conformsTo" => "dc.relation",
363 "spatial" => "dc.coverage",
364 "temporal" => "dc.coverage",
365 "audience" => "dc.any",
366 "accrualMethod" => "dc.any",
367 "accrualPeriodicity" => "dc.any",
368 "accrualPolicy" => "dc.any",
369 "instructionalMethod" => "dc.any",
370 "provenance" => "dc.any",
371 "rightsHolder" => "dc.any",
372 "mediator" => "audience",
373 "educationLevel" => "audience",
374 "accessRights" => "dc.rights",
375 "license" => "dc.rights",
376 "bibliographicCitation" => "dc.identifier"
377 };
378
379 my ($prefix,$name) = ($metaname =~ m/^(.*?)\.(.*?)$/);
380
381 if ($prefix eq "dcterms")
382 {
383 if (defined $dcterm_mapping->{$name})
384 {
385 return $dcterm_mapping->{$name}."^".$name;
386 }
387
388 }
389 return $metaname; # didn't get a match, return param passed in unchanged
390}
391
392
[4726]393sub extract_oai_metadata {
394 my $self = shift (@_);
395 my ($textref, $metadata) = @_;
396 my $outhandle = $self->{'outhandle'};
397
[9958]398 # Only handles DC metadata
399
400 $self->open_prettyprint_metadata_table();
401
402 if ($$textref =~ m/<metadata\s*>(.*?)<\/metadata\s*>/s)
[4726]403 {
[10254]404 my $metadata_text = $1;
[4726]405
[14940]406 # locate and remove outermost tag (ignoring any attribute information in top-level tag)
407 my ($wrapper_metadata_xml,$inner_metadata_text) = ($metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/s);
408
409 # split tag into namespace and tag name
410 my($namespace,$top_level_prefix) = ($wrapper_metadata_xml =~ m/^(.*?):(.*?)$/);
411
[17066]412 # sometimes, the dc namespace is not specified as the prefix in each element (like <dc:title>)
413 # but is rather defined in the wrapper element containing the various dc meta elements,
414 # like <dc><title></title><creator></creator></dc>.
415 # In such a case, we use this wrapper element as the top_level_prefix
416 if(!defined $top_level_prefix && defined $wrapper_metadata_xml && $wrapper_metadata_xml =~ m/dc$/) {
417 $top_level_prefix = $wrapper_metadata_xml;
418 }
419
420 if ($top_level_prefix !~ m/dc$/) {
[15872]421 print $outhandle "Warning: OAIPlugin currently only designed for Dublin Core (or variant) metadata\n";
[14940]422 print $outhandle " This recorded metadata section '$top_level_prefix' does not appear to match.\n";
423 print $outhandle " Metadata assumed to be in form: <prefix:tag>value</prefix:tag> and will be converted\n";
424 print $outhandle " into Greenstone metadata as prefix.tag = value\n";
425 }
426
[14949]427 while ($inner_metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>(.*)/s)
[4726]428 {
429 # if URL given for document as identifier metadata, store it ...
430 # $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
[9958]431
[4726]432 my $metaname = $1;
433 my $metavalue = $2;
[14949]434 $inner_metadata_text = $3;
435
436# print STDERR "*** metaname = $metaname\n";
437# print STDERR "*** metavalue = $metavalue\n";
438
[14940]439 # $metaname =~ s/^(dc:)?(.)/\u$2/; # strip of optional prefix and uppercase first letter
440 $metaname =~ s/:/\./;
441 if ($metaname !~ m/\./)
[4726]442 {
[14940]443 $metaname = "$top_level_prefix.$metaname";
[17066]444# print STDERR "*** metaname = $metaname\tmetavalue = $metavalue\n";
[4726]445 }
[14963]446 $metaname =~ s/\.(.)/\.\u$1/;
[4726]447
[14940]448 $metaname = $self->remap_dcterms_metadata($metaname);
449
[14963]450 $metavalue =~ s/\[/&#91;/g;
451 $metavalue =~ s/\]/&#93;/g;
452
453
[14940]454# if ($metaname eq "Identifier")
455# {
456# # name clashes with GSDL reserved metadata name for hash id
457# $metaname = "URL";
458# }
459
[4726]460 if (defined $metadata->{$metaname})
461 {
462 push(@{$metadata->{$metaname}},$metavalue);
[8121]463
[4726]464 }
465 else
466 {
467 $metadata->{$metaname} = [ $metavalue ];
468 }
469
[9958]470 $self->add_prettyprint_metadata_line($metaname, $metavalue);
471
[4726]472 }
473 }
[9958]474
475 $self->close_prettyprint_metadata_table();
[4726]476}
477
[13886]478## we know from the file extension, so doesn't need to check the doctype
479sub check_doctype {
480
481 return 1;
482}
483
[4726]4841;
Note: See TracBrowser for help on using the repository browser.