source: gsdl/trunk/perllib/plugins/OAIPlugin.pm@ 17197

Last change on this file since 17197 was 17197, checked in by kjdon, 12 years ago

previous metadata changes meant that there was no longer URL metadata (used to find the source doc). Now it uses dc.Identifier

  • Property svn:keywords set to Author Date Id Revision
File size: 13.1 KB
Line 
1###########################################################################
2#
3# OAIPlug.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package OAIPlugin;
28
29use unicode;
30use util;
31
32use strict;
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35use ReadXMLFile;
36use ReadTextFile; # needed for subroutine textcat_get_language_encoding
37
38sub BEGIN {
39 @OAIPlugin::ISA = ('ReadXMLFile', 'ReadTextFile');
40}
41
42
43my $arguments =
44 [ { 'name' => "process_exp",
45 'desc' => "{BasePlugin.process_exp}",
46 'type' => "regexp",
47 'reqd' => "no",
48 'deft' => &get_default_process_exp() }
49 ];
50
51my $options = { 'name' => "OAIPlugin",
52 'desc' => "{OAIPlugin.desc}",
53 'abstract' => "no",
54 'inherits' => "yes",
55 'explodes' => "yes",
56 'args' => $arguments };
57
58
59sub new {
60 my ($class) = shift (@_);
61 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
62 push(@$pluginlist, $class);
63
64 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
65 push(@{$hashArgOptLists->{"OptList"}},$options);
66
67 new ReadTextFile($pluginlist, $inputargs, $hashArgOptLists,1);
68 my $self = new ReadXMLFile($pluginlist, $inputargs, $hashArgOptLists);
69
70 return bless $self, $class;
71}
72
73sub get_default_process_exp {
74 my $self = shift (@_);
75
76 return q^(?i)(\.oai)$^;
77}
78
79sub get_doctype {
80 my $self = shift(@_);
81
82 return "OAI-PMH";
83}
84
85sub xml_start_document {
86 my $self = shift (@_);
87 $self->{'in_metadata_node'} = 0;
88 $self->{'rawxml'} = "";
89}
90
91sub xml_end_document {
92}
93
94sub xml_doctype {
95 my $self = shift(@_);
96
97 my ($expat, $name, $sysid, $pubid, $internal) = @_;
98
99 ##die "" if ($name !~ /^OAI-PMH$/);
100
101 my $outhandle = $self->{'outhandle'};
102 print $outhandle "OAIPlugin: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
103 print STDERR "<Processing n='$self->{'file'}' p='OAIPlugin'>\n" if $self->{'gli'};
104
105}
106
107
108sub xml_start_tag {
109 my $self = shift(@_);
110 my ($expat,$element) = @_;
111
112 my %attr_hash = %_;
113
114 my $attr = "";
115 map { $attr .= " $_=$attr_hash{$_}"; } keys %attr_hash;
116
117 $self->{'rawxml'} .= "<$element$attr>";
118
119 if ($element eq "metadata") {
120 $self->{'in_metadata_node'} = 1;
121 $self->{'metadata_xml'} = "";
122 }
123
124 if ($self->{'in_metadata_node'}) {
125 $self->{'metadata_xml'} .= "<$element$attr>";
126 }
127}
128
129sub xml_end_tag {
130 my $self = shift(@_);
131 my ($expat, $element) = @_;
132
133 $self->{'rawxml'} .= "</$element>";
134
135 if ($self->{'in_metadata_node'}) {
136 $self->{'metadata_xml'} .= "</$element>";
137 }
138
139 if ($element eq "metadata") {
140 my $textref = \$self->{'metadata_xml'};
141 my $metadata = $self->{'metadata'};
142 $self->extract_oai_metadata($textref,$metadata);
143
144 $self->{'in_metadata_node'} = 0;
145 }
146
147
148}
149
150sub xml_text {
151 my $self = shift(@_);
152 my ($expat) = @_;
153
154 $self->{'rawxml'} .= $_;
155
156 if ($self->{'in_metadata_node'}) {
157 $self->{'metadata_xml'} .= $_;
158 }
159}
160
161
162
163
164sub read {
165 my $self = shift (@_);
166
167 my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
168
169 my $outhandle = $self->{'outhandle'};
170
171 my $filename = $file;
172 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
173
174 # block the srcdocs dir - we will process files in them when we find an OAI record for them
175 return 0 if ((-d $filename) && ($filename =~ m/srcdocs$/));
176 if ($self->SUPER::read(@_)) {
177 # Do encoding stuff
178 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
179
180 my $url_array = $metadata->{'dc.Identifier'};
181 my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0;
182
183 my $srcdoc_exists = 0;
184 my $srcdoc_pos = 0;
185 my $filename_dir = &util::filename_head($filename);
186
187 for (my $i=0; $i<$num_urls; $i++) {
188 if ($url_array->[$i] !~ m/^(http|ftp):/) {
189
190 my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]);
191 if (-e $src_filename) {
192 $srcdoc_pos = $i;
193 $srcdoc_exists = 1;
194 }
195 }
196 }
197
198 if ($srcdoc_exists)
199 {
200 print $outhandle "OAIPlugin: passing metadata on to $url_array->[0]\n"
201 if ($self->{'verbosity'}>1);
202
203
204 # Make pretty print metadata table stick with src filename
205 my $ppmd_table = $self->{'ppmd_table'};
206 $metadata->{'prettymd'} = [ $ppmd_table ];
207 $self->{'ppmd_table'} = undef;
208
209 return &plugin::read ($pluginfo, $filename_dir, $url_array->[0],
210 $block_hash, $metadata, $processor, $maxdocs,
211 $total_count, $gli);
212 }
213 else
214 {
215 # create a new document
216 my $doc_obj = new doc ($filename, "indexed_doc");
217 my $top_section = $doc_obj->get_top_section;
218 my $plugin_type = $self->{'plugin_type'};
219
220 $doc_obj->add_utf8_metadata($top_section, "Language", $language);
221 $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
222 $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
223 $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
224 $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));
225
226 # include any metadata passed in from previous plugins
227 # note that this metadata is associated with the top level section
228 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
229
230 # do plugin specific processing of doc_obj
231 my $textref = \$self->{'rawxml'};
232 unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
233 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
234 return -1;
235 }
236
237 # do any automatic metadata extraction
238 $self->auto_extract_metadata ($doc_obj);
239
240 # add an OID
241 $self->add_OID($doc_obj);
242
243 my $ppmd_table = $self->{'ppmd_table'};
244 $doc_obj->add_utf8_metadata($top_section,"prettymd",$ppmd_table);
245 $self->{'ppmd_table'} = undef;
246
247 # process the document
248 $processor->process($doc_obj);
249
250 $self->{'num_processed'} ++;
251
252 return 1; # processed the file
253 }
254 }
255 else {
256 return undef;
257 }
258}
259
260
261# do plugin specific processing of doc_obj
262sub process {
263 my $self = shift (@_);
264 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
265 my $outhandle = $self->{'outhandle'};
266
267 print STDERR "<Processing n='$file' p='OAIPlugin'>\n" if ($gli);
268 print $outhandle "OAIPlugin: processing $file\n"
269 if $self->{'verbosity'} > 1;
270
271 my $cursection = $doc_obj->get_top_section();
272
273## $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection);
274
275 # add text to document object
276
277# $$textref =~ s/<(.*?)>/$1 /g;
278 $$textref =~ s/</&lt;/g;
279 $$textref =~ s/>/&gt;/g;
280 $$textref =~ s/\[/&#91;/g;
281 $$textref =~ s/\]/&#93;/g;
282
283## print STDERR "*** adding text: $$textref\n";
284
285 $doc_obj->add_utf8_text($cursection, $$textref);
286
287 return 1;
288}
289
290
291# Improvement is to merge this with newer version in MetadataPass
292
293sub open_prettyprint_metadata_table
294{
295 my $self = shift(@_);
296
297 my $att = "width=100% cellspacing=2";
298 my $style = "style=\'border-bottom: 4px solid #000080\'";
299
300 $self->{'ppmd_table'} = "\n<table $att $style>";
301}
302
303sub add_prettyprint_metadata_line
304{
305 my $self = shift(@_);
306 my ($metaname, $metavalue_utf8) = @_;
307
308### $metavalue_utf8 =~ s/hdl\.handle\.net/mcgonagall.cs.waikato.ac.nz:8080\/dspace\/handle/;
309 $metavalue_utf8 = &util::hyperlink_text($metavalue_utf8);
310
311 $self->{'ppmd_table'} .= " <tr bgcolor=#b5d3cd>\n";
312 $self->{'ppmd_table'} .= " <td width=30%>\n";
313 $self->{'ppmd_table'} .= " $metaname\n";
314 $self->{'ppmd_table'} .= " </td>\n";
315 $self->{'ppmd_table'} .= " <td>\n";
316 $self->{'ppmd_table'} .= " $metavalue_utf8\n";
317 $self->{'ppmd_table'} .= " </td>\n";
318 $self->{'ppmd_table'} .= " </tr>\n";
319
320}
321
322sub close_prettyprint_metadata_table
323{
324 my $self = shift(@_);
325
326 $self->{'ppmd_table'} .= "</table>\n";
327}
328
329
330sub remap_dcterms_metadata
331{
332 my $self = shift(@_);
333
334 my ($metaname) = @_;
335
336 my $dcterm_mapping = {
337 "alternative" => "dc.title",
338 "tableOfContents" => "dc.description",
339 "abstract" => "dc.description",
340 "created" => "dc.date",
341 "valid" => "dc.date",
342 "available" => "dc.date",
343 "issued" => "dc.date",
344 "modified" => "dc.date",
345 "dateAccepted" => "dc.date",
346 "dateCopyrighted" => "dc.date",
347 "dateSubmitted" => "dc.date",
348 "extent" => "dc.format",
349 "medium" => "dc.format",
350 "isVersionOf" => "dc.relation",
351 "hasVersion" => "dc.relation",
352 "isReplacedBy" => "dc.relation",
353 "replaces" => "dc.relation",
354 "isRequiredBy" => "dc.relation",
355 "requires" => "dc.relation",
356 "isPartOf" => "dc.relation",
357 "hasPart" => "dc.relation",
358 "isReferencedBy" => "dc.relation",
359 "references" => "dc.relation",
360 "isFormatOf" => "dc.relation",
361 "hasFormat" => "dc.relation",
362 "conformsTo" => "dc.relation",
363 "spatial" => "dc.coverage",
364 "temporal" => "dc.coverage",
365 "audience" => "dc.any",
366 "accrualMethod" => "dc.any",
367 "accrualPeriodicity" => "dc.any",
368 "accrualPolicy" => "dc.any",
369 "instructionalMethod" => "dc.any",
370 "provenance" => "dc.any",
371 "rightsHolder" => "dc.any",
372 "mediator" => "audience",
373 "educationLevel" => "audience",
374 "accessRights" => "dc.rights",
375 "license" => "dc.rights",
376 "bibliographicCitation" => "dc.identifier"
377 };
378
379 my ($prefix,$name) = ($metaname =~ m/^(.*?)\.(.*?)$/);
380
381 if ($prefix eq "dcterms")
382 {
383 if (defined $dcterm_mapping->{$name})
384 {
385 return $dcterm_mapping->{$name}."^".$name;
386 }
387
388 }
389 return $metaname; # didn't get a match, return param passed in unchanged
390}
391
392
393sub extract_oai_metadata {
394 my $self = shift (@_);
395 my ($textref, $metadata) = @_;
396 my $outhandle = $self->{'outhandle'};
397
398 # Only handles DC metadata
399
400 $self->open_prettyprint_metadata_table();
401
402 if ($$textref =~ m/<metadata\s*>(.*?)<\/metadata\s*>/s)
403 {
404 my $metadata_text = $1;
405
406 # locate and remove outermost tag (ignoring any attribute information in top-level tag)
407 my ($wrapper_metadata_xml,$inner_metadata_text) = ($metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/s);
408
409 # split tag into namespace and tag name
410 my($namespace,$top_level_prefix) = ($wrapper_metadata_xml =~ m/^(.*?):(.*?)$/);
411
412 # sometimes, the dc namespace is not specified as the prefix in each element (like <dc:title>)
413 # but is rather defined in the wrapper element containing the various dc meta elements,
414 # like <dc><title></title><creator></creator></dc>.
415 # In such a case, we use this wrapper element as the top_level_prefix
416 if(!defined $top_level_prefix && defined $wrapper_metadata_xml && $wrapper_metadata_xml =~ m/dc$/) {
417 $top_level_prefix = $wrapper_metadata_xml;
418 }
419
420 if ($top_level_prefix !~ m/dc$/) {
421 print $outhandle "Warning: OAIPlugin currently only designed for Dublin Core (or variant) metadata\n";
422 print $outhandle " This recorded metadata section '$top_level_prefix' does not appear to match.\n";
423 print $outhandle " Metadata assumed to be in form: <prefix:tag>value</prefix:tag> and will be converted\n";
424 print $outhandle " into Greenstone metadata as prefix.tag = value\n";
425 }
426
427 while ($inner_metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>(.*)/s)
428 {
429 # if URL given for document as identifier metadata, store it ...
430 # $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
431
432 my $metaname = $1;
433 my $metavalue = $2;
434 $inner_metadata_text = $3;
435
436# print STDERR "*** metaname = $metaname\n";
437# print STDERR "*** metavalue = $metavalue\n";
438
439 # $metaname =~ s/^(dc:)?(.)/\u$2/; # strip of optional prefix and uppercase first letter
440 $metaname =~ s/:/\./;
441 if ($metaname !~ m/\./)
442 {
443 $metaname = "$top_level_prefix.$metaname";
444# print STDERR "*** metaname = $metaname\tmetavalue = $metavalue\n";
445 }
446 $metaname =~ s/\.(.)/\.\u$1/;
447
448 $metaname = $self->remap_dcterms_metadata($metaname);
449
450 $metavalue =~ s/\[/&#91;/g;
451 $metavalue =~ s/\]/&#93;/g;
452
453
454# if ($metaname eq "Identifier")
455# {
456# # name clashes with GSDL reserved metadata name for hash id
457# $metaname = "URL";
458# }
459
460 if (defined $metadata->{$metaname})
461 {
462 push(@{$metadata->{$metaname}},$metavalue);
463
464 }
465 else
466 {
467 $metadata->{$metaname} = [ $metavalue ];
468 }
469
470 $self->add_prettyprint_metadata_line($metaname, $metavalue);
471
472 }
473 }
474
475 $self->close_prettyprint_metadata_table();
476}
477
478## we know from the file extension, so doesn't need to check the doctype
479sub check_doctype {
480
481 return 1;
482}
483
4841;
Note: See TracBrowser for help on using the repository browser.