source: gsdl/trunk/perllib/plugins/OAIPlugin.pm@ 16013

Last change on this file since 16013 was 16013, checked in by kjdon, 16 years ago

updated some plugin names in some of the keys for strings.properties

  • Property svn:keywords set to Author Date Id Revision
File size: 12.3 KB
Line 
1###########################################################################
2#
3# OAIPlugin.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package OAIPlugin;
28
29use unicode;
30use util;
31
32use strict;
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35use ReadXMLFile;
36
# Set up inheritance at compile time: OAIPlugin is a subclass of ReadXMLFile.
BEGIN {
    @OAIPlugin::ISA = qw(ReadXMLFile);
}
40
41
# Argument descriptions specific to this plugin.  new() appends these to the
# "ArgList" entry of the option hash it is given.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
];

# Description of the plugin itself.  new() appends this to the "OptList"
# entry of the option hash it is given.
my $options = {
    'name'     => "OAIPlugin",
    'desc'     => "{OAIPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
55
56
# Constructor.
#
# Arguments:
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to the ReadXMLFile constructor
#   $hashArgOptLists - hash ref; this plugin's argument and option
#                      descriptions are appended to its "ArgList" and
#                      "OptList" arrays
#
# Returns the object built by ReadXMLFile, re-blessed into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
69
# Returns the default regular expression (as a string) used to select files
# for this plugin: names ending in ".oai", matched case-insensitively.
# Takes no meaningful arguments; it is also called as a plain function when
# the argument table is built.
sub get_default_process_exp {
    return '(?i)(\.oai)$';
}
75
# Returns the document type name associated with this plugin: "OAI-PMH".
sub get_doctype {
    my ($self) = @_;

    return 'OAI-PMH';
}
81
# Start-of-document callback: reset the per-document parse state.
sub xml_start_document {
    my ($self) = @_;

    # Not inside a <metadata> element yet.
    $self->{'in_metadata_node'} = 0;

    # Accumulator for the reconstructed XML of the whole document.
    $self->{'rawxml'} = q{};
}
87
# End-of-document callback: deliberately does nothing.
sub xml_end_document {
    return;
}
90
# DOCTYPE callback: announce which file is being processed.
#
# Arguments after $self ($expat, $name, $sysid, $pubid, $internal) are
# accepted but not used; the doctype name is intentionally not checked.
#
# Writes a progress line to the plugin's output handle when verbosity is
# greater than 1, and a <Processing .../> marker to STDERR when the 'gli'
# flag on the object is set.
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    ##die "" if ($name !~ /^OAI-PMH$/);

    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OAIPlugin: processing $self->{'file'}\n";
    }

    if ($self->{'gli'}) {
        print STDERR "<Processing n='$self->{'file'}' p='OAIPlugin'>\n";
    }
}
103
104
# Start-tag callback: append the opening tag to the raw XML buffer and,
# while inside a <metadata> element, to the metadata buffer as well.
#
# The element's attributes are read from the global hash %_
# (NOTE(review): presumably filled in by the XML parser before this
# callback runs -- confirm against ReadXMLFile).  Attributes are written as
# name=value with no quoting, in whatever order the hash yields them.
sub xml_start_tag {
    my ($self, $expat, $element) = @_;

    my %attr_hash = %_;
    my $attr      = join('', map { " $_=$attr_hash{$_}" } keys %attr_hash);
    my $open_tag  = "<$element$attr>";

    $self->{'rawxml'} .= $open_tag;

    # Entering <metadata>: start collecting a fresh metadata fragment.
    if ($element eq "metadata") {
        $self->{'in_metadata_node'} = 1;
        $self->{'metadata_xml'}     = "";
    }

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $open_tag;
    }
}
125
# End-tag callback: append the closing tag to the raw XML buffer and, while
# inside a <metadata> element, to the metadata buffer.  When the closing
# tag is </metadata> itself, the collected fragment is handed to
# extract_oai_metadata() together with $self->{'metadata'}, and the
# "inside metadata" flag is cleared.
sub xml_end_tag {
    my ($self, $expat, $element) = @_;

    my $close_tag = "</$element>";

    $self->{'rawxml'} .= $close_tag;

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $close_tag;
    }

    if ($element eq "metadata") {
        $self->extract_oai_metadata(\$self->{'metadata_xml'}, $self->{'metadata'});

        $self->{'in_metadata_node'} = 0;
    }
}
146
# Text callback: the character data is taken from $_ (not from the
# argument list).  It is appended to the raw XML buffer and, while inside a
# <metadata> element, to the metadata buffer.
sub xml_text {
    my ($self, $expat) = @_;

    my $text = $_;

    $self->{'rawxml'} .= $text;

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $text;
    }
}
157
158
159
160
# Read one OAI record file.
#
# After the inherited read() has parsed the file, the 'URL' metadata values
# are inspected.  If one of the non-http/ftp values names a file that exists
# relative to the record's directory, the collected metadata is passed on to
# that source document via plugin::read.  Otherwise a new document is built
# from the record itself.
#
# Arguments (after $self):
#   $pluginfo, $base_dir, $file, $metadata, $processor,
#   $maxdocs, $total_count, $gli
#
# Returns:
#   0      if the path is a directory whose name ends in "srcdocs"
#   undef  if the inherited read() returns false
#   -1     if process() returns undef
#   1      if a new document was created and handed to $processor
#   otherwise, whatever plugin::read returns for the source document
sub read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my $filename = $file;
    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;

    return 0 if ((-d $filename) && ($filename =~ m/srcdocs$/));

    # $self was shifted off above, so @_ holds exactly the caller's arguments.
    if (!$self->SUPER::read(@_)) {
        return undef;
    }

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);

    my $url_array = $metadata->{'URL'};
    my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0;

    my $filename_dir = &util::filename_head($filename);

    # Find the first URL entry that is not an http/ftp address and that names
    # an existing file relative to the record's directory.
    my $srcdoc_pos;
    for my $i (0 .. $num_urls - 1) {
        next if ($url_array->[$i] =~ m/^(http|ftp):/);

        my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]);

        if (-e $src_filename) {
            $srcdoc_pos = $i;
            last;
        }
    }

    if (defined $srcdoc_pos) {
        # Use the entry that was actually found on disk.  Previously entry 0
        # was always used here, even when the existing file was a later one.
        my $srcdoc = $url_array->[$srcdoc_pos];

        print $outhandle "OAIPlugin: passing metadata on to $srcdoc\n"
            if ($self->{'verbosity'}>1);

        # Make pretty print metadata table stick with src filename
        my $ppmd_table = $self->{'ppmd_table'};
        $metadata->{'prettymd'} = [ $ppmd_table ];
        $self->{'ppmd_table'} = undef;

        return &plugin::read ($pluginfo, $filename_dir, $srcdoc,
                              $metadata, $processor, $maxdocs, $total_count, $gli);
    }

    # No local source document: create a new document from the record itself.
    my $doc_obj = new doc ($filename, "indexed_doc");
    my $top_section = $doc_obj->get_top_section;
    my $plugin_type = $self->{'plugin_type'};

    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
    $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
    $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    my $textref = \$self->{'rawxml'};
    unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # add an OID
    $doc_obj->set_OID();

    # Attach the pretty-printed metadata table, then clear it for the next file.
    my $ppmd_table = $self->{'ppmd_table'};
    $doc_obj->add_utf8_metadata($top_section,"prettymd",$ppmd_table);
    $self->{'ppmd_table'} = undef;

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'} ++;

    return 1; # processed the file
}
258
259
# Plugin-specific processing of a document object.
#
# Escapes the text referenced by $textref in place ('<', '>', '[' and ']'
# become '&lt;', '&gt;', '&#91;' and '&#93;') and adds the result as text to
# the top section of $doc_obj.
#
# Arguments (after $self):
#   $textref  - reference to the text to add; modified in place
#   $pluginfo, $base_dir, $metadata - accepted but not used here
#   $file     - file name, used only in progress messages
#   $doc_obj  - document object receiving the text
#   $gli      - when true, a <Processing .../> marker is written to STDERR
#
# Always returns 1.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    if ($gli) {
        print STDERR "<Processing n='$file' p='OAIPlugin'>\n";
    }
    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OAIPlugin: processing $file\n";
    }

    my $cursection = $doc_obj->get_top_section();

    # One pass is equivalent to four separate substitutions because none of
    # the replacement strings contains a character that is itself replaced.
    my %entity_for = (
        '<' => '&lt;',
        '>' => '&gt;',
        '[' => '&#91;',
        ']' => '&#93;',
    );
    ${$textref} =~ s/([<>\[\]])/$entity_for{$1}/g;

    # add text to document object
    $doc_obj->add_utf8_text($cursection, ${$textref});

    return 1;
}
288
289
# Improvement is to merge this with newer version in MetadataPass

# Start a new pretty-printed metadata table: $self->{'ppmd_table'} is
# overwritten with a newline followed by the opening <table> tag.
sub open_prettyprint_metadata_table {
    my ($self) = @_;

    my @table_attributes = (
        'width=100%',
        'cellspacing=2',
        q{style='border-bottom: 4px solid #000080'},
    );

    $self->{'ppmd_table'} = "\n<table " . join(' ', @table_attributes) . ">";
}
301
# Append one row to the pretty-printed metadata table held in
# $self->{'ppmd_table'}.
#
# Arguments (after $self):
#   $metaname       - text for the first cell
#   $metavalue_utf8 - text for the second cell; it is passed through
#                     util::hyperlink_text before being inserted
sub add_prettyprint_metadata_line {
    my ($self, $metaname, $metavalue_utf8) = @_;

    my $linked_value = &util::hyperlink_text($metavalue_utf8);

    $self->{'ppmd_table'} .= join('',
        " <tr bgcolor=#b5d3cd>\n",
        " <td width=30%>\n",
        " $metaname\n",
        " </td>\n",
        " <td>\n",
        " $linked_value\n",
        " </td>\n",
        " </tr>\n",
    );
}
320
# Finish the pretty-printed metadata table by appending the closing
# </table> tag and a newline to $self->{'ppmd_table'}.
sub close_prettyprint_metadata_table {
    my ($self) = @_;

    $self->{'ppmd_table'} .= "</table>\n";
}
327
328
# Map a qualified "dcterms.<name>" metadata name onto its base element.
#
# Argument (after $self):
#   $metaname - metadata name of the form "<prefix>.<name>"
#
# Returns "<mapped>^<name>" when the prefix (the text before the first dot)
# is exactly "dcterms" and <name> has an entry in the table below.  In every
# other case -- different prefix, unknown name, no dot at all, or an
# undefined value -- the argument is returned unchanged.
#
# NOTE(review): the lookup is case-sensitive and every key starts with a
# lower-case letter, whereas extract_oai_metadata() upper-cases the first
# letter after the dot before calling this routine.  Names arriving from
# there (e.g. "dcterms.Abstract") therefore never match -- confirm whether
# that is intended.
sub remap_dcterms_metadata
{
    my $self = shift(@_);

    my ($metaname) = @_;

    my $dcterm_mapping = {
        "alternative"           => "dc.title",
        "tableOfContents"       => "dc.description",
        "abstract"              => "dc.description",
        "created"               => "dc.date",
        "valid"                 => "dc.date",
        "available"             => "dc.date",
        "issued"                => "dc.date",
        "modified"              => "dc.date",
        "dateAccepted"          => "dc.date",
        "dateCopyrighted"       => "dc.date",
        "dateSubmitted"         => "dc.date",
        "extent"                => "dc.format",
        "medium"                => "dc.format",
        "isVersionOf"           => "dc.relation",
        "hasVersion"            => "dc.relation",
        "isReplacedBy"          => "dc.relation",
        "replaces"              => "dc.relation",
        "isRequiredBy"          => "dc.relation",
        "requires"              => "dc.relation",
        "isPartOf"              => "dc.relation",
        "hasPart"               => "dc.relation",
        "isReferencedBy"        => "dc.relation",
        "references"            => "dc.relation",
        "isFormatOf"            => "dc.relation",
        "hasFormat"             => "dc.relation",
        "conformsTo"            => "dc.relation",
        "spatial"               => "dc.coverage",
        "temporal"              => "dc.coverage",
        "audience"              => "dc.any",
        "accrualMethod"         => "dc.any",
        "accrualPeriodicity"    => "dc.any",
        "accrualPolicy"         => "dc.any",
        "instructionalMethod"   => "dc.any",
        "provenance"            => "dc.any",
        "rightsHolder"          => "dc.any",
        "mediator"              => "audience",
        "educationLevel"        => "audience",
        "accessRights"          => "dc.rights",
        "license"               => "dc.rights",
        "bibliographicCitation" => "dc.identifier"
    };

    # Nothing to split: hand the value straight back.
    return $metaname unless defined $metaname;

    my ($prefix,$name) = ($metaname =~ m/^(.*?)\.(.*?)$/);

    # $prefix is undefined when the name contains no dot; test for that
    # first so the string comparison never sees an undefined value.
    if (defined $prefix && $prefix eq "dcterms")
    {
        if (defined $dcterm_mapping->{$name})
        {
            return $dcterm_mapping->{$name}."^".$name;
        }
    }

    return $metaname; # didn't get a match, return param passed in unchanged
}
390
391
# Pull metadata values out of a <metadata>...</metadata> XML fragment.
#
# Arguments (after $self):
#   $textref  - reference to the text containing the <metadata> element
#   $metadata - hash ref; for every value found, the value is pushed onto
#               the array stored under its metadata name
#
# The first element inside <metadata> is treated as a wrapper.  Each
# element of the form <tag ...>value</tag> inside the wrapper becomes one
# metadata value:
#   * ':' in the tag name is replaced by '.';
#   * a tag without a prefix gets the wrapper's prefix (the part of the
#     wrapper name after its first ':', or the whole wrapper name when it
#     has no ':');
#   * the first character after the dot is upper-cased;
#   * the name is passed through remap_dcterms_metadata();
#   * '[' and ']' in the value are replaced by '&#91;' and '&#93;'.
# Every name/value pair is also added to the pretty-print table, which is
# opened at the start and closed at the end of this routine.
#
# Only Dublin Core style metadata is really catered for; a warning is
# written to the output handle when the wrapper's prefix does not end in
# "dc".
sub extract_oai_metadata {
    my $self = shift (@_);
    my ($textref, $metadata) = @_;
    my $outhandle = $self->{'outhandle'};

    # Only handles DC metadata

    $self->open_prettyprint_metadata_table();

    if ($$textref =~ m/<metadata\s*>(.*?)<\/metadata\s*>/s)
    {
        my $metadata_text = $1;

        # locate and remove outermost tag (ignoring any attribute information in top-level tag)
        my ($wrapper_metadata_xml,$inner_metadata_text) = ($metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/s);

        # No recognisable wrapper element: carry on with empty strings so
        # the code below never operates on undefined values.
        $wrapper_metadata_xml = "" unless defined $wrapper_metadata_xml;
        $inner_metadata_text  = "" unless defined $inner_metadata_text;

        # split tag into namespace and tag name
        my ($namespace,$top_level_prefix) = ($wrapper_metadata_xml =~ m/^(.*?):(.*?)$/);

        # A wrapper without a namespace (e.g. <dc>) has no ':' to split on;
        # use the whole tag name as the prefix instead of leaving it
        # undefined, which used to yield names such as ".Title".
        $top_level_prefix = $wrapper_metadata_xml unless defined $top_level_prefix;

        if ($top_level_prefix !~ /dc$/) {
            print $outhandle "Warning: OAIPlugin currently only designed for Dublin Core (or variant) metadata\n";
            print $outhandle " This recorded metadata section '$top_level_prefix' does not appear to match.\n";
            print $outhandle " Metadata assumed to be in form: <prefix:tag>value</prefix:tag> and will be converted\n";
            print $outhandle " into Greenstone metadata as prefix.tag = value\n";
        }

        # Walk the elements with a global match, which resumes after the
        # previous match instead of copying the remaining text each time.
        while ($inner_metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/sg)
        {
            # Copy the captures straight away: the substitutions below
            # overwrite $1 and $2.
            my ($metaname, $metavalue) = ($1, $2);

            $metaname =~ s/:/\./;
            if ($metaname !~ m/\./)
            {
                $metaname = "$top_level_prefix.$metaname";
            }
            $metaname =~ s/\.(.)/\.\u$1/;

            $metaname = $self->remap_dcterms_metadata($metaname);

            $metavalue =~ s/\[/&#91;/g;
            $metavalue =~ s/\]/&#93;/g;

            # Creates the array on first use, appends afterwards.
            push(@{$metadata->{$metaname}}, $metavalue);

            $self->add_prettyprint_metadata_line($metaname, $metavalue);
        }
    }

    $self->close_prettyprint_metadata_table();
}
467
## The file extension already identifies the format, so the doctype is
## not inspected: every document is accepted.
sub check_doctype {
    return 1;
}
473
4741;
Note: See TracBrowser for help on using the repository browser.