source: gsdl/trunk/perllib/plugins/OAIPlug.pm@ 14963

Last change on this file since 14963 was 14963, checked in by davidb, 16 years ago

Further massaging of text and metadata values to avoid special characters in Greenstone such as [ and ], and upper-casing first letter after metadata prefix

  • Property svn:keywords set to Author Date Id Revision
File size: 12.3 KB
Line 
1###########################################################################
2#
3# OAIPlug.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package OAIPlug;
28
29use BasPlug;
30use unicode;
31use util;
32
33use strict;
34no strict 'refs'; # allow filehandles to be variables and viceversa
35
36use XMLPlug;
37
38sub BEGIN {
39 @OAIPlug::ISA = ('XMLPlug');
40}
41
42
43my $arguments =
44 [ { 'name' => "process_exp",
45 'desc' => "{BasPlug.process_exp}",
46 'type' => "regexp",
47 'reqd' => "no",
48 'deft' => &get_default_process_exp() },
49 ];
50
51my $options = { 'name' => "OAIPlug",
52 'desc' => "{OAIPlug.desc}",
53 'abstract' => "no",
54 'inherits' => "yes",
55 'args' => $arguments };
56
57
58sub new {
59 my ($class) = shift (@_);
60 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
61 push(@$pluginlist, $class);
62
63 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
64 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
65
66 my $self = new XMLPlug($pluginlist, $inputargs, $hashArgOptLists);
67
68 return bless $self, $class;
69}
70
71sub get_default_process_exp {
72 my $self = shift (@_);
73
74 return q^(?i)(\.oai)$^;
75}
76
77sub get_doctype {
78 my $self = shift(@_);
79
80 return "OAI-PMH";
81}
82
83sub xml_start_document {
84 my $self = shift (@_);
85 $self->{'in_metadata_node'} = 0;
86 $self->{'rawxml'} = "";
87}
88
89sub xml_end_document {
90}
91
92sub xml_doctype {
93 my $self = shift(@_);
94
95 my ($expat, $name, $sysid, $pubid, $internal) = @_;
96
97 ##die "" if ($name !~ /^OAI-PMH$/);
98
99 my $outhandle = $self->{'outhandle'};
100 print $outhandle "OAIPlug: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
101 print STDERR "<Processing n='$self->{'file'}' p='OAIPlug'>\n" if $self->{'gli'};
102
103}
104
105
106sub xml_start_tag {
107 my $self = shift(@_);
108 my ($expat,$element) = @_;
109
110 my %attr_hash = %_;
111
112 my $attr = "";
113 map { $attr .= " $_=$attr_hash{$_}"; } keys %attr_hash;
114
115 $self->{'rawxml'} .= "<$element$attr>";
116
117 if ($element eq "metadata") {
118 $self->{'in_metadata_node'} = 1;
119 $self->{'metadata_xml'} = "";
120 }
121
122 if ($self->{'in_metadata_node'}) {
123 $self->{'metadata_xml'} .= "<$element$attr>";
124 }
125}
126
127sub xml_end_tag {
128 my $self = shift(@_);
129 my ($expat, $element) = @_;
130
131 $self->{'rawxml'} .= "</$element>";
132
133 if ($self->{'in_metadata_node'}) {
134 $self->{'metadata_xml'} .= "</$element>";
135 }
136
137 if ($element eq "metadata") {
138 my $textref = \$self->{'metadata_xml'};
139 my $metadata = $self->{'metadata'};
140 $self->extract_oai_metadata($textref,$metadata);
141
142 $self->{'in_metadata_node'} = 0;
143 }
144
145
146}
147
148sub xml_text {
149 my $self = shift(@_);
150 my ($expat) = @_;
151
152 $self->{'rawxml'} .= $_;
153
154 if ($self->{'in_metadata_node'}) {
155 $self->{'metadata_xml'} .= $_;
156 }
157}
158
159
160
161
162sub read {
163 my $self = shift (@_);
164
165 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
166
167 my $outhandle = $self->{'outhandle'};
168
169 my $filename = $file;
170 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
171
172 return 0 if ((-d $filename) && ($filename =~ m/srcdocs$/));
173
174 if ($self->SUPER::read(@_)) {
175
176 # Do encoding stuff
177 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
178
179 my $url_array = $metadata->{'URL'};
180 my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0;
181
182 my $srcdoc_exists = 0;
183 my $srcdoc_pos = 0;
184 my $filename_dir = &util::filename_head($filename);
185
186 for (my $i=0; $i<$num_urls; $i++) {
187
188 if ($url_array->[$i] !~ m/^(http|ftp):/) {
189
190 my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]);
191
192 if (-e $src_filename) {
193 $srcdoc_pos = $i;
194 $srcdoc_exists = 1;
195 }
196 }
197 }
198
199 if ($srcdoc_exists)
200 {
201 print $outhandle "OAIPlug: passing metadata on to $url_array->[0]\n"
202 if ($self->{'verbosity'}>1);
203
204
205 # Make pretty print metadata table stick with src filename
206 my $ppmd_table = $self->{'ppmd_table'};
207 $metadata->{'prettymd'} = [ $ppmd_table ];
208 $self->{'ppmd_table'} = undef;
209
210 return &plugin::read ($pluginfo, $filename_dir, $url_array->[0],
211 $metadata, $processor, $maxdocs, $total_count, $gli);
212 }
213 else
214 {
215 # create a new document
216 my $doc_obj = new doc ($filename, "indexed_doc");
217 my $top_section = $doc_obj->get_top_section;
218 my $plugin_type = $self->{'plugin_type'};
219
220 $doc_obj->add_utf8_metadata($top_section, "Language", $language);
221 $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
222 $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
223 $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
224 $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));
225
226 # include any metadata passed in from previous plugins
227 # note that this metadata is associated with the top level section
228 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
229
230 # do plugin specific processing of doc_obj
231 my $textref = \$self->{'rawxml'};
232 unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
233 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
234 return -1;
235 }
236
237 # do any automatic metadata extraction
238 $self->auto_extract_metadata ($doc_obj);
239
240 # add an OID
241 $doc_obj->set_OID();
242
243 my $ppmd_table = $self->{'ppmd_table'};
244 $doc_obj->add_utf8_metadata($top_section,"prettymd",$ppmd_table);
245 $self->{'ppmd_table'} = undef;
246
247 # process the document
248 $processor->process($doc_obj);
249
250 $self->{'num_processed'} ++;
251
252 return 1; # processed the file
253 }
254 }
255 else {
256 return undef;
257 }
258}
259
260
261# do plugin specific processing of doc_obj
262sub process {
263 my $self = shift (@_);
264 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
265 my $outhandle = $self->{'outhandle'};
266
267 print STDERR "<Processing n='$file' p='OAIPlug'>\n" if ($gli);
268 print $outhandle "OAIPlug: processing $file\n"
269 if $self->{'verbosity'} > 1;
270
271 my $cursection = $doc_obj->get_top_section();
272
273## $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection);
274
275 # add text to document object
276
277# $$textref =~ s/<(.*?)>/$1 /g;
278 $$textref =~ s/</&lt;/g;
279 $$textref =~ s/>/&gt;/g;
280 $$textref =~ s/\[/&#91;/g;
281 $$textref =~ s/\]/&#93;/g;
282
283## print STDERR "*** adding text: $$textref\n";
284
285 $doc_obj->add_utf8_text($cursection, $$textref);
286
287 return 1;
288}
289
290
291# Improvement is to merge this with newer version in MetadataPass
292
293sub open_prettyprint_metadata_table
294{
295 my $self = shift(@_);
296
297 my $att = "width=100% cellspacing=2";
298 my $style = "style=\'border-bottom: 4px solid #000080\'";
299
300 $self->{'ppmd_table'} = "\n<table $att $style>";
301}
302
303sub add_prettyprint_metadata_line
304{
305 my $self = shift(@_);
306 my ($metaname, $metavalue_utf8) = @_;
307
308### $metavalue_utf8 =~ s/hdl\.handle\.net/mcgonagall.cs.waikato.ac.nz:8080\/dspace\/handle/;
309 $metavalue_utf8 = &util::hyperlink_text($metavalue_utf8);
310
311 $self->{'ppmd_table'} .= " <tr bgcolor=#b5d3cd>\n";
312 $self->{'ppmd_table'} .= " <td width=30%>\n";
313 $self->{'ppmd_table'} .= " $metaname\n";
314 $self->{'ppmd_table'} .= " </td>\n";
315 $self->{'ppmd_table'} .= " <td>\n";
316 $self->{'ppmd_table'} .= " $metavalue_utf8\n";
317 $self->{'ppmd_table'} .= " </td>\n";
318 $self->{'ppmd_table'} .= " </tr>\n";
319
320}
321
322sub close_prettyprint_metadata_table
323{
324 my $self = shift(@_);
325
326 $self->{'ppmd_table'} .= "</table>\n";
327}
328
329
330sub remap_dcterms_metadata
331{
332 my $self = shift(@_);
333
334 my ($metaname) = @_;
335
336 my $dcterm_mapping = {
337 "alternative" => "dc.title",
338 "tableOfContents" => "dc.description",
339 "abstract" => "dc.description",
340 "created" => "dc.date",
341 "valid" => "dc.date",
342 "available" => "dc.date",
343 "issued" => "dc.date",
344 "modified" => "dc.date",
345 "dateAccepted" => "dc.date",
346 "dateCopyrighted" => "dc.date",
347 "dateSubmitted" => "dc.date",
348 "extent" => "dc.format",
349 "medium" => "dc.format",
350 "isVersionOf" => "dc.relation",
351 "hasVersion" => "dc.relation",
352 "isReplacedBy" => "dc.relation",
353 "replaces" => "dc.relation",
354 "isRequiredBy" => "dc.relation",
355 "requires" => "dc.relation",
356 "isPartOf" => "dc.relation",
357 "hasPart" => "dc.relation",
358 "isReferencedBy" => "dc.relation",
359 "references" => "dc.relation",
360 "isFormatOf" => "dc.relation",
361 "hasFormat" => "dc.relation",
362 "conformsTo" => "dc.relation",
363 "spatial" => "dc.coverage",
364 "temporal" => "dc.coverage",
365 "audience" => "dc.any",
366 "accrualMethod" => "dc.any",
367 "accrualPeriodicity" => "dc.any",
368 "accrualPolicy" => "dc.any",
369 "instructionalMethod" => "dc.any",
370 "provenance" => "dc.any",
371 "rightsHolder" => "dc.any",
372 "mediator" => "audience",
373 "educationLevel" => "audience",
374 "accessRights" => "dc.rights",
375 "license" => "dc.rights",
376 "bibliographicCitation" => "dc.identifier"
377 };
378
379 my ($prefix,$name) = ($metaname =~ m/^(.*?)\.(.*?)$/);
380
381 if ($prefix eq "dcterms")
382 {
383 if (defined $dcterm_mapping->{$name})
384 {
385 return $dcterm_mapping->{$name}."^".$name;
386 }
387
388 }
389 return $metaname; # didn't get a match, return param passed in unchanged
390}
391
392
393sub extract_oai_metadata {
394 my $self = shift (@_);
395 my ($textref, $metadata) = @_;
396 my $outhandle = $self->{'outhandle'};
397
398 # Only handles DC metadata
399
400 $self->open_prettyprint_metadata_table();
401
402 if ($$textref =~ m/<metadata\s*>(.*?)<\/metadata\s*>/s)
403 {
404 my $metadata_text = $1;
405
406 # locate and remove outermost tag (ignoring any attribute information in top-level tag)
407 my ($wrapper_metadata_xml,$inner_metadata_text) = ($metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/s);
408
409 # split tag into namespace and tag name
410 my($namespace,$top_level_prefix) = ($wrapper_metadata_xml =~ m/^(.*?):(.*?)$/);
411
412 if ($top_level_prefix !~ /dc$/) {
413 print $outhandle "Warning: OAIPlug currently only designed for Dublin Core (or variant) metadata\n";
414 print $outhandle " This recorded metadata section '$top_level_prefix' does not appear to match.\n";
415 print $outhandle " Metadata assumed to be in form: <prefix:tag>value</prefix:tag> and will be converted\n";
416 print $outhandle " into Greenstone metadata as prefix.tag = value\n";
417 }
418
419 while ($inner_metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>(.*)/s)
420 {
421 # if URL given for document as identifier metadata, store it ...
422 # $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
423
424 my $metaname = $1;
425 my $metavalue = $2;
426 $inner_metadata_text = $3;
427
428# print STDERR "*** metaname = $metaname\n";
429# print STDERR "*** metavalue = $metavalue\n";
430
431 # $metaname =~ s/^(dc:)?(.)/\u$2/; # strip of optional prefix and uppercase first letter
432 $metaname =~ s/:/\./;
433 if ($metaname !~ m/\./)
434 {
435 $metaname = "$top_level_prefix.$metaname";
436 }
437 $metaname =~ s/\.(.)/\.\u$1/;
438
439 $metaname = $self->remap_dcterms_metadata($metaname);
440
441 $metavalue =~ s/\[/&#91;/g;
442 $metavalue =~ s/\]/&#93;/g;
443
444
445# if ($metaname eq "Identifier")
446# {
447# # name clashes with GSDL reserved metadata name for hash id
448# $metaname = "URL";
449# }
450
451 if (defined $metadata->{$metaname})
452 {
453 push(@{$metadata->{$metaname}},$metavalue);
454
455 }
456 else
457 {
458 $metadata->{$metaname} = [ $metavalue ];
459 }
460
461 $self->add_prettyprint_metadata_line($metaname, $metavalue);
462
463 }
464 }
465
466 $self->close_prettyprint_metadata_table();
467}
468
469## we know from the file extension, so doesn't need to check the doctype
470sub check_doctype {
471
472 return 1;
473}
474
4751;
Note: See TracBrowser for help on using the repository browser.