source: gsdl/trunk/perllib/plugins/OAIPlugin.pm@ 17216

Last change on this file since 17216 was 17216, checked in by kjdon, 12 years ago

trying to get OAI files exploding. Have copied in some code from one of David's obsolete files. I think it works but haven't tested fully yet. Wanted to get the code committed though.

  • Property svn:keywords set to Author Date Id Revision
File size: 17.2 KB
Line 
1###########################################################################
2#
3# OAIPlugin.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package OAIPlugin;
28
29use unicode;
30use util;
31
32use strict;
33no strict 'refs'; # allow filehandles to be variables and viceversa
34
35use ReadXMLFile;
36use ReadTextFile; # needed for subroutine textcat_get_language_encoding
37use metadatautil;
38
# Set up the inheritance chain at compile time.  ReadXMLFile is listed
# first; ReadTextFile is included for textcat_get_language_encoding.
BEGIN {
    @OAIPlugin::ISA = qw(ReadXMLFile ReadTextFile);
}
42
43
# Argument descriptions contributed by this plugin.  new() appends them to
# the caller's "ArgList".
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
];

# Plugin description record.  new() appends it to the caller's "OptList".
my $options = {
    'name'     => 'OAIPlugin',
    'desc'     => '{OAIPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
58
59
# Constructor.
#
# Parameters (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to both parent constructors
#   $hashArgOptLists - hash ref; this plugin's arguments are appended to its
#                      "ArgList" entry and its options to its "OptList" entry
#
# Returns the object built by ReadXMLFile, re-blessed into $class.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{'ArgList'} }, @{$arguments};
    push @{ $hashArgOptLists->{'OptList'} }, $options;

    # The ReadTextFile object itself is discarded; only the side effects of
    # constructing it are kept.
    # NOTE(review): the trailing 1 presumably marks this as an auxiliary
    # construction -- confirm against ReadTextFile::new.
    ReadTextFile->new($pluginlist, $inputargs, $hashArgOptLists, 1);

    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
73
# Default value for the process_exp argument: a case-insensitive match on
# file names ending in ".oai".  Also called as a plain function (with no
# arguments) while the argument table is being built, so nothing is read
# from @_.
sub get_default_process_exp {
    return '(?i)(\.oai)$';
}
79
# Name of the document type this plugin handles.
sub get_doctype {
    return 'OAI-PMH';
}
85
# Reset the per-document parse state before a new file is parsed.
sub xml_start_document {
    my ($self) = @_;

    # Becomes true while the parser is inside a <metadata> element.
    $self->{'in_metadata_node'} = 0;

    # Accumulates the markup and text seen by the tag/text callbacks.
    $self->{'rawxml'} = q{};
}
91
# End-of-document callback: nothing to do for OAI records.
sub xml_end_document {
    return;
}
94
# DOCTYPE callback.  The doctype name is deliberately not validated (the
# check is disabled in the original code); this only reports progress.
#
# Parameters (after $self): $expat, $name, $sysid, $pubid, $internal --
# none of them are used.
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    ##die "" if ($name !~ /^OAI-PMH$/);

    if ($self->{'verbosity'} > 1) {
        print { $self->{'outhandle'} } "OAIPlugin: processing $self->{'file'}\n";
    }

    # Progress line for the GLI, written to STDERR.
    print STDERR "<Processing n='$self->{'file'}' p='OAIPlugin'>\n" if $self->{'gli'};
}
107
108
# Start-tag callback: re-serialises the opening tag into the raw XML
# buffer and, while inside <metadata>, into the metadata buffer as well.
#
# Parameters (after $self): $expat (unused), $element (tag name).
# The element's attributes are read from the global hash %_.
# NOTE(review): %_ is presumably filled in by XML::Parser's Stream style
# before this callback runs -- confirm in ReadXMLFile.
sub xml_start_tag {
    my ($self, $expat, $element) = @_;

    my %attr_hash = %_;

    # Attributes are written as " name=value" (no quoting), in hash order.
    my $attr     = join q{}, map { " $_=$attr_hash{$_}" } keys %attr_hash;
    my $open_tag = "<$element$attr>";

    $self->{'rawxml'} .= $open_tag;

    # A <metadata> element starts a fresh metadata buffer.
    if ($element eq 'metadata') {
        $self->{'in_metadata_node'} = 1;
        $self->{'metadata_xml'}     = q{};
    }

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $open_tag;
    }
}
129
# End-tag callback: appends the closing tag to the raw XML buffer (and to
# the metadata buffer while inside <metadata>).  When </metadata> itself is
# reached, the collected metadata XML is handed to extract_oai_metadata
# together with $self->{'metadata'}, and the "inside metadata" flag is
# cleared.
#
# Parameters (after $self): $expat (unused), $element (tag name).
sub xml_end_tag {
    my ($self, $expat, $element) = @_;

    my $close_tag = "</$element>";

    $self->{'rawxml'} .= $close_tag;
    $self->{'metadata_xml'} .= $close_tag if $self->{'in_metadata_node'};

    if ($element eq 'metadata') {
        my $textref  = \$self->{'metadata_xml'};
        my $metadata = $self->{'metadata'};
        $self->extract_oai_metadata($textref, $metadata);

        $self->{'in_metadata_node'} = 0;
    }
}
150
# Text callback: the character data is read from the global $_ and
# appended to the raw XML buffer, and to the metadata buffer while inside
# a <metadata> element.
#
# Parameters (after $self): $expat (unused).
sub xml_text {
    my ($self, $expat) = @_;

    my $text = $_;

    $self->{'rawxml'} .= $text;

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $text;
    }
}
161
162
# Metadata pass over an OAI file.
#
# Parses the file via the parent class's read(), then checks whether any
# dc.Identifier value names a file that exists alongside the OAI file.
# The outcome is recorded in $self->{'oai-files'}->{$file} for the later
# read() call.
#
# Parameters (after $self): $pluginfo, $base_dir, $file, $block_hash,
# $metadata, $extrametakeys, $extrametadata, $processor, $maxdocs, $gli.
#
# Returns undef when the file is not one this plugin can process or when
# the parent read() fails.
# NOTE(review): the success path has no explicit return, so the result is
# whatever the final statement below evaluates to.  The statement order at
# the end of this sub has been kept as it was for that reason -- confirm
# what callers expect before changing it.
sub metadata_read {
    my $self = shift;
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata,
        $extrametakeys, $extrametadata, $processor, $maxdocs, $gli) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path)
        = util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file($filename_full_path);

    # calling "SUPER::read" at this point sets up the $metadata
    # data-structure.  Later, OAIPlugin::read decides whether this
    # $metadata sticks to an accompanying file, or whether a new doc
    # object containing purely metadata has to be formed.
    my $total_count = 0; # is total count used?
    my $parsed_ok = $self->SUPER::read($pluginfo, $base_dir, $file, $block_hash,
                                       $metadata, $processor, $maxdocs,
                                       $total_count, $gli);
    return undef unless $parsed_ok;

    $self->{'metadata'} = undef;

    #my $url_array = $metadata->{'gi.Sourcedoc'};
    my $url_array    = $metadata->{'dc.Identifier'};
    my @identifiers  = (defined $url_array) ? @{$url_array} : ();
    my $filename_dir = util::filename_head($filename_full_path);

    # Look for the first identifier that is not a remote URL and that names
    # an existing file relative to the OAI file's directory.
    my $srcdoc_exists = 0;
    foreach my $identifier (@identifiers) {
        next if $identifier =~ m/^(https?|ftp):/;

        my $src_filename = util::filename_cat($filename_dir, $identifier);
        if (-e $src_filename) {
            $srcdoc_exists = 1;
            last;
        }
    }

    if ($srcdoc_exists) {
        $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 1;

        # Make pretty print metadata table stick with src filename
        $metadata->{'prettymd'} = [ $self->{'ppmd_table'} ];
        $self->{'ppmd_table'} = undef;
    }
    else {
        $self->{'oai-files'}->{$file}->{'srcdoc_exists'} = 0;
    }
}
227
228
# Build a metadata-only document for an OAI record that metadata_read has
# already registered in $self->{'oai-files'}.
#
# Parameters (after $self): $pluginfo, $base_dir, $file, $block_hash,
# $metadata, $processor, $maxdocs, $total_count, $gli.
#
# Returns:
#   undef - $file has no entry in $self->{'oai-files'}
#   0     - metadata_read found a source document for this record, so no
#           document is built here
#   -1    - process() failed
#   1     - a new document was built and passed to $processor
sub read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor,
        $maxdocs, $total_count, $gli) = @_;

    my $oai_file = $self->{'oai-files'}->{$file};
    return undef unless defined $oai_file;

    my $srcdoc_exists = $oai_file->{'srcdoc_exists'};

    # no more need to access details of this $file => tidy up as you go
    delete $self->{'oai-files'}->{$file};

    # This path used to fall off the end of the sub, so the caller received
    # the value of the last evaluated condition (an empty string).  Return
    # an explicit defined-but-false value instead.
    return 0 if $srcdoc_exists;

    my $filename = $file;
    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;

    # Do encoding stuff on metadata
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);

    # create a new document
    my $doc_obj     = doc->new($filename, "indexed_doc");
    my $top_section = $doc_obj->get_top_section;
    my $plugin_type = $self->{'plugin_type'};

    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
    $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
    $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    my $textref = \$self->{'rawxml'};
    unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # add an OID
    $self->add_OID($doc_obj);

    # NOTE(review): nothing in this file assigns $self->{'prettymd'}
    # (extract_oai_metadata builds $self->{'ppmd_table'}), so unless a
    # parent class sets it this loop adds nothing -- confirm.
    my $prettymds = $self->{'prettymd'};
    foreach my $prettymd (@$prettymds) {
        $doc_obj->add_utf8_metadata($top_section, "prettymd", $prettymd);
    }
    $self->{'prettymd'} = undef;

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'} ++;

    return 1; # processed the file
}
300
301
# Earlier single-pass implementation of read(), kept for reference.
#
# Parses the OAI file, then either passes the extracted metadata on to the
# source document named by a dc.Identifier value (when that file exists
# next to the OAI file), or builds a metadata-only document.
#
# Parameters (after $self): $pluginfo, $base_dir, $file, $block_hash,
# $metadata, $processor, $maxdocs, $total_count, $gli.
#
# Returns:
#   0     - $filename is a directory whose name ends in "srcdocs" (blocked)
#   undef - the parent read() did not succeed
#   -1    - process() failed
#   1     - a metadata-only document was built and passed to $processor
#   otherwise the result of plugin::read on the source document
sub read_old {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor,
        $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my $filename = $file;
    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;

    # block the srcdocs dir - we will process files in them when we find an OAI record for them
    return 0 if ((-d $filename) && ($filename =~ m/srcdocs$/));

    # @_ no longer contains $self at this point
    return undef unless $self->SUPER::read(@_);

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);

    my $url_array = $metadata->{'dc.Identifier'};
    my $num_urls  = (defined $url_array) ? scalar(@$url_array) : 0;

    my $srcdoc_exists = 0;
    my $srcdoc_pos    = 0;
    my $filename_dir  = &util::filename_head($filename);

    # Find the first identifier that is not a remote URL and that names an
    # existing file next to the OAI file (same rule as metadata_read).
    for (my $i = 0; $i < $num_urls; $i++) {
        next if $url_array->[$i] =~ m/^(https?|ftp):/;

        my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]);
        if (-e $src_filename) {
            $srcdoc_pos    = $i;
            $srcdoc_exists = 1;
            last;
        }
    }

    if ($srcdoc_exists) {
        # Use the identifier that was actually found on disk.  This used to
        # be hard-wired to element 0, which is wrong whenever the first
        # identifier is a URL or a missing file.
        my $src_file = $url_array->[$srcdoc_pos];

        print $outhandle "OAIPlugin: passing metadata on to $src_file\n"
            if ($self->{'verbosity'}>1);

        # Make pretty print metadata table stick with src filename
        my $ppmd_table = $self->{'ppmd_table'};
        $metadata->{'prettymd'} = [ $ppmd_table ];
        $self->{'ppmd_table'} = undef;

        return &plugin::read ($pluginfo, $filename_dir, $src_file,
                              $block_hash, $metadata, $processor, $maxdocs,
                              $total_count, $gli);
    }

    # No source document: create a new, metadata-only document
    my $doc_obj     = doc->new($filename, "indexed_doc");
    my $top_section = $doc_obj->get_top_section;
    my $plugin_type = $self->{'plugin_type'};

    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
    $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
    $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    my $textref = \$self->{'rawxml'};
    unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # add an OID
    $self->add_OID($doc_obj);

    my $ppmd_table = $self->{'ppmd_table'};
    $doc_obj->add_utf8_metadata($top_section, "prettymd", $ppmd_table);
    $self->{'ppmd_table'} = undef;

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'} ++;

    return 1; # processed the file
}
397
398
# Plugin-specific processing of doc_obj: the text behind $textref is
# escaped in place and added to the document's top section.
#
# Parameters (after $self): $textref (scalar ref, modified in place),
# $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli.
#
# Always returns 1.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    print STDERR "<Processing n='$file' p='OAIPlugin'>\n" if ($gli);

    if ($self->{'verbosity'} > 1) {
        print { $self->{'outhandle'} } "OAIPlugin: processing $file\n";
    }

    my $cursection = $doc_obj->get_top_section();

    # Escape the markup so the XML is stored as literal text.
    # NOTE(review): square brackets are escaped too, presumably because
    # they are significant elsewhere in Greenstone -- confirm.
    for (${$textref}) {
        s/</&lt;/g;
        s/>/&gt;/g;
        s/\[/&#91;/g;
        s/\]/&#93;/g;
    }

    # add text to document object
    $doc_obj->add_utf8_text($cursection, ${$textref});

    return 1;
}
427
428
429# Improvement is to merge this with newer version in MetadataPass
430
# Start a new pretty-print metadata table in $self->{'ppmd_table'},
# replacing whatever was there before.
sub open_prettyprint_metadata_table {
    my ($self) = @_;

    my @table_attributes = (
        'width=100% cellspacing=2',
        q{style='border-bottom: 4px solid #000080'},
    );

    $self->{'ppmd_table'} = "\n<table " . join(' ', @table_attributes) . '>';
}
440
# Append one row (metadata name, metadata value) to the pretty-print
# table.  The value is passed through util::hyperlink_text first.
#
# Parameters (after $self): $metaname, $metavalue_utf8.
sub add_prettyprint_metadata_line {
    my ($self, $metaname, $metavalue_utf8) = @_;

    my $linked_value = &util::hyperlink_text($metavalue_utf8);

    $self->{'ppmd_table'} .= join q{},
        " <tr bgcolor=#b5d3cd>\n",
        " <td width=30%>\n",
        " $metaname\n",
        " </td>\n",
        " <td>\n",
        " $linked_value\n",
        " </td>\n",
        " </tr>\n";
}
459
# Terminate the pretty-print metadata table held in $self->{'ppmd_table'}.
sub close_prettyprint_metadata_table {
    my ($self) = @_;

    $self->{'ppmd_table'} .= "</table>\n";
}
466
467
# Map a "dcterms.<refinement>" metadata name onto "<target>^<refinement>",
# where <target> comes from the table below.  Any other name, and any
# dcterms refinement not in the table, is returned unchanged.
#
# Parameters (after $self): $metaname, expected in "prefix.name" form.
#
# NOTE(review): the refinement names in the table start with a lower-case
# letter, while extract_oai_metadata upper-cases the first letter after the
# dot before calling this sub, so names arriving from there (for example
# "dcterms.Abstract") do not match.  Confirm whether that is intended.
sub remap_dcterms_metadata {
    my ($self, $metaname) = @_;

    # Target element => the dcterms refinements that map onto it.
    my %refinements_of = (
        'dc.title'       => [qw(alternative)],
        'dc.description' => [qw(tableOfContents abstract)],
        'dc.date'        => [qw(created valid available issued modified
                                dateAccepted dateCopyrighted dateSubmitted)],
        'dc.format'      => [qw(extent medium)],
        'dc.relation'    => [qw(isVersionOf hasVersion isReplacedBy replaces
                                isRequiredBy requires isPartOf hasPart
                                isReferencedBy references isFormatOf hasFormat
                                conformsTo)],
        'dc.coverage'    => [qw(spatial temporal)],
        'dc.any'         => [qw(audience accrualMethod accrualPeriodicity
                                accrualPolicy instructionalMethod provenance
                                rightsHolder)],
        'audience'       => [qw(mediator educationLevel)],
        'dc.rights'      => [qw(accessRights license)],
        'dc.identifier'  => [qw(bibliographicCitation)],
    );

    # Invert into refinement => target for direct lookup.
    my %dcterm_mapping;
    foreach my $target (keys %refinements_of) {
        $dcterm_mapping{$_} = $target foreach @{ $refinements_of{$target} };
    }

    my ($prefix, $name) = ($metaname =~ m/^(.*?)\.(.*?)$/);

    if ($prefix eq "dcterms" && defined $dcterm_mapping{$name}) {
        return $dcterm_mapping{$name} . "^" . $name;
    }

    return $metaname; # didn't get a match, return param passed in unchanged
}
529
530
# Pull metadata elements out of the <metadata>...</metadata> text behind
# $textref.  Each element found is stored in the $metadata hash (name =>
# array ref of values) and added as a row of the pretty-print table.
#
# Only Dublin Core style metadata is handled: a warning is printed to the
# output handle when the wrapper element's prefix does not end in "dc".
#
# Parameters (after $self): $textref (scalar ref), $metadata (hash ref).
sub extract_oai_metadata {
    my ($self, $textref, $metadata) = @_;
    my $outhandle = $self->{'outhandle'};

    $self->open_prettyprint_metadata_table();

    if (${$textref} =~ m/<metadata\s*>(.*?)<\/metadata\s*>/s) {
        my $metadata_text = $1;

        # locate and remove outermost tag (ignoring any attribute information in top-level tag)
        my ($wrapper_metadata_xml, $inner_metadata_text)
            = ($metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>/s);

        # split tag into namespace and tag name
        my ($namespace, $top_level_prefix)
            = ($wrapper_metadata_xml =~ m/^(.*?):(.*?)$/);

        # Sometimes the dc namespace is not given as a prefix on each
        # element (like <dc:title>) but is implied by the wrapper element,
        # as in <dc><title></title><creator></creator></dc>.  In that case
        # the wrapper element name itself is used as the prefix.
        if (!defined $top_level_prefix
            && defined $wrapper_metadata_xml
            && $wrapper_metadata_xml =~ m/dc$/) {
            $top_level_prefix = $wrapper_metadata_xml;
        }

        if ($top_level_prefix !~ m/dc$/) {
            print {$outhandle}
                "Warning: OAIPlugin currently only designed for Dublin Core (or variant) metadata\n",
                " This recorded metadata section '$top_level_prefix' does not appear to match.\n",
                " Metadata assumed to be in form: <prefix:tag>value</prefix:tag> and will be converted\n",
                " into Greenstone metadata as prefix.tag = value\n";
        }

        # Consume one <tag>value</tag> element per iteration; whatever
        # follows it becomes the text for the next iteration.
        while ($inner_metadata_text =~ m/<([^ >]+).*?>(.*?)<\/\1>(.*)/s) {
            my ($metaname, $metavalue, $remainder) = ($1, $2, $3);
            $inner_metadata_text = $remainder;

            # prefix:tag -> prefix.tag; unprefixed tags take the wrapper's prefix
            $metaname =~ s/:/\./;
            if ($metaname !~ m/\./) {
                $metaname = "$top_level_prefix.$metaname";
            }

            # upper-case the first character after the dot
            $metaname =~ s/\.(.)/\.\u$1/;

            $metaname = $self->remap_dcterms_metadata($metaname);

            $metavalue =~ s/\[/&#91;/g;
            $metavalue =~ s/\]/&#93;/g;

            # Values accumulate per name; the array is created on first use.
            push @{ $metadata->{$metaname} }, $metavalue;

            $self->add_prettyprint_metadata_line($metaname, $metavalue);
        }
    }

    $self->close_prettyprint_metadata_table();
}
615
# The file extension already identifies an OAI record, so the DOCTYPE is
# not inspected: every document is accepted.
sub check_doctype {
    return 1;
}
621
6221;
Note: See TracBrowser for help on using the repository browser.