source: main/trunk/greenstone2/perllib/plugins/OAIMetadataXMLPlugin.pm@ 22705

Last change on this file since 22705 was 22232, checked in by mdewsnip, 14 years ago

New OAIMetadataXMLPlugin.pm for extracting information from OAI servers where metadata.xml files specify what to download from the OAI server. Done for Koha integration, but may be useful for other OAI servers. By Jeffrey Ke at DL Consulting Ltd.

File size: 8.8 KB
Line 
1###########################################################################
2#
3# OAIMetadataXMLPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2010 DL Consulting Ltd
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# OAIMetadataXMLPlugin is a child of MetadataXMLPlugin
27# It processes the metadata.xml file just like MetadataXMLPlugin.
28# Additionally, it uses the "dc.Identifier" field and extracts OAI metadata from the specified OAI server (-oai_server_http_path)
29
30package OAIMetadataXMLPlugin;
31
32use strict;
33no strict 'refs';
34
35use MetadataXMLPlugin;
36
# Compile-time setup for the plugin package:
#   * puts Greenstone's bundled CPAN directory at the front of the module
#     search path, and
#   * declares MetadataXMLPlugin as the parent class.
BEGIN {
    unshift(@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
    @OAIMetadataXMLPlugin::ISA = ('MetadataXMLPlugin');
}
41
# Options specific to this plugin (appended to the inherited argument list
# by the constructor):
#   oai_server_http_path - base URL of the OAI server that metadata_read
#                          sends its requests to (empty by default).
#   metadata_prefix      - value sent as the metadataPrefix request
#                          parameter (defaults to "oai_dc").
#   koha_mode            - flag; when set, metadata_read stores http(s)
#                          "identifier" values as oaiextracted.koharecordlink,
#                          i.e. the link back to the Koha document.
my $arguments = [
    {
        'name' => 'oai_server_http_path',
        'desc' => '{OAIMetadataXMLPlugin.oai_server_http_path}',
        'type' => 'string',
        'deft' => '',
    },
    {
        'name' => 'metadata_prefix',
        'desc' => '{OAIMetadataXMLPlugin.metadata_prefix}',
        'type' => 'string',
        'deft' => 'oai_dc',
    },
    {
        'name' => 'koha_mode',
        'desc' => '{OAIMetadataXMLPlugin.koha_mode}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Plugin description record; 'args' points at the option table above.
my $options = {
    'name'     => 'OAIMetadataXMLPlugin',
    'desc'     => '{OAIMetadataXMLPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
66
67
# new --
# Constructor. Registers this class in the plugin list, appends this plugin's
# argument and option tables to the shared ArgList/OptList, lets
# MetadataXMLPlugin build the object, and re-blesses it into $class.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref; $class is pushed onto it
#   $inputargs       - passed through unchanged to the parent constructor
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays are
#                      extended with this plugin's $arguments and $options
#
# Returns the object produced by MetadataXMLPlugin->new, blessed into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit method-call syntax: the indirect form "new MetadataXMLPlugin(...)"
    # is ambiguous to the parser in a package that defines its own new().
    my $self = MetadataXMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
81
82
# metadata_read --
# Lets MetadataXMLPlugin read the metadata.xml file, then enriches the
# collected metadata with fields harvested from the OAI server given by
# -oai_server_http_path.
#
# For every entry of $extrametakeys whose metadata has a non-empty first
# "dc.Identifier" value, a GetRecord request is sent with that value as the
# OAI identifier. Each "<element>text</element>" pair found inside the
# returned metadata set is appended to $extrametadata as
# "oaiextracted.<lowercased element name>". With -koha_mode, http(s)
# "identifier" values are stored as "oaiextracted.koharecordlink" instead.
#
# Parameters:
#   $extrametakeys - array ref of keys into $extrametadata
#   $extrametadata - hash ref: key -> { field name -> array ref of values };
#                    modified in place
#   The remaining parameters are only forwarded to the parent's
#   metadata_read and are not used here.
#
# Returns early (bare return) when LWP cannot be loaded or the OAI server
# does not answer the initial connectivity check. Dies when a GetRecord
# request fails after that check succeeded.
#
# NOTE(review): the parent's return value is discarded and there is no
# explicit return after the harvesting loop, exactly as before - confirm what
# the caller expects before changing either.
# NOTE(review): the identifier is placed in the request URL without
# URI-escaping; identifiers containing characters such as '&' or '#' would
# produce a malformed request - confirm against real identifiers.
sub metadata_read
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $extrametakeys, $extrametadata, $extrametafile, $processor, $maxdocs, $gli) = @_;

    # Read in the normal metadata.xml file
    $self->SUPER::metadata_read(@_);

    my $outhandle = $self->{'outhandle'};
    my $server    = $self->{"oai_server_http_path"};
    my $prefix    = $self->{"metadata_prefix"};

    #======================================================================#
    # Checks to make sure the OAI-PMH server is connectable [START]
    #======================================================================#
    print $outhandle "OAIMetadataXMLPlugin: Checking OAI server (" . $server . ") connection\n" if ($self->{'verbosity'} > 1);

    # LWP should always be available with a full Perl installation, but
    # Greenstone's cut-down Perl does not include it.
    eval { require LWP };
    if ($@)
    {
        print STDERR "Error: Failed to load Perl module LWP: $@\n";
        return;
    }

    my $browser = LWP::UserAgent->new;
    my $response = $browser->get($server);

    # Do not go further if the OAI server is not accessible
    if (!$response->is_success)
    {
        print $outhandle "OAIMetadataXMLPlugin: Error! OAI server (" . $server . ") unavailable\n";
        return;
    }
    #======================================================================#
    # Checks to make sure the OAI-PMH server is connectable [END]
    #======================================================================#

    # Fallback pattern for the metadata set element
    # "<prefix:prefix ...>...</prefix:prefix>". The prefix is quoted so that
    # any regex metacharacters in the option value are matched literally.
    my $set_tag = quotemeta($prefix) . ":" . quotemeta($prefix);
    my $generic_set_re = qr/<${set_tag}[^>]+>(.*?)<\/${set_tag}>/s;

    #======================================================================#
    # Process each fileset [START]
    #======================================================================#
    foreach my $one_file (@{$extrametakeys})
    {
        # Don't harvest file sets that don't have dc.Identifier set:
        # "dc.Identifier" is used as the key between Greenstone and the OAI
        # server. The lookup is done step by step so that entries without
        # the field are not autovivified with an empty dc.Identifier list.
        my $file_metadata = $extrametadata->{$one_file};
        next if (!defined($file_metadata));
        my $identifiers = $file_metadata->{"dc.Identifier"};
        next if (!defined($identifiers));

        # The dc.Identifier has to be the same as the OAI record identifier
        my $oai_identifier = $identifiers->[0];
        next if (!defined($oai_identifier) || $oai_identifier eq "");

        # Now, let's get the OAI metadata
        my $request = $server . "?verb=GetRecord&identifier=" . $oai_identifier . "&metadataPrefix=" . $prefix;
        print $outhandle "OAIMetadataXMLPlugin: OAI Harvesting Request (" . $request . ")\n";
        my $record_response = $browser->get($request);
        die "OAIMetadataXMLPlugin: This should never be happening - \"get\" should always be successful unless the OAI server was temporary down (some kind of race condition)\n" unless ($record_response->is_success);
        my $response_content = $record_response->content();

        # Check to make sure there is no error in the OAI response
        if ($response_content =~ m{<error\scode=["']([^"']+)["']>([^<]*)</error>})
        {
            my ($error_code, $error_message) = ($1, $2);
            print $outhandle "OAIMetadataXMLPlugin: Failed to retrive OAI record (" . $oai_identifier . "). ErrorCode: [$error_code] ErrorMessage: [$error_message], skip.\n";
            next;
        }
        print $outhandle "OAIMetadataXMLPlugin: OAI record (" . $oai_identifier . ") found.\n";

        # Get the oai metadata (this will need extending to support future
        # metadataPrefix values).
        #
        # Special note for the Koha OAI server: at the time of writing it
        # incorrectly emitted the metadata set as oaidc:dc instead of
        # oai_dc:dc, so the underscore is optional in the first pattern.
        my $oai_content = undef;
        if ($prefix eq "oai_dc" && $response_content =~ m{<oai_?dc:dc[^>]+>(.*?)</oai_?dc:dc>}s)
        {
            $oai_content = $1;
        }
        elsif ($response_content =~ $generic_set_re)
        {
            $oai_content = $1;
        }
        else
        {
            print $outhandle "OAIMetadataXMLPlugin: Failed to match " . $prefix . ":" . $prefix . " metadata set, skip\n " . $response_content . "\n";
            next;
        }

        # Get each metadata field and value. Only the element name is
        # captured; any XML attributes on the start tag are skipped so they
        # cannot leak into the generated metadata field name.
        while ($oai_content =~ m{<([^\s>]+)[^>]*>([^<]+)</[^>]+>}g)
        {
            # Copy the captures right away: the regex operations below must
            # not be relied upon to leave $1/$2 untouched.
            my ($element, $value) = ($1, $2);
            my $field_name = "oaiextracted." . lc($element);

            # Special hack for Koha data: some records contain values such
            # as " \". Ignore values without any word character.
            if ($value =~ /^[^\w]*$/)
            {
                print STDERR "Ignore value:[" . $value . "]\n";
                next;
            }

            # Special case for identifier
            # NOTE(review): this only matches an element literally named
            # "identifier" (not e.g. "dc:identifier") - confirm against the
            # Koha server output.
            if ($self->{"koha_mode"} == 1 && $element eq "identifier" && $value =~ m{https?://})
            {
                $field_name = "oaiextracted.koharecordlink";

                # The Koha OAI server still pointed to the old interface;
                # this may need changing once the Koha OAI server is updated.
                $value =~ s{/opac/opac-detail\.pl\?bib=}{/catalogue/detail.pl?biblionumber=};
            }

            $file_metadata->{$field_name} = [] if (!defined($file_metadata->{$field_name}));
            push(@{$file_metadata->{$field_name}}, $value);
        }
    }
    #======================================================================#
    # Process each fileset [END]
    #======================================================================#
}
209
2101;
Note: See TracBrowser for help on using the repository browser.