source: main/trunk/greenstone2/perllib/plugins/OAIMetadataXMLPlugin.pm@ 23212

Last change on this file since 23212 was 23212, checked in by kjdon, 14 years ago

metadata_read no longer takes maxdocs args - metadata_read must process all docs, so that whatever few are actually processed by read will get their metadata

File size: 8.8 KB
Line 
1###########################################################################
2#
3# OAIMetadataXMLPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2010 DL Consulting Ltd
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# OAIMetadataXMLPlugin is a child of MetadataXMLPlugin
27# It processes the metadata.xml file just like MetadataXMLPlugin.
28# Additionally, it uses the "dc.Identifier" field and extracts OAI metadata from the specified OAI server (-oai_server_http_path)
29
30package OAIMetadataXMLPlugin;
31
32use strict;
33no strict 'refs';
34
35use MetadataXMLPlugin;
36
# Compile-time setup for this package:
#   * put the perllib/cpan directory under $GSDLHOME at the front of the
#     module search path, and
#   * register MetadataXMLPlugin as the parent class.
BEGIN
{
    my $cpan_dir = "$ENV{'GSDLHOME'}/perllib/cpan";
    unshift(@INC, $cpan_dir);

    @OAIMetadataXMLPlugin::ISA = ('MetadataXMLPlugin');
}
41
# Command-line arguments declared by this plugin.
my $arguments = [
    {
        'name' => 'oai_server_http_path',
        'desc' => '{OAIMetadataXMLPlugin.oai_server_http_path}',
        'type' => 'string',
        'deft' => '',
    },
    {
        'name' => 'metadata_prefix',
        'desc' => '{OAIMetadataXMLPlugin.metadata_prefix}',
        'type' => 'string',
        'deft' => 'oai_dc',
    },
    # When the koha_mode flag is given, the plugin tries to generate the
    # oaiextracted.koharecordlink metadata, which holds the link back to
    # the Koha document.
    {
        'name' => 'koha_mode',
        'desc' => '{OAIMetadataXMLPlugin.koha_mode}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Plugin description record; it carries the argument list declared above.
my $options = {
    'name'     => 'OAIMetadataXMLPlugin',
    'desc'     => '{OAIMetadataXMLPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
66
67
# Constructor.
#
# Arguments:
#   $class           - class name the new object is blessed into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through untouched to the parent constructor
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object built by MetadataXMLPlugin's constructor, re-blessed
# into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Call the parent constructor with an explicit method call.  The
    # indirect-object form (`new MetadataXMLPlugin(...)`) is resolved by
    # parser heuristics and is ambiguous in a package that defines its own
    # `new`.
    my $self = MetadataXMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
81
82
# Reads metadata.xml through the parent class, then augments every file set
# that carries a "dc.Identifier" value with metadata harvested from the OAI
# server configured in -oai_server_http_path.
#
# All arguments after $self are passed through unchanged to
# SUPER::metadata_read.  Only two are used directly here:
#   $extrametakeys - array ref of file-set keys to consider
#   $extrametadata - hash ref: file-set key => { field name => [values] };
#                    harvested values are appended under "oaiextracted.*"
#
# Uses a bare return when LWP cannot be loaded or the OAI server cannot be
# reached.  Dies if a GetRecord request fails after the initial connectivity
# check succeeded.
#
# NOTE(review): the result of SUPER::metadata_read is discarded and the sub
# falls off the end of the foreach loop, so its value on the success path is
# unspecified - confirm what callers expect before relying on it.
sub metadata_read
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $extrametakeys, $extrametadata, $extrametafile, $processor, $gli, $aux) = @_;

    # Read in the normal metadata.xml file
    $self->SUPER::metadata_read(@_);

    my $outhandle       = $self->{'outhandle'};
    my $oai_server      = $self->{"oai_server_http_path"};
    my $metadata_prefix = $self->{"metadata_prefix"};

    #======================================================================#
    # Checks to make sure the OAI-PMH server is connectable [START]
    #======================================================================#
    print $outhandle "OAIMetadataXMLPlugin: Checking OAI server (" . $oai_server . ") connection\n" if ($self->{'verbosity'}) > 1;

    # LWP is normally available with any full Perl installation, but
    # Greenstone's cut-down Perl does not include it, so load it lazily and
    # give up gracefully when it is missing.
    eval { require LWP };
    if ($@)
    {
        print STDERR "Error: Failed to load Perl module LWP: $@\n";
        return;
    }

    my $browser  = LWP::UserAgent->new;
    my $response = $browser->get($oai_server);

    # Do not go further if the OAI server is not accessible
    if (!$response->is_success)
    {
        print $outhandle "OAIMetadataXMLPlugin: Error! OAI server (" . $oai_server . ") unavailable\n";
        return;
    }
    #======================================================================#
    # Checks to make sure the OAI-PMH server is connectable [END]
    #======================================================================#

    # Pattern for the generic "<prefix:prefix ...> ... </prefix:prefix>"
    # wrapper.  It does not change per record, so build it once.  The prefix
    # is quoted so regex metacharacters in it are matched literally.
    my $quoted_tag   = quotemeta($metadata_prefix . ":" . $metadata_prefix);
    my $wrapper_match = '<' . $quoted_tag . '[^>]+>(.*?)</' . $quoted_tag . '>';

    #======================================================================#
    # Process each fileset [START]
    #======================================================================#
    foreach my $one_file (@{$extrametakeys})
    {
        # "dc.Identifier" is the key between Greenstone and the OAI server;
        # file sets without one are not harvested.  The lookup is done one
        # level at a time so that merely checking does not autovivify an
        # empty "dc.Identifier" entry in the shared $extrametadata.
        my $file_metadata = $extrametadata->{$one_file};
        next unless (ref($file_metadata) eq 'HASH');
        my $identifiers = $file_metadata->{"dc.Identifier"};
        next unless (ref($identifiers) eq 'ARRAY');

        # The dc.Identifier has to be the same as the OAI record identifier
        my $oai_identifier = $identifiers->[0];
        next if (!defined($oai_identifier) || $oai_identifier eq "");

        # Build the GetRecord request.  The identifier is percent-encoded so
        # characters such as "&", "#", "+" or "%" cannot corrupt the query.
        my $request = $oai_server . "?verb=GetRecord&identifier=" . _oai_escape_query_value($oai_identifier) . "&metadataPrefix=" . $metadata_prefix;
        print $outhandle "OAIMetadataXMLPlugin: OAI Harvesting Request (" . $request . ")\n";
        $response = $browser->get($request);
        die "OAIMetadataXMLPlugin: This should never be happening - \"get\" should always be successful unless the OAI server was temporary down (some kind of race condition)\n" unless ($response->is_success);
        my $response_content = $response->content();

        # Check to make sure there is no error in the OAI response
        if ($response_content =~ m{<error\scode=["']([^"']+)["']>([^<]*)</error>})
        {
            print $outhandle "OAIMetadataXMLPlugin: Failed to retrive OAI record (" . $oai_identifier . "). ErrorCode: [$1] ErrorMessage: [$2], skip.\n";
            next;
        }
        print $outhandle "OAIMetadataXMLPlugin: OAI record (" . $oai_identifier . ") found.\n";

        # Get the oai metadata (this will need extending to support future
        # metadataPrefix values)
        my $oai_content = undef;

        # Special note for the Koha OAI server: at the time of writing it
        # emitted the metadata set as oaidc:dc instead of oai_dc:dc (which
        # does not match the metadataPrefix), so both spellings are accepted.
        if ($metadata_prefix eq "oai_dc" && $response_content =~ m{<oai_?dc:dc[^>]+>(.*?)</oai_?dc:dc>}s)
        {
            $oai_content = $1;
        }
        elsif ($response_content =~ /$wrapper_match/s)
        {
            $oai_content = $1;
        }
        else
        {
            print $outhandle "OAIMetadataXMLPlugin: Failed to match " . $metadata_prefix . ":" . $metadata_prefix . " metadata set, skip\n " . $response_content . "\n";
            next;
        }

        # Get each metadata field and value
        while ($oai_content =~ m{<([^>]+)>([^<]+)</[^>]+>}g)
        {
            # Copy the captures straight away so later pattern matches
            # cannot interfere with them.
            my ($element, $value) = ($1, $2);
            my $field_name = "oaiextracted." . lc($element);

            # Special hack for Koha data: some values consist only of
            # non-word characters (e.g. " \").  Treat those as empty and
            # ignore them.
            if ($value =~ /^[^\w]*$/)
            {
                print STDERR "Ignore value:[" . $value . "]\n";
                next;
            }

            # Special case for identifier
            if ($self->{"koha_mode"} == 1 && $element eq "identifier" && $value =~ m{https?://})
            {
                $field_name = "oaiextracted.koharecordlink";

                # The Koha OAI server still points at the old OPAC
                # interface; rewrite the link to the catalogue detail page.
                # This may need revisiting once the Koha OAI server is
                # updated.
                $value =~ s/\/opac\/opac\-detail\.pl\?bib\=/\/catalogue\/detail\.pl\?biblionumber\=/;
            }

            # push autovivifies the value list when the field is new
            push(@{$file_metadata->{$field_name}}, $value);
        }
    }
    #======================================================================#
    # Process each fileset [END]
    #======================================================================#
}

# Percent-encodes a string for use as a query-string value: every octet
# other than the unreserved characters A-Z a-z 0-9 - _ . ~ becomes %XX.
# Returns the empty string for undef.
sub _oai_escape_query_value
{
    my ($text) = @_;
    return "" unless defined($text);

    # Encode character strings to UTF-8 octets first so that characters
    # above 0xFF are escaped byte by byte.
    utf8::encode($text) if utf8::is_utf8($text);
    $text =~ s/([^A-Za-z0-9\-_.~])/sprintf("%%%02X", ord($1))/ge;

    return $text;
}
209
2101;
Note: See TracBrowser for help on using the repository browser.