source: gs2-extensions/parallel-building/trunk/src/perllib/plugins/OAIMetadataXMLPlugin.pm@ 24626

Last change on this file since 24626 was 24626, checked in by jmt12, 13 years ago

An (almost) complete copy of the perllib directory from a (circa SEP2011) head checkout from Greenstone 2 trunk - in order to try and make merging in this extension a little easier later on (as there have been some major changes to buildcol.pl committed in the main trunk but not in the x64 branch)

File size: 9.0 KB
Line 
1###########################################################################
2#
3# OAIMetadataXMLPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2010 DL Consulting Ltd
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# OAIMetadataXMLPlugin is a child of MetadataXMLPlugin
27# It processes the metadata.xml file just like MetadataXMLPlugin.
28# Additionally, it uses the "dc.Identifier" field and extracts OAI metadata from the specified OAI server (-oai_server_http_path)
29
30package OAIMetadataXMLPlugin;
31
32use strict;
33no strict 'refs';
34
35use MetadataXMLPlugin;
36
# Compile-time setup: declares MetadataXMLPlugin as the parent class and makes
# sure Greenstone's bundled CPAN directory ($GSDLHOME/perllib/cpan) is on @INC.
sub BEGIN {
    @OAIMetadataXMLPlugin::ISA = ('MetadataXMLPlugin');

    # ensure Greenstone's CPAN is on the path (but check first) [jmt12]
    # Without GSDLHOME the path would degrade to "/perllib/cpan", a directory
    # under the filesystem root that has nothing to do with Greenstone, so
    # nothing is added in that case.
    if (defined($ENV{'GSDLHOME'}) && $ENV{'GSDLHOME'} ne '')
    {
        my $gsdl_perllib_path = $ENV{'GSDLHOME'} . '/perllib/cpan';

        # Compare entries as plain strings: interpolating the path into a
        # regex breaks on metacharacters (e.g. Windows backslashes, '+', '(')
        # and would also match longer paths that merely contain it.
        # @INC may hold code refs (hooks); those are never equal to a path.
        my $already_present = grep { !ref($_) && $_ eq $gsdl_perllib_path } @INC;
        if (!$already_present)
        {
            unshift (@INC, $gsdl_perllib_path);
        }
    }
}
48
# Plugin-specific command line arguments. The 'desc' values are placeholder
# keys in "{Plugin.option}" form rather than literal descriptions.
my $arguments = [
    {
        'name' => 'oai_server_http_path',
        'desc' => '{OAIMetadataXMLPlugin.oai_server_http_path}',
        'type' => 'string',
        'deft' => '',
    },
    {
        'name' => 'metadata_prefix',
        'desc' => '{OAIMetadataXMLPlugin.metadata_prefix}',
        'type' => 'string',
        'deft' => 'oai_dc',
    },
    # When the koha_mode flag is given, the plugin tries to generate the
    # oaiextracted.koharecordlink metadata, which holds the link back to the
    # Koha document.
    {
        'name' => 'koha_mode',
        'desc' => '{OAIMetadataXMLPlugin.koha_mode}',
        'type' => 'flag',
        'reqd' => 'no',
    },
];

# Plugin description record; 'args' points at the argument list above.
my $options = {
    'name'     => 'OAIMetadataXMLPlugin',
    'desc'     => '{OAIMetadataXMLPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
73
74
# Constructor.
#
# Arguments:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to the parent constructor
#   $hashArgOptLists - hash ref; this plugin's argument definitions are
#                      appended to its "ArgList" entry and its option record
#                      to its "OptList" entry
#
# Returns the object built by MetadataXMLPlugin's constructor, re-blessed
# into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Explicit class-method call: the indirect object form
    # ("new MetadataXMLPlugin(...)") is parsed heuristically and is ambiguous
    # inside a package that defines its own new().
    my $self = MetadataXMLPlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
88
89
# Reads metadata.xml through the parent plugin, then augments every file set
# that has a non-empty dc.Identifier with metadata fetched from the OAI
# server configured via -oai_server_http_path.
#
# All arguments are forwarded unchanged to the parent's metadata_read. This
# method itself only uses:
#   $extrametakeys - array ref of file-set keys to consider
#   $extrametadata - hash ref, file-set key => { field name => [ values ] };
#                    harvested values are appended in place under
#                    "oaiextracted.<lower-cased tag name>"
#
# Returns early with an empty return if LWP cannot be loaded or the OAI
# server does not answer the initial connectivity request.
# Dies if a GetRecord request fails after the connectivity check succeeded.
sub metadata_read
{
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $extrametakeys, $extrametadata, $extrametafile, $processor, $gli, $aux) = @_;

    # Read in the normal metadata.xml file
    # NOTE(review): the parent's return value is discarded, so the harvesting
    # below runs whatever the parent reported - confirm this is intended.
    $self->SUPER::metadata_read(@_);

    my $outhandle       = $self->{'outhandle'};
    my $server_url      = $self->{"oai_server_http_path"};
    my $metadata_prefix = $self->{"metadata_prefix"};
    my $koha_mode       = (defined($self->{"koha_mode"}) && $self->{"koha_mode"} == 1);

    #======================================================================#
    # Checks to make sure the OAI-PMH server is connectable [START]
    #======================================================================#
    print $outhandle "OAIMetadataXMLPlugin: Checking OAI server (" . $server_url . ") connection\n" if ($self->{'verbosity'} > 1);

    # LWP should always be available with a full Perl install, but
    # Greenstone's cut-down Perl does not include it. Load the class that is
    # actually used below rather than relying on LWP.pm to pull it in.
    eval { require LWP::UserAgent };
    if ($@)
    {
        print STDERR "Error: Failed to load Perl module LWP: $@\n";
        return;
    }

    my $browser = LWP::UserAgent->new;
    my $probe_response = $browser->get($server_url);

    # Do not go further if the OAI server is not accessible
    if (!$probe_response->is_success)
    {
        print $outhandle "OAIMetadataXMLPlugin: Error! OAI server (" . $server_url . ") unavailable\n";
        return;
    }
    #======================================================================#
    # Checks to make sure the OAI-PMH server is connectable [END]
    #======================================================================#

    # Percent-encodes a value for use inside the query string. Characters
    # that commonly occur in OAI identifiers (':', '/', '@') are left alone so
    # typical requests stay byte-identical; everything else outside the
    # unreserved set (e.g. '&', '#', '+', '%', spaces, non-ASCII) is escaped.
    my $escape = sub {
        my ($text) = @_;
        # Work on UTF-8 octets so every escaped unit is a single byte.
        utf8::encode($text) if (utf8::is_utf8($text));
        $text =~ s/([^A-Za-z0-9\-._~:\/@])/sprintf("%%%02X", ord($1))/ge;
        return $text;
    };

    # Pattern for the generic "<prefix:prefix ...> ... </prefix:prefix>"
    # metadata container. The prefix is quoted so that regex metacharacters
    # in a user-supplied -metadata_prefix are matched literally.
    my $quoted_prefix = quotemeta($metadata_prefix);
    my $generic_set_re = qr/<$quoted_prefix:$quoted_prefix[^>]+>(.*?)<\/$quoted_prefix:$quoted_prefix>/s;

    #======================================================================#
    # Process each fileset [START]
    #======================================================================#
    foreach my $one_file (@{$extrametakeys})
    {
        # Look the identifier up step by step: dereferencing the whole chain
        # in one expression would autovivify an empty "dc.Identifier" entry
        # for every file set that does not have one.
        my $file_metadata = $extrametadata->{$one_file};
        next unless (ref($file_metadata) eq 'HASH');
        my $identifiers = $file_metadata->{"dc.Identifier"};
        next unless (ref($identifiers) eq 'ARRAY');

        # Don't harvest file sets that don't have dc.Identifier set;
        # "dc.Identifier" is used as the key between Greenstone and the OAI
        # server, so it has to equal the OAI record identifier.
        my $oai_identifier = $identifiers->[0];
        next if (!defined($oai_identifier) || $oai_identifier eq "");

        # Now, let's get the OAI metadata
        my $request = $server_url . "?verb=GetRecord&identifier=" . $escape->($oai_identifier) . "&metadataPrefix=" . $escape->($metadata_prefix);
        print $outhandle "OAIMetadataXMLPlugin: OAI Harvesting Request (" . $request . ")\n";
        my $record_response = $browser->get($request);
        die "OAIMetadataXMLPlugin: This should never be happening - \"get\" should always be successful unless the OAI server was temporary down (some kind of race condition)\n" unless ($record_response->is_success);
        my $response_content = $record_response->content();

        # Check to make sure there is no error in the OAI response
        if ($response_content =~ /<error\scode=["']([^"']+)["']>([^<]*)<\/error>/)
        {
            my ($error_code, $error_message) = ($1, $2);
            print $outhandle "OAIMetadataXMLPlugin: Failed to retrive OAI record (" . $oai_identifier . "). ErrorCode: [$error_code] ErrorMessage: [$error_message], skip.\n";
            next;
        }
        print $outhandle "OAIMetadataXMLPlugin: OAI record (" . $oai_identifier . ") found.\n";

        # Get the oai metadata (this will need extending to support future
        # metadataPrefix values)
        my $oai_content = undef;

        # Special note for the Koha OAI server: at the time of writing its
        # OAI-PMH output named the metadata container oaidc:dc instead of
        # oai_dc:dc (which doesn't match the metadataPrefix), so both
        # spellings are accepted for the oai_dc prefix.
        if ($metadata_prefix eq "oai_dc" && $response_content =~ /<oai_?dc:dc[^>]+>(.*?)<\/oai_?dc:dc>/s)
        {
            $oai_content = $1;
        }
        elsif ($response_content =~ $generic_set_re)
        {
            $oai_content = $1;
        }
        else
        {
            print $outhandle "OAIMetadataXMLPlugin: Failed to match " . $metadata_prefix . ":" . $metadata_prefix . " metadata set, skip\n " . $response_content . "\n";
            next;
        }

        # Get each metadata field and value. Only the element name is
        # captured; any attributes on the opening tag (e.g. xml:lang) are
        # skipped so they do not leak into the metadata field name.
        while ($oai_content =~ /<([^\s>]+)(?:\s[^>]*)?>([^<]+)<\/[^>]+>/g)
        {
            # Copy the captures immediately: later matches would clobber them.
            my ($tag_name, $value) = ($1, $2);
            my $field_name = "oaiextracted." . lc($tag_name);

            # Special hack for Koha data from Nitesh: some of their data
            # contain " \" as the value. Values without a single word
            # character are treated as empty and ignored.
            if ($value =~ /^[^\w]*$/)
            {
                print STDERR "Ignore value:[" . $value . "]\n";
                next;
            }

            # Special case for identifier
            # NOTE(review): this compares against the bare name "identifier";
            # a namespaced tag such as "dc:identifier" will not match -
            # confirm against what the Koha server really emits.
            if ($koha_mode && $tag_name eq "identifier" && $value =~ /https?:\/\//)
            {
                $field_name = "oaiextracted.koharecordlink";

                # The Koha OAI server was still pointing to the old interface;
                # this rewrite may need revisiting once Koha updates it.
                $value =~ s{/opac/opac-detail\.pl\?bib=}{/catalogue/detail.pl?biblionumber=};
            }

            $file_metadata->{$field_name} = [] if (!defined($file_metadata->{$field_name}));
            push(@{$file_metadata->{$field_name}}, $value);
        }
    }
    #======================================================================#
    # Process each fileset [END]
    #======================================================================#
}
216
2171;
Note: See TracBrowser for help on using the repository browser.