source: main/trunk/greenstone2/perllib/plugins/OAIMetadataXMLPlugin.pm@ 32341

Last change on this file since 32341 was 24951, checked in by ak19, 12 years ago

All perlcode that accesses extrametakeys, extrametadata, extrametafile data structures has been moved into a new perl module called extrametautil.pm. The next step will be to ensure that the file_regexes used to index into these data structures are consistent (using consistent slashes, like URL style slashes).

File size: 9.0 KB
Line 
1###########################################################################
2#
3# OAIMetadataXMLPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2010 DL Consulting Ltd
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# OAIMetadataXMLPlugin is a child of MetadataXMLPlugin
27# It processes the metadata.xml file just like MetadataXMLPlugin.
28# Additionally, it uses the "dc.Identifier" field and extracts OAI metadata from the specified OAI server (-oai_server_http_path)
29
30package OAIMetadataXMLPlugin;
31
32use strict;
33no strict 'refs';
34
35use extrametautil;
36use MetadataXMLPlugin;
37
38sub BEGIN {
39 @OAIMetadataXMLPlugin::ISA = ('MetadataXMLPlugin');
40 unshift (@INC, "$ENV{'GSDLHOME'}/perllib/cpan");
41}
42
43my $arguments = [
44 { 'name' => "oai_server_http_path",
45 'desc' => "{OAIMetadataXMLPlugin.oai_server_http_path}",
46 'type' => "string",
47 'deft' => "" },
48
49 { 'name' => "metadata_prefix",
50 'desc' => "{OAIMetadataXMLPlugin.metadata_prefix}",
51 'type' => "string",
52 'deft' => "oai_dc" },
53
54 # If koha_mode flag is specified, the plugin will try to generate the oaiextracted.koharecordlink metadata
55 # This metadata contains the link back to Koha document
56 { 'name' => "koha_mode",
57 'desc' => "{OAIMetadataXMLPlugin.koha_mode}",
58 'type' => "flag",
59 'reqd' => "no" },
60 ];
61
62my $options = { 'name' => "OAIMetadataXMLPlugin",
63 'desc' => "{OAIMetadataXMLPlugin.desc}",
64 'abstract' => "no",
65 'inherits' => "yes",
66 'args' => $arguments };
67
68
69sub new
70{
71 my ($class) = shift (@_);
72 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
73 push(@$pluginlist, $class);
74
75 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
76 push(@{$hashArgOptLists->{"OptList"}},$options);
77
78 my $self = new MetadataXMLPlugin($pluginlist, $inputargs, $hashArgOptLists);
79
80 return bless $self, $class;
81}
82
83
84sub metadata_read
85{
86 my $self = shift (@_);
87 my ($pluginfo, $base_dir, $file, $block_hash, $extrametakeys, $extrametadata,$extrametafile, $processor, $gli, $aux) = @_;
88
89 # Read in the normal metadata.xml file
90 $self->SUPER::metadata_read(@_);
91
92 my $outhandle = $self->{'outhandle'};
93
94 #======================================================================#
95 # Checks to make sure the OAI-PMH server is connectable [START]
96 #======================================================================#
97 print $outhandle "OAIMetadataXMLPlugin: Checking OAI server (" . $self->{"oai_server_http_path"} . ") connection\n" if ($self->{'verbosity'})> 1;
98
99 # Checks to make sure LWP (5.64) is available, it should always be available if you have Perl installed
100 # However if you are using the Greenstone's cut-down version of Perl, this LWP module will not be included
101 eval { require LWP };
102 if ($@)
103 {
104 print STDERR "Error: Failed to load Perl module LWP: $@\n";
105 return;
106 }
107
108 # Create the LWP module
109 my $browser = LWP::UserAgent->new;
110 my $response = $browser->get($self->{"oai_server_http_path"});
111
112 # Do not go further if the OAI server is not accessible
113 if (!$response->is_success)
114 {
115 print $outhandle "OAIMetadataXMLPlugin: Error! OAI server (" . $self->{"oai_server_http_path"} . ") unavailable\n";
116 return;
117 }
118 #======================================================================#
119 # Checks to make sure the OAI-PMH server is connectable [END]
120 #======================================================================#
121
122 #======================================================================#
123 # Process each fileset [START]
124 #======================================================================#
125 foreach my $one_file (@{$extrametakeys})
126 {
127 # Don't harvest file sets that don't have dc.Identifier set, "dc.Identifier" is usde as the key between Greenstone and OAI Server!
128 my $dc_identifier = &extrametautil::getmetadata_for_named_pos($extrametadata, $one_file, "dc.Identifier", 0);
129 next if (!defined($dc_identifier) || $dc_identifier eq "");
130
131 #======================================================================#
132 # Only try to harvest file set with dc.Identifier specified. [START]
133 #======================================================================#
134 # The dc.Identifier has to be the same as the OAI record identifier
135 my $oai_identifier = $dc_identifier;
136
137 # Now, let's get the OAI metadata
138 my $request = $self->{"oai_server_http_path"} . "?verb=GetRecord&identifier=" . $oai_identifier. "&metadataPrefix=" . $self->{"metadata_prefix"};
139 print $outhandle "OAIMetadataXMLPlugin: OAI Harvesting Request (" . $request . ")\n";
140 $response = undef;
141 $response = $browser->get($request);
142 die "OAIMetadataXMLPlugin: This should never be happening - \"get\" should always be successful unless the OAI server was temporary down (some kind of race condition)\n" unless ($response->is_success);
143 my $reponse_content = $response->content();
144
145 # Check to make sure there is no error in the OAI response
146 if ($reponse_content =~ /\<error\scode\=[\"\']([^\"\']+)[\"\']>([^\<]*)\<\/error\>/)
147 {
148 print $outhandle "OAIMetadataXMLPlugin: Failed to retrive OAI record (" . $oai_identifier . "). ErrorCode: [$1] ErrorMessage: [$2], skip.\n";
149 next;
150 }
151 print $outhandle "OAIMetadataXMLPlugin: OAI record (" . $oai_identifier . ") found.\n";
152
153 # Get the oai metadata (We will need to extend this code to support future metadataPrefix)
154 my $oai_content = undef;
155
156 # Special Note for KOHA OAI Server: there is an error in the KOHA's OAI-PMH server (it is still under development at the time when I am writting this)
157 # The metadata set should be oai_dc:dc tag, but they incorrectly output the tag as oaidc:dc (which doesn't match with the metadataPrefix)
158 if ($self->{"metadata_prefix"} eq "oai_dc" && $reponse_content =~ /\<oai\_?dc:dc[^\>]+\>(.*?)\<\/oai\_?dc\:dc\>/s)
159 {
160 $oai_content = $1;
161 }
162 else
163 {
164 my $reg_match = "\<" . $self->{"metadata_prefix"} . "\:" . $self->{"metadata_prefix"} . "[^\>]+\>(.*?)\<\/" . $self->{"metadata_prefix"} . "\:" . $self->{"metadata_prefix"} . "\>";
165 if ($reponse_content =~ /$reg_match/s)
166 {
167 $oai_content = $1;
168 }
169 else
170 {
171 print $outhandle "OAIMetadataXMLPlugin: Failed to match " . $self->{"metadata_prefix"} . ":" . $self->{"metadata_prefix"} . " metadata set, skip\n " . $reponse_content . "\n";
172 next;
173 }
174 }
175
176 # Get each metadata field and value
177 while ($oai_content =~ /\<([^\>]+)\>([^\<]+)\<\/[^\>]+\>/g)
178 {
179 my $field_name = "oaiextracted." . lc($1);
180 my $value = $2;
181
182 # Special hack for Koha data from Nitesh
183 # Some of their data contain " \" as the value... that is pretty wrong.
184 # If the value is empty, ignore it.
185 if ($value =~ /^[^\w]*$/)
186 {
187 print STDERR "Ignore value:[" . $value . "]\n";
188 next;
189 }
190
191 # Special case for identifier
192 if ($self->{"koha_mode"} == 1 && $1 eq "identifier" && $2 =~ /https?\:\/\//)
193 {
194 $field_name = "oaiextracted.koharecordlink";
195
196 # Koha OAI server is not up-to-date... so it was still pointing to the old interface
197 # This might need change over once they update the Koha OAI server
198 $value =~ s/\/opac\/opac\-detail\.pl\?bib\=/\/catalogue\/detail\.pl\?biblionumber\=/;
199 }
200
201 &extrametautil::setmetadata_for_named_metaname($extrametadata, $one_file, $field_name, []) if (!defined (&extrametautil::getmetadata_for_named_metaname($extrametadata, $one_file, $field_name)));
202 &extrametautil::addmetadata_for_named_metaname($extrametadata, $one_file, $field_name, $value);
203 }
204 #======================================================================#
205 # Only try to harvest file set with dc.Identifier specified. [END]
206 #======================================================================#
207 }
208 #======================================================================#
209 # Process each fileset [END]
210 #======================================================================#
211}
212
2131;
Note: See TracBrowser for help on using the repository browser.