source: trunk/gsdl/perllib/plugins/OAIPlug.pm@ 12610

Last change on this file since 12610 was 12169, checked in by mdewsnip, 18 years ago

Tidied up that horrible long line in the new() function of every plugin.

  • Property svn:keywords set to Author Date Id Revision
File size: 9.5 KB
Line 
1###########################################################################
2#
3# OAIPlug.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package OAIPlug;
28
29use BasPlug;
30use unicode;
31use util;
32
33use strict;
34no strict 'refs'; # allow filehandles to be variables and viceversa
35
36use XMLPlug;
37
38sub BEGIN {
39 @OAIPlug::ISA = ('XMLPlug');
40}
41
42
43my $arguments =
44 [ { 'name' => "process_exp",
45 'desc' => "{BasPlug.process_exp}",
46 'type' => "regexp",
47 'reqd' => "no",
48 'deft' => &get_default_process_exp() },
49 ];
50
51my $options = { 'name' => "OAIPlug",
52 'desc' => "{OAIPlug.desc}",
53 'abstract' => "no",
54 'inherits' => "yes",
55 'args' => $arguments };
56
57
58sub new {
59 my ($class) = shift (@_);
60 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
61 push(@$pluginlist, $class);
62
63 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
64 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
65
66 my $self = new XMLPlug($pluginlist, $inputargs, $hashArgOptLists);
67
68 return bless $self, $class;
69}
70
71sub get_default_process_exp {
72 my $self = shift (@_);
73
74 return q^(?i)(\.oai)$^;
75}
76
77sub xml_start_document {
78 my $self = shift (@_);
79 $self->{'in_metadata_node'} = 0;
80 $self->{'rawxml'} = "";
81}
82
83sub xml_end_document {
84}
85
86sub xml_doctype {
87 my $self = shift(@_);
88
89 my ($expat, $name, $sysid, $pubid, $internal) = @_;
90
91 # allow the short-lived and badly named "GreenstoneArchive" files to be processed
92 # as well as the "Archive" files which should now be created by import.pl
93 die "" if ($name !~ /^OAI-PMH$/);
94
95 my $outhandle = $self->{'outhandle'};
96 print $outhandle "OAIPlug: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
97 print STDERR "<Processing n='$self->{'file'}' p='OAIPlug'>\n" if $self->{'gli'};
98
99}
100
101
102sub xml_start_tag {
103 my $self = shift(@_);
104 my ($expat,$element) = @_;
105
106 my %attr_hash = %_;
107
108 my $attr = "";
109 map { $attr .= " $_=$attr_hash{$_}"; } keys %attr_hash;
110
111 $self->{'rawxml'} .= "<$element$attr>";
112
113 if ($element eq "metadata") {
114 $self->{'in_metadata_node'} = 1;
115 $self->{'metadata_xml'} = "";
116 }
117
118 if ($self->{'in_metadata_node'}) {
119 $self->{'metadata_xml'} .= "<$element$attr>";
120 }
121}
122
123sub xml_end_tag {
124 my $self = shift(@_);
125 my ($expat, $element) = @_;
126
127 $self->{'rawxml'} .= "</$element>";
128
129 if ($self->{'in_metadata_node'}) {
130 $self->{'metadata_xml'} .= "</$element>";
131 }
132
133 if ($element eq "metadata") {
134 my $textref = \$self->{'metadata_xml'};
135 my $metadata = $self->{'metadata'};
136 $self->extract_oai_metadata($textref,$metadata);
137
138 $self->{'in_metadata_node'} = 0;
139 }
140
141
142}
143
144sub xml_text {
145 my $self = shift(@_);
146 my ($expat) = @_;
147
148 $self->{'rawxml'} .= $_;
149
150 if ($self->{'in_metadata_node'}) {
151 $self->{'metadata_xml'} .= $_;
152 }
153}
154
155
156
157
158sub read {
159 my $self = shift (@_);
160
161 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
162
163 my $outhandle = $self->{'outhandle'};
164
165 my $filename = $file;
166 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
167
168 return 0 if ((-d $filename) && ($filename =~ m/srcdocs$/));
169
170 if ($self->SUPER::read(@_)) {
171
172 # Do encoding stuff
173 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
174
175 my $url_array = $metadata->{'URL'};
176 my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0;
177
178 my $srcdoc_exists = 0;
179 my $srcdoc_pos = 0;
180 my $filename_dir = &util::filename_head($filename);
181
182 for (my $i=0; $i<$num_urls; $i++) {
183
184 if ($url_array->[$i] !~ m/^(http|ftp):/) {
185
186 my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]);
187
188 if (-e $src_filename) {
189 $srcdoc_pos = $i;
190 $srcdoc_exists = 1;
191 }
192 }
193 }
194
195 if ($srcdoc_exists)
196 {
197 print $outhandle "OAIPlug: passing metadata on to $url_array->[0]\n"
198 if ($self->{'verbosity'}>1);
199
200
201 # Make pretty print metadata table stick with src filename
202 my $ppmd_table = $self->{'ppmd_table'};
203 $metadata->{'prettymd'} = [ $ppmd_table ];
204 $self->{'ppmd_table'} = undef;
205
206 return &plugin::read ($pluginfo, $filename_dir, $url_array->[0],
207 $metadata, $processor, $maxdocs, $total_count, $gli);
208 }
209 else
210 {
211 # create a new document
212 my $doc_obj = new doc ($filename, "indexed_doc");
213 my $top_section = $doc_obj->get_top_section;
214 my $plugin_type = $self->{'plugin_type'};
215
216 $doc_obj->add_utf8_metadata($top_section, "Language", $language);
217 $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
218 $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
219 $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
220 $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));
221
222 # include any metadata passed in from previous plugins
223 # note that this metadata is associated with the top level section
224 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
225
226 # do plugin specific processing of doc_obj
227 my $textref = \$self->{'rawxml'};
228 unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
229 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
230 return -1;
231 }
232
233 # do any automatic metadata extraction
234 $self->auto_extract_metadata ($doc_obj);
235
236 # add an OID
237 $doc_obj->set_OID();
238
239 my $ppmd_table = $self->{'ppmd_table'};
240 $doc_obj->add_utf8_metadata($top_section,"prettymd",$ppmd_table);
241 $self->{'ppmd_table'} = undef;
242
243 # process the document
244 $processor->process($doc_obj);
245
246 $self->{'num_processed'} ++;
247
248 return 1; # processed the file
249 }
250 }
251 else {
252 return undef;
253 }
254}
255
256
257# do plugin specific processing of doc_obj
258sub process {
259 my $self = shift (@_);
260 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
261 my $outhandle = $self->{'outhandle'};
262
263 print STDERR "<Processing n='$file' p='OAIPlug'>\n" if ($gli);
264 print $outhandle "OAIPlug: processing $file\n"
265 if $self->{'verbosity'} > 1;
266
267 my $cursection = $doc_obj->get_top_section();
268
269## $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection);
270
271 # add text to document object
272
273# $$textref =~ s/<(.*?)>/$1 /g;
274 $$textref =~ s/</&lt;/g;
275 $$textref =~ s/>/&gt;/g;
276
277## print STDERR "*** adding text: $$textref\n";
278
279 $doc_obj->add_utf8_text($cursection, $$textref);
280
281 return 1;
282}
283
284
285# Improvement is to merge this with newer version in MetadataPass
286
287sub open_prettyprint_metadata_table
288{
289 my $self = shift(@_);
290
291 my $att = "width=100% cellspacing=2";
292 my $style = "style=\'border-bottom: 4px solid #000080\'";
293
294 $self->{'ppmd_table'} = "\n<table $att $style>";
295}
296
297sub add_prettyprint_metadata_line
298{
299 my $self = shift(@_);
300 my ($metaname, $metavalue_utf8) = @_;
301
302 $metavalue_utf8 =~ s/hdl\.handle\.net/mcgonagall.cs.waikato.ac.nz:8080\/dspace\/handle/;
303 $metavalue_utf8 = &util::hyperlink_text($metavalue_utf8);
304
305 $self->{'ppmd_table'} .= " <tr bgcolor=#b5d3cd>\n";
306 $self->{'ppmd_table'} .= " <td width=30%>\n";
307 $self->{'ppmd_table'} .= " $metaname\n";
308 $self->{'ppmd_table'} .= " </td>\n";
309 $self->{'ppmd_table'} .= " <td>\n";
310 $self->{'ppmd_table'} .= " $metavalue_utf8\n";
311 $self->{'ppmd_table'} .= " </td>\n";
312 $self->{'ppmd_table'} .= " </tr>\n";
313
314}
315
316sub close_prettyprint_metadata_table
317{
318 my $self = shift(@_);
319
320 $self->{'ppmd_table'} .= "</table>\n";
321}
322
323
324
325
326sub extract_oai_metadata {
327 my $self = shift (@_);
328 my ($textref, $metadata) = @_;
329 my $outhandle = $self->{'outhandle'};
330
331 # Only handles DC metadata
332
333 $self->open_prettyprint_metadata_table();
334
335 if ($$textref =~ m/<metadata\s*>(.*?)<\/metadata\s*>/s)
336 {
337 my $metadata_text = $1;
338 $metadata_text =~ s/^.*?<(oai_dc:)?dc.*?>(.*?)<\/(oai_dc:)?dc>.*?/$2/s;
339
340 while ($metadata_text =~ m/<(.*?)>(.*?)<\/(.*?)>(.*)/s)
341 {
342 # if URL given for document as identifier metadata, store it ...
343 # $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
344
345 my $metaname = $1;
346 my $metavalue = $2;
347 $metadata_text = $4;
348
349 $metaname =~ s/^(dc:)?(.)/\u$2/;
350
351 if ($metaname eq "Identifier")
352 {
353 # name clashes with GSDL reserved metadata name for hash id
354 $metaname = "URL";
355 }
356
357 if (defined $metadata->{$metaname})
358 {
359 push(@{$metadata->{$metaname}},$metavalue);
360
361 }
362 else
363 {
364 $metadata->{$metaname} = [ $metavalue ];
365 }
366
367 $self->add_prettyprint_metadata_line($metaname, $metavalue);
368
369 }
370 }
371
372 $self->close_prettyprint_metadata_table();
373}
374
3751;
Note: See TracBrowser for help on using the repository browser.