source: trunk/gsdl/perllib/plugins/OAIPlug.pm@ 10254

Last change on this file since 10254 was 10254, checked in by kjdon, 19 years ago

added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile

  • Property svn:keywords set to Author Date Id Revision
File size: 9.6 KB
Line 
1###########################################################################
2#
3# OAIPlug.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package OAIPlug;
28
29use BasPlug;
30use unicode;
31use util;
32
33use strict;
34no strict 'refs'; # allow filehandles to be variables and viceversa
35
36use XMLPlug;
37
38sub BEGIN {
39 @OAIPlug::ISA = ('XMLPlug');
40}
41
42
43my $arguments =
44 [ { 'name' => "process_exp",
45 'desc' => "{BasPlug.process_exp}",
46 'type' => "regexp",
47 'reqd' => "no",
48 'deft' => &get_default_process_exp() },
49 ];
50
51my $options = { 'name' => "OAIPlug",
52 'desc' => "{OAIPlug.desc}",
53 'abstract' => "no",
54 'inherits' => "yes",
55 'args' => $arguments };
56
57
58sub new {
59 my ($class) = shift (@_);
60 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
61 push(@$pluginlist, $class);
62
63 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
64 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
65
66 my $self = (defined $hashArgOptLists)? new XMLPlug($pluginlist,$inputargs,$hashArgOptLists): new XMLPlug($pluginlist,$inputargs);
67
68 return bless $self, $class;
69}
70
71sub get_default_process_exp {
72 my $self = shift (@_);
73
74 return q^(?i)(\.oai)$^;
75}
76
77sub xml_start_document {
78 my $self = shift (@_);
79 $self->{'in_metadata_node'} = 0;
80 $self->{'rawxml'} = "";
81}
82
83sub xml_end_document {
84}
85
86sub xml_doctype {
87 my $self = shift(@_);
88
89 my ($expat, $name, $sysid, $pubid, $internal) = @_;
90
91 # allow the short-lived and badly named "GreenstoneArchive" files to be processed
92 # as well as the "Archive" files which should now be created by import.pl
93 die "" if ($name !~ /^OAI-PMH$/);
94
95 my $outhandle = $self->{'outhandle'};
96 print $outhandle "OAIPlug: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
97 print STDERR "<Processing n='$self->{'file'}' p='OAIPlug'>\n" if $self->{'gli'};
98
99}
100
101
102sub xml_start_tag {
103 my $self = shift(@_);
104 my ($expat,$element) = @_;
105
106 my %attr_hash = %_;
107
108 my $attr = "";
109 map { $attr .= " $_=$attr_hash{$_}"; } keys %attr_hash;
110
111 $self->{'rawxml'} .= "<$element$attr>";
112
113 if ($element eq "metadata") {
114 $self->{'in_metadata_node'} = 1;
115 $self->{'metadata_xml'} = "";
116 }
117
118 if ($self->{'in_metadata_node'}) {
119 $self->{'metadata_xml'} .= "<$element$attr>";
120 }
121}
122
123sub xml_end_tag {
124 my $self = shift(@_);
125 my ($expat, $element) = @_;
126
127 $self->{'rawxml'} .= "</$element>";
128
129 if ($self->{'in_metadata_node'}) {
130 $self->{'metadata_xml'} .= "</$element>";
131 }
132
133 if ($element eq "metadata") {
134 my $textref = \$self->{'metadata_xml'};
135 my $metadata = $self->{'metadata'};
136 $self->extract_oai_metadata($textref,$metadata);
137
138 $self->{'in_metadata_node'} = 0;
139 }
140
141
142}
143
144sub xml_text {
145 my $self = shift(@_);
146 my ($expat) = @_;
147
148 $self->{'rawxml'} .= $_;
149
150 if ($self->{'in_metadata_node'}) {
151 $self->{'metadata_xml'} .= $_;
152 }
153}
154
155
156
157
158sub read {
159 my $self = shift (@_);
160
161 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
162
163 my $outhandle = $self->{'outhandle'};
164
165 my $filename = $file;
166 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
167
168 return 0 if ((-d $filename) && ($filename =~ m/srcdocs$/));
169
170 if ($self->SUPER::read(@_)) {
171
172 # Do encoding stuff
173 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
174
175 my $url_array = $metadata->{'URL'};
176 my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0;
177
178 my $srcdoc_exists = 0;
179 my $srcdoc_pos = 0;
180 my $filename_dir = &util::filename_head($filename);
181
182 for (my $i=0; $i<$num_urls; $i++) {
183
184 if ($url_array->[$i] !~ m/^(http|ftp):/) {
185
186 my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]);
187
188 if (-e $src_filename) {
189 $srcdoc_pos = $i;
190 $srcdoc_exists = 1;
191 }
192 }
193 }
194
195 if ($srcdoc_exists)
196 {
197 print $outhandle "OAIPlug: passing metadata on to $url_array->[0]\n"
198 if ($self->{'verbosity'}>1);
199
200
201 # Make pretty print metadata table stick with src filename
202 my $ppmd_table = $self->{'ppmd_table'};
203 $metadata->{'prettymd'} = [ $ppmd_table ];
204 $self->{'ppmd_table'} = undef;
205
206 return &plugin::read ($pluginfo, $filename_dir, $url_array->[0],
207 $metadata, $processor, $maxdocs, $total_count, $gli);
208 }
209 else
210 {
211 # create a new document
212 my $doc_obj = new doc ($filename, "indexed_doc");
213 my $top_section = $doc_obj->get_top_section;
214 my $plugin_type = $self->{'plugin_type'};
215
216 $doc_obj->add_utf8_metadata($top_section, "Language", $language);
217 $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
218 $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
219 $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
220 $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));
221
222 # include any metadata passed in from previous plugins
223 # note that this metadata is associated with the top level section
224 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
225
226 # do plugin specific processing of doc_obj
227 my $textref = \$self->{'rawxml'};
228 unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
229 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
230 return -1;
231 }
232
233 # do any automatic metadata extraction
234 $self->auto_extract_metadata ($doc_obj);
235
236 # add an OID
237 $doc_obj->set_OID();
238
239 my $ppmd_table = $self->{'ppmd_table'};
240 $doc_obj->add_utf8_metadata($top_section,"prettymd",$ppmd_table);
241 $self->{'ppmd_table'} = undef;
242
243 # process the document
244 $processor->process($doc_obj);
245
246 $self->{'num_processed'} ++;
247
248 return 1; # processed the file
249 }
250 }
251 else {
252 return undef;
253 }
254}
255
256
257# do plugin specific processing of doc_obj
258sub process {
259 my $self = shift (@_);
260 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
261 my $outhandle = $self->{'outhandle'};
262
263 print STDERR "<Processing n='$file' p='OAIPlug'>\n" if ($gli);
264 print $outhandle "OAIPlug: processing $file\n"
265 if $self->{'verbosity'} > 1;
266
267 my $cursection = $doc_obj->get_top_section();
268
269## $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection);
270
271 # add text to document object
272
273# $$textref =~ s/<(.*?)>/$1 /g;
274 $$textref =~ s/</&lt;/g;
275 $$textref =~ s/>/&gt;/g;
276
277## print STDERR "*** adding text: $$textref\n";
278
279 $doc_obj->add_utf8_text($cursection, $$textref);
280
281 return 1;
282}
283
284
285# Improvement is to merge this with newer version in MetadataPass
286
287sub open_prettyprint_metadata_table
288{
289 my $self = shift(@_);
290
291 my $att = "width=100% cellspacing=2";
292 my $style = "style=\'border-bottom: 4px solid #000080\'";
293
294 $self->{'ppmd_table'} = "\n<table $att $style>";
295}
296
297sub add_prettyprint_metadata_line
298{
299 my $self = shift(@_);
300 my ($metaname, $metavalue_utf8) = @_;
301
302 $metavalue_utf8 =~ s/hdl\.handle\.net/mcgonagall.cs.waikato.ac.nz:8080\/dspace\/handle/;
303 $metavalue_utf8 = &util::hyperlink_text($metavalue_utf8);
304
305 $self->{'ppmd_table'} .= " <tr bgcolor=#b5d3cd>\n";
306 $self->{'ppmd_table'} .= " <td width=30%>\n";
307 $self->{'ppmd_table'} .= " $metaname\n";
308 $self->{'ppmd_table'} .= " </td>\n";
309 $self->{'ppmd_table'} .= " <td>\n";
310 $self->{'ppmd_table'} .= " $metavalue_utf8\n";
311 $self->{'ppmd_table'} .= " </td>\n";
312 $self->{'ppmd_table'} .= " </tr>\n";
313
314}
315
316sub close_prettyprint_metadata_table
317{
318 my $self = shift(@_);
319
320 $self->{'ppmd_table'} .= "</table>\n";
321}
322
323
324
325
326sub extract_oai_metadata {
327 my $self = shift (@_);
328 my ($textref, $metadata) = @_;
329 my $outhandle = $self->{'outhandle'};
330
331 # Only handles DC metadata
332
333 $self->open_prettyprint_metadata_table();
334
335 if ($$textref =~ m/<metadata\s*>(.*?)<\/metadata\s*>/s)
336 {
337 my $metadata_text = $1;
338 $metadata_text =~ s/^.*?<(oai_dc:)?dc.*?>(.*?)<\/(oai_dc:)?dc>.*?/$2/s;
339
340 while ($metadata_text =~ m/<(.*?)>(.*?)<\/(.*?)>(.*)/s)
341 {
342 # if URL given for document as identifier metadata, store it ...
343 # $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
344
345 my $metaname = $1;
346 my $metavalue = $2;
347 $metadata_text = $4;
348
349 $metaname =~ s/^(dc:)?(.)/\u$2/;
350
351 if ($metaname eq "Identifier")
352 {
353 # name clashes with GSDL reserved metadata name for hash id
354 $metaname = "URL";
355 }
356
357 if (defined $metadata->{$metaname})
358 {
359 push(@{$metadata->{$metaname}},$metavalue);
360
361 }
362 else
363 {
364 $metadata->{$metaname} = [ $metavalue ];
365 }
366
367 $self->add_prettyprint_metadata_line($metaname, $metavalue);
368
369 }
370 }
371
372 $self->close_prettyprint_metadata_table();
373}
374
3751;
Note: See TracBrowser for help on using the repository browser.