source: trunk/gsdl/perllib/plugins/OAIPlug.pm@ 10111

Last change on this file since 10111 was 10111, checked in by davidb, 19 years ago

Access to scalar(@$url_array) better protected with 'defined' to ensure
variable exists before the length of it is calculated.

  • Property svn:keywords set to Author Date Id Revision
File size: 9.6 KB
Line 
1###########################################################################
2#
3# OAIPlug.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package OAIPlug;
28
29use BasPlug;
30use unicode;
31use util;
32use parsargv;
33
34use XMLPlug;
35
36sub BEGIN {
37 @OAIPlug::ISA = ('XMLPlug');
38}
39
40
41my $arguments =
42 [ { 'name' => "process_exp",
43 'desc' => "{BasPlug.process_exp}",
44 'type' => "regexp",
45 'reqd' => "no",
46 'deft' => &get_default_process_exp() },
47 ];
48
49my $options = { 'name' => "OAIPlug",
50 'desc' => "{OAIPlug.desc}",
51 'abstract' => "no",
52 'inherits' => "yes",
53 'args' => $arguments };
54
55sub new {
56 my $class = shift (@_);
57 my $self = new XMLPlug ($class, @_);
58 $self->{'plugin_type'} = "OAIPlug";
59 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
60 my $option_list = $self->{'option_list'};
61 push( @{$option_list}, $options );
62
63 if (!parsargv::parse(\@_,
64 "allow_extra_options")) {
65
66 print STDERR "\nIncorrect options passed to OAIPlug, check your collect.cfg configuration file\n";
67 $self->print_txt_usage(""); # Use default resource bundle
68 die "\n";
69 }
70
71 return bless $self, $class;
72}
73
74sub get_default_process_exp {
75 my $self = shift (@_);
76
77 return q^(?i)(\.oai)$^;
78}
79
80sub xml_start_document {
81 $self->{'in_metadata_node'} = 0;
82 $self->{'rawxml'} = "";
83}
84
85sub xml_end_document {
86}
87
88sub xml_doctype {
89 my $self = shift(@_);
90
91 my ($expat, $name, $sysid, $pubid, $internal) = @_;
92
93 # allow the short-lived and badly named "GreenstoneArchive" files to be processed
94 # as well as the "Archive" files which should now be created by import.pl
95 die "" if ($name !~ /^OAI-PMH$/);
96
97 my $outhandle = $self->{'outhandle'};
98 print $outhandle "OAIPlug: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
99 print STDERR "<Processing n='$self->{'file'}' p='OAIPlug'>\n" if $self->{'gli'};
100
101}
102
103
104sub xml_start_tag {
105 my $self = shift(@_);
106 my ($expat,$element) = @_;
107
108 my %attr_hash = %_;
109
110 my $attr = "";
111 map { $attr .= " $_=$attr_hash{$_}"; } keys %attr_hash;
112
113 $self->{'rawxml'} .= "<$element$attr>";
114
115 if ($element eq "metadata") {
116 $self->{'in_metadata_node'} = 1;
117 $self->{'metadata_xml'} = "";
118 }
119
120 if ($self->{'in_metadata_node'}) {
121 $self->{'metadata_xml'} .= "<$element$attr>";
122 }
123}
124
125sub xml_end_tag {
126 my $self = shift(@_);
127 my ($expat, $element) = @_;
128
129 $self->{'rawxml'} .= "</$element>";
130
131 if ($self->{'in_metadata_node'}) {
132 $self->{'metadata_xml'} .= "</$element>";
133 }
134
135 if ($element eq "metadata") {
136 my $textref = \$self->{'metadata_xml'};
137 my $metadata = $self->{'metadata'};
138 $self->extract_oai_metadata($textref,$metadata);
139
140 $self->{'in_metadata_node'} = 0;
141 }
142
143
144}
145
146sub xml_text {
147 my $self = shift(@_);
148 my ($expat) = @_;
149
150 $self->{'rawxml'} .= $_;
151
152 if ($self->{'in_metadata_node'}) {
153 $self->{'metadata_xml'} .= $_;
154 }
155}
156
157
158
159
160sub read {
161 my $self = shift (@_);
162
163 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
164
165 my $outhandle = $self->{'outhandle'};
166
167 my $filename = $file;
168 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
169
170 return 0 if ((-d $filename) && ($filename =~ m/srcdocs$/));
171
172 if ($self->SUPER::read(@_)) {
173
174 # Do encoding stuff
175 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
176
177 my $url_array = $metadata->{'URL'};
178 my $num_urls = (defined $url_arry) ? scalar(@$url_array) : 0;
179
180 my $srcdoc_exists = 0;
181 my $srcdoc_pos = 0;
182 my $filename_dir = &util::filename_head($filename);
183
184 for (my $i=0; $i<$num_urls; $i++) {
185
186 if ($url_array->[$i] !~ m/^(http|ftp):/) {
187
188 my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]);
189
190 if (-e $src_filename) {
191 $srcdoc_pos = $i;
192 $srcdoc_exists = 1;
193 }
194 }
195 }
196
197 if ($srcdoc_exists)
198 {
199 print $outhandle "OAIPlug: passing metadata on to $url_array->[0]\n"
200 if ($self->{'verbosity'}>1);
201
202
203 # Make pretty print metadata table stick with src filename
204 my $ppmd_table = $self->{'ppmd_table'};
205 $metadata->{'prettymd'} = [ $ppmd_table ];
206 $self->{'ppmd_table'} = undef;
207
208 return &plugin::read ($pluginfo, $filename_dir, $url_array->[0],
209 $metadata, $processor, $maxdocs, $total_count, $gli);
210 }
211 else
212 {
213 # create a new document
214 my $doc_obj = new doc ($filename, "indexed_doc");
215 my $top_section = $doc_obj->get_top_section;
216 my $plugin_type = $self->{'plugin_type'};
217
218 $doc_obj->add_utf8_metadata($top_section, "Language", $language);
219 $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
220 $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
221 $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
222 $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));
223
224 # include any metadata passed in from previous plugins
225 # note that this metadata is associated with the top level section
226 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
227
228 # do plugin specific processing of doc_obj
229 my $textref = \$self->{'rawxml'};
230 unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
231 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
232 return -1;
233 }
234
235 # do any automatic metadata extraction
236 $self->auto_extract_metadata ($doc_obj);
237
238 # add an OID
239 $doc_obj->set_OID();
240
241 my $ppmd_table = $self->{'ppmd_table'};
242 $doc_obj->add_utf8_metadata($top_section,"prettymd",$ppmd_table);
243 $self->{'ppmd_table'} = undef;
244
245 # process the document
246 $processor->process($doc_obj);
247
248 $self->{'num_processed'} ++;
249
250 return 1; # processed the file
251 }
252 }
253 else {
254 return undef;
255 }
256}
257
258
259# do plugin specific processing of doc_obj
260sub process {
261 my $self = shift (@_);
262 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
263 my $outhandle = $self->{'outhandle'};
264
265 print STDERR "<Processing n='$file' p='OAIPlug'>\n" if ($gli);
266 print $outhandle "OAIPlug: processing $file\n"
267 if $self->{'verbosity'} > 1;
268
269 my $cursection = $doc_obj->get_top_section();
270
271## $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection);
272
273 # add text to document object
274
275# $$textref =~ s/<(.*?)>/$1 /g;
276 $$textref =~ s/</&lt;/g;
277 $$textref =~ s/>/&gt;/g;
278
279## print STDERR "*** adding text: $$textref\n";
280
281 $doc_obj->add_utf8_text($cursection, $$textref);
282
283 return 1;
284}
285
286
287# Improvement is to merge this with newer version in MetadataPass
288
289sub open_prettyprint_metadata_table
290{
291 my $self = shift(@_);
292
293 my $att = "width=100% cellspacing=2";
294 my $style = "style=\'border-bottom: 4px solid #000080\'";
295
296 $self->{'ppmd_table'} = "\n<table $att $style>";
297}
298
299sub add_prettyprint_metadata_line
300{
301 my $self = shift(@_);
302 my ($metaname, $metavalue_utf8) = @_;
303
304 $metavalue_utf8 =~ s/hdl\.handle\.net/mcgonagall.cs.waikato.ac.nz:8080\/dspace\/handle/;
305 $metavalue_utf8 = &util::hyperlink_text($metavalue_utf8);
306
307 $self->{'ppmd_table'} .= " <tr bgcolor=#b5d3cd>\n";
308 $self->{'ppmd_table'} .= " <td width=30%>\n";
309 $self->{'ppmd_table'} .= " $metaname\n";
310 $self->{'ppmd_table'} .= " </td>\n";
311 $self->{'ppmd_table'} .= " <td>\n";
312 $self->{'ppmd_table'} .= " $metavalue_utf8\n";
313 $self->{'ppmd_table'} .= " </td>\n";
314 $self->{'ppmd_table'} .= " </tr>\n";
315
316}
317
318sub close_prettyprint_metadata_table
319{
320 my $self = shift(@_);
321
322 $self->{'ppmd_table'} .= "</table>\n";
323}
324
325
326
327
328sub extract_oai_metadata {
329 my $self = shift (@_);
330 my ($textref, $metadata) = @_;
331 my $outhandle = $self->{'outhandle'};
332
333 # Only handles DC metadata
334
335 $self->open_prettyprint_metadata_table();
336
337 if ($$textref =~ m/<metadata\s*>(.*?)<\/metadata\s*>/s)
338 {
339 $metadata_text = $1;
340 $metadata_text =~ s/^.*?<(oai_dc:)?dc.*?>(.*?)<\/(oai_dc:)?dc>.*?/$2/s;
341
342 while ($metadata_text =~ m/<(.*?)>(.*?)<\/(.*?)>(.*)/s)
343 {
344 # if URL given for document as identifier metadata, store it ...
345 # $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
346
347 my $metaname = $1;
348 my $metavalue = $2;
349 $metadata_text = $4;
350
351 $metaname =~ s/^(dc:)?(.)/\u$2/;
352
353 if ($metaname eq "Identifier")
354 {
355 # name clashes with GSDL reserved metadata name for hash id
356 $metaname = "URL";
357 }
358
359 if (defined $metadata->{$metaname})
360 {
361 push(@{$metadata->{$metaname}},$metavalue);
362
363 }
364 else
365 {
366 $metadata->{$metaname} = [ $metavalue ];
367 }
368
369 $self->add_prettyprint_metadata_line($metaname, $metavalue);
370
371 }
372 }
373
374 $self->close_prettyprint_metadata_table();
375}
376
3771;
Note: See TracBrowser for help on using the repository browser.