source: trunk/gsdl/perllib/plugins/OAIPlug.pm@ 9958

Last change on this file since 9958 was 9958, checked in by davidb, 19 years ago

Upgrading of OAIPlug to be based on XMLPlug. OAIPlug was originally written
before XMLPlug. The rewrite means it can now take advantage of the -xslt
option.

  • Property svn:keywords set to Author Date Id Revision
File size: 9.5 KB
Line 
1###########################################################################
2#
3# OAIPlug.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package OAIPlug;
28
29use BasPlug;
30use unicode;
31use util;
32use parsargv;
33
34use XMLPlug;
35
36sub BEGIN {
37 @OAIPlug::ISA = ('XMLPlug');
38}
39
40
41my $arguments =
42 [ { 'name' => "process_exp",
43 'desc' => "{BasPlug.process_exp}",
44 'type' => "regexp",
45 'reqd' => "no",
46 'deft' => &get_default_process_exp() },
47 ];
48
49my $options = { 'name' => "OAIPlug",
50 'desc' => "{OAIPlug.desc}",
51 'abstract' => "no",
52 'inherits' => "yes",
53 'args' => $arguments };
54
55sub new {
56 my $class = shift (@_);
57 my $self = new XMLPlug ($class, @_);
58 $self->{'plugin_type'} = "OAIPlug";
59 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
60 my $option_list = $self->{'option_list'};
61 push( @{$option_list}, $options );
62
63 if (!parsargv::parse(\@_,
64 "allow_extra_options")) {
65
66 print STDERR "\nIncorrect options passed to OAIPlug, check your collect.cfg configuration file\n";
67 $self->print_txt_usage(""); # Use default resource bundle
68 die "\n";
69 }
70
71 return bless $self, $class;
72}
73
74sub get_default_process_exp {
75 my $self = shift (@_);
76
77 return q^(?i)(\.oai)$^;
78}
79
80sub xml_start_document {
81 $self->{'in_metadata_node'} = 0;
82 $self->{'rawxml'} = "";
83}
84
85sub xml_end_document {
86}
87
88sub xml_doctype {
89 my $self = shift(@_);
90
91 my ($expat, $name, $sysid, $pubid, $internal) = @_;
92
93 # allow the short-lived and badly named "GreenstoneArchive" files to be processed
94 # as well as the "Archive" files which should now be created by import.pl
95 die "" if ($name !~ /^OAI-PMH$/);
96
97 my $outhandle = $self->{'outhandle'};
98 print $outhandle "OAIPlug: processing $self->{'file'}\n" if $self->{'verbosity'} > 1;
99 print STDERR "<Processing n='$self->{'file'}' p='OAIPlug'>\n" if $self->{'gli'};
100
101}
102
103
104sub xml_start_tag {
105 my $self = shift(@_);
106 my ($expat,$element) = @_;
107
108 my %attr_hash = %_;
109
110 my $attr = "";
111 map { $attr .= " $_=$attr_hash{$_}"; } keys %attr_hash;
112
113 $self->{'rawxml'} .= "<$element$attr>";
114
115 if ($element eq "metadata") {
116 $self->{'in_metadata_node'} = 1;
117 $self->{'metadata_xml'} = "";
118 }
119
120 if ($self->{'in_metadata_node'}) {
121 $self->{'metadata_xml'} .= "<$element$attr>";
122 }
123}
124
125sub xml_end_tag {
126 my $self = shift(@_);
127 my ($expat, $element) = @_;
128
129 $self->{'rawxml'} .= "</$element>";
130
131 if ($self->{'in_metadata_node'}) {
132 $self->{'metadata_xml'} .= "</$element>";
133 }
134
135 if ($element eq "metadata") {
136 my $textref = \$self->{'metadata_xml'};
137 my $metadata = $self->{'metadata'};
138 $self->extract_oai_metadata($textref,$metadata);
139
140 $self->{'in_metadata_node'} = 0;
141 }
142
143
144}
145
146sub xml_text {
147 my $self = shift(@_);
148 my ($expat) = @_;
149
150 $self->{'rawxml'} .= $_;
151
152 if ($self->{'in_metadata_node'}) {
153 $self->{'metadata_xml'} .= $_;
154 }
155}
156
157
158
159
160sub read {
161 my $self = shift (@_);
162
163 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
164
165 my $outhandle = $self->{'outhandle'};
166
167 my $filename = $file;
168 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
169
170 return 0 if ((-d $filename) && ($filename =~ m/srcdocs$/));
171
172 if ($self->SUPER::read(@_)) {
173
174 # Do encoding stuff
175 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
176
177 my $url_array = $metadata->{'URL'};
178 my $num_urls = scalar(@$url_array);
179
180 my $srcdoc_exists = 0;
181 my $srcdoc_pos = 0;
182 my $filename_dir = &util::filename_head($filename);
183
184 for (my $i=0; $i<$num_urls; $i++) {
185
186 if ($url_array->[$i] !~ m/^(http|ftp):/) {
187
188 my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]);
189
190 if (-e $src_filename) {
191 $srcdoc_pos = $i;
192 $srcdoc_exists = 1;
193 }
194 }
195 }
196
197 if ($srcdoc_exists)
198 {
199 print $outhandle "OAIPlug: passing metadata on to $url_array->[0]\n"
200 if ($self->{'verbosity'}>1);
201
202
203 # Make pretty print metadata table stick with src filename
204 my $ppmd_table = $self->{'ppmd_table'};
205 $metadata->{'prettymd'} = [ $ppmd_table ];
206 $self->{'ppmd_table'} = undef;
207
208 return &plugin::read ($pluginfo, $filename_dir, $url_array->[0],
209 $metadata, $processor, $maxdocs, $total_count, $gli);
210 }
211 else
212 {
213 # create a new document
214 my $doc_obj = new doc ($filename, "indexed_doc");
215 my $top_section = $doc_obj->get_top_section;
216 my $plugin_type = $self->{'plugin_type'};
217
218 $doc_obj->add_utf8_metadata($top_section, "Language", $language);
219 $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
220 $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
221 $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
222 $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));
223
224 # include any metadata passed in from previous plugins
225 # note that this metadata is associated with the top level section
226 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
227
228 # do plugin specific processing of doc_obj
229 my $textref = \$self->{'rawxml'};
230 unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
231 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
232 return -1;
233 }
234
235 # do any automatic metadata extraction
236 $self->auto_extract_metadata ($doc_obj);
237
238 # add an OID
239 $doc_obj->set_OID();
240
241 my $ppmd_table = $self->{'ppmd_table'};
242 $doc_obj->add_utf8_metadata($top_section,"prettymd",$ppmd_table);
243 $self->{'ppmd_table'} = undef;
244
245 # process the document
246 $processor->process($doc_obj);
247
248 $self->{'num_processed'} ++;
249
250 return 1; # processed the file
251 }
252 }
253 else {
254 return undef;
255 }
256}
257
258
259# do plugin specific processing of doc_obj
260sub process {
261 my $self = shift (@_);
262 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
263 my $outhandle = $self->{'outhandle'};
264
265 print STDERR "<Processing n='$file' p='OAIPlug'>\n" if ($gli);
266 print $outhandle "OAIPlug: processing $file\n"
267 if $self->{'verbosity'} > 1;
268
269 my $cursection = $doc_obj->get_top_section();
270
271## $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection);
272
273 # add text to document object
274
275# $$textref =~ s/<(.*?)>/$1 /g;
276 $$textref =~ s/</&lt;/g;
277 $$textref =~ s/>/&gt;/g;
278
279## print STDERR "*** adding text: $$textref\n";
280
281 $doc_obj->add_utf8_text($cursection, $$textref);
282
283 return 1;
284}
285
286
287# Improvement is to merge this with newer version in MetadataPass
288
289sub open_prettyprint_metadata_table
290{
291 my $self = shift(@_);
292
293 my $att = "width=100% cellspacing=2";
294 my $style = "style=\'border-bottom: 4px solid #000080\'";
295
296 $self->{'ppmd_table'} = "\n<table $att $style>";
297}
298
299sub add_prettyprint_metadata_line
300{
301 my $self = shift(@_);
302 my ($metaname, $metavalue_utf8) = @_;
303
304 $metavalue_utf8 =~ s/hdl\.handle\.net/mcgonagall.cs.waikato.ac.nz:8080\/dspace\/handle/;
305 $metavalue_utf8 = &util::hyperlink_text($metavalue_utf8);
306
307 $self->{'ppmd_table'} .= " <tr bgcolor=#b5d3cd>\n";
308 $self->{'ppmd_table'} .= " <td width=30%>\n";
309 $self->{'ppmd_table'} .= " $metaname\n";
310 $self->{'ppmd_table'} .= " </td>\n";
311 $self->{'ppmd_table'} .= " <td>\n";
312 $self->{'ppmd_table'} .= " $metavalue_utf8\n";
313 $self->{'ppmd_table'} .= " </td>\n";
314 $self->{'ppmd_table'} .= " </tr>\n";
315
316}
317
318sub close_prettyprint_metadata_table
319{
320 my $self = shift(@_);
321
322 $self->{'ppmd_table'} .= "</table>\n";
323}
324
325
326
327
328sub extract_oai_metadata {
329 my $self = shift (@_);
330 my ($textref, $metadata) = @_;
331 my $outhandle = $self->{'outhandle'};
332
333 # Only handles DC metadata
334
335 $self->open_prettyprint_metadata_table();
336
337 if ($$textref =~ m/<metadata\s*>(.*?)<\/metadata\s*>/s)
338 {
339 $metadata_text = $1;
340 $metadata_text =~ s/^.*?<(oai_dc:)?dc.*?>(.*?)<\/(oai_dc:)?dc>.*?/$2/s;
341
342 while ($metadata_text =~ m/<(.*?)>(.*?)<\/(.*?)>(.*)/s)
343 {
344 # if URL given for document as identifier metadata, store it ...
345 # $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
346
347 my $metaname = $1;
348 my $metavalue = $2;
349 $metadata_text = $4;
350
351 $metaname =~ s/^(dc:)?(.)/\u$2/;
352
353 if ($metaname eq "Identifier")
354 {
355 # name clashes with GSDL reserved metadata name for hash id
356 $metaname = "URL";
357 }
358
359 if (defined $metadata->{$metaname})
360 {
361 push(@{$metadata->{$metaname}},$metavalue);
362
363 }
364 else
365 {
366 $metadata->{$metaname} = [ $metavalue ];
367 }
368
369 $self->add_prettyprint_metadata_line($metaname, $metavalue);
370
371 }
372 }
373
374 $self->close_prettyprint_metadata_table();
375}
376
3771;
Note: See TracBrowser for help on using the repository browser.