source: trunk/gsdl/perllib/plugins/OAIPlug.pm@ 10218

Last change on this file since 10218 was 10218, checked in by kjdon, 19 years ago

Jeffrey's new parsing modifications, committed approx 6 July, 15.16

  • Property svn:keywords set to Author Date Id Revision
File size: 9.5 KB
Line 
1###########################################################################
2#
3# OAIPlug.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package OAIPlug;
28
29use BasPlug;
30use unicode;
31use util;
32use parsargv;
33
34use XMLPlug;
35
# Set up inheritance at compile time: OAIPlug derives from XMLPlug.
BEGIN {
    @OAIPlug::ISA = qw(XMLPlug);
}
39
40
# Arguments this plugin adds on top of the ones it inherits.  The only one
# is process_exp, whose default comes from get_default_process_exp().
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
];

# Plugin description record; new() appends it to the caller's "OptList".
my $options = {
    'name'     => "OAIPlug",
    'desc'     => "{OAIPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
54
# Constructor.
#
# Parameters:
#   $class           - class name the new object is blessed into
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to XMLPlug->new
#   $hashArgOptLists - hash ref holding "ArgList" and "OptList" array refs,
#                      to which this plugin's argument and option
#                      descriptions are appended
#
# Returns the object built by XMLPlug->new, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # NOTE: if the caller passed no $hashArgOptLists, these pushes
    # autovivify it into a hash ref, so it is defined from here on
    # whenever $arguments or $options is defined.
    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # Explicit class-method calls instead of the indirect-object form
    # "new XMLPlug(...)", whose parsing depends on which subs and packages
    # are known at compile time.
    my $self = (defined $hashArgOptLists)
        ? XMLPlug->new($pluginlist, $inputargs, $hashArgOptLists)
        : XMLPlug->new($pluginlist, $inputargs);

    return bless $self, $class;
}
67
# Default value for the process_exp argument: a case-insensitive match on
# file names ending in ".oai".
sub get_default_process_exp {
    return '(?i)(\.oai)$';
}
73
# Called at the start of each XML document: resets the per-document state
# that the tag and text handlers accumulate into.
sub xml_start_document {
    # The object must be taken from @_; without this the resets below go to
    # an unrelated package variable and the raw XML of earlier documents
    # leaks into later ones.
    my $self = shift(@_);

    $self->{'in_metadata_node'} = 0;
    $self->{'rawxml'} = "";
}
78
# Called at the end of each XML document.  Nothing needs doing here; the
# accumulated state is consumed by read().
sub xml_end_document {
    return;
}
81
# DOCTYPE handler.
#
# Only documents whose DOCTYPE name is "OAI-PMH" are accepted; for anything
# else this dies with an empty message so that parsing of the file stops.
# For accepted documents a progress line is written to the plugin's output
# handle (verbosity > 1) and, when the 'gli' flag is set, a <Processing>
# marker is written to STDERR.
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    unless ($name =~ /^OAI-PMH$/) {
        die "";
    }

    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OAIPlug: processing $self->{'file'}\n";
    }

    print STDERR "<Processing n='$self->{'file'}' p='OAIPlug'>\n" if $self->{'gli'};
}
96
97
# Start-tag handler.
#
# Rebuilds the opening tag as text (element name plus the attributes found
# in %_, written as " name=value" without quoting) and appends it to the
# raw XML buffer.  A <metadata> element switches metadata capture on and
# starts a fresh metadata buffer; while capture is on, the tag is appended
# to that buffer as well.
sub xml_start_tag {
    my ($self, $expat, $element) = @_;

    # %_ carries the attributes of the current element.
    my %attributes = %_;
    my $attr_text  = join("", map { " $_=$attributes{$_}" } keys %attributes);
    my $open_tag   = "<$element$attr_text>";

    $self->{'rawxml'} .= $open_tag;

    if ($element eq "metadata") {
        $self->{'in_metadata_node'} = 1;
        $self->{'metadata_xml'} = "";
    }

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $open_tag;
    }
}
118
# End-tag handler.
#
# Appends the closing tag to the raw XML buffer and, while metadata capture
# is on, to the metadata buffer.  When the element being closed is
# <metadata>, the captured text is handed to extract_oai_metadata() and
# capture is switched off again.
sub xml_end_tag {
    my ($self, $expat, $element) = @_;

    my $close_tag = "</$element>";
    $self->{'rawxml'} .= $close_tag;
    $self->{'metadata_xml'} .= $close_tag if $self->{'in_metadata_node'};

    if ($element eq "metadata") {
        $self->extract_oai_metadata(\$self->{'metadata_xml'}, $self->{'metadata'});
        $self->{'in_metadata_node'} = 0;
    }
}
139
# Character-data handler.  The text of the current node is taken from $_
# and appended to the raw XML buffer and, while metadata capture is on, to
# the metadata buffer.
sub xml_text {
    my ($self, $expat) = @_;

    my $text = $_;
    $self->{'rawxml'} .= $text;

    if ($self->{'in_metadata_node'}) {
        $self->{'metadata_xml'} .= $text;
    }
}
150
151
152
153
# Read one file, letting the parent class parse it first.
#
# After a successful parse, the "URL" values in $metadata are examined.
# If one of them names a file that exists next to the OAI record, the
# metadata (plus the pretty-printed metadata table) is passed on to
# plugin::read for that file.  Otherwise a new document is built from the
# raw XML gathered while parsing.
#
# Parameters (after $self):
#   $pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs,
#   $total_count, $gli - passed through to the parent class, and to
#   plugin::read when a source document is found.
#
# Returns:
#   0      if the file is a directory whose name ends in "srcdocs"
#   undef  if the parent class's read() returned false
#   -1     if process() failed
#   1      if a document was created and handed to $processor
#   otherwise whatever plugin::read returns for the source document
sub read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my $filename = $file;
    $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;

    return 0 if ((-d $filename) && ($filename =~ m/srcdocs$/));

    # Callers test this result with defined(), so keep the explicit undef.
    return undef unless ($self->SUPER::read(@_));

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);

    # The count used to be taken from a misspelt variable ($url_arry), so
    # it was always 0 and the source-document branch below never ran.
    my $url_array = $metadata->{'URL'};
    my $num_urls = (defined $url_array) ? scalar(@$url_array) : 0;

    my $filename_dir = &util::filename_head($filename);

    # Find the first URL value that is not a remote (http/ftp) address and
    # names an existing file relative to the record's directory.
    my $srcdoc_pos;
    for my $i (0 .. $num_urls - 1) {
        next if ($url_array->[$i] =~ m/^(http|ftp):/);

        my $src_filename = &util::filename_cat($filename_dir, $url_array->[$i]);
        if (-e $src_filename) {
            $srcdoc_pos = $i;
            last;
        }
    }

    if (defined $srcdoc_pos) {
        # Pass on the entry that was actually found on disk, not blindly
        # the first URL value (which may be a remote address).
        my $srcdoc = $url_array->[$srcdoc_pos];

        print $outhandle "OAIPlug: passing metadata on to $srcdoc\n"
            if ($self->{'verbosity'}>1);

        # Make pretty print metadata table stick with src filename
        my $ppmd_table = $self->{'ppmd_table'};
        $metadata->{'prettymd'} = [ $ppmd_table ];
        $self->{'ppmd_table'} = undef;

        return &plugin::read ($pluginfo, $filename_dir, $srcdoc,
                              $metadata, $processor, $maxdocs, $total_count, $gli);
    }

    # No local source document: create a new document from the record itself
    my $doc_obj = doc->new($filename, "indexed_doc");
    my $top_section = $doc_obj->get_top_section;
    my $plugin_type = $self->{'plugin_type'};

    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($top_section, "Plugin", $plugin_type);
    $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
    $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    my $textref = \$self->{'rawxml'};
    unless (defined ($self->process($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata ($doc_obj);

    # add an OID
    $doc_obj->set_OID();

    my $ppmd_table = $self->{'ppmd_table'};
    $doc_obj->add_utf8_metadata($top_section,"prettymd",$ppmd_table);
    $self->{'ppmd_table'} = undef;

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'} ++;

    return 1; # processed the file
}
251
252
# Plugin-specific processing of the document object.
#
# Escapes every '<' and '>' in the text referenced by $textref (the
# referenced string is modified in place) and adds the result as text to
# the top section of $doc_obj.  Always returns 1.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    if ($gli) {
        print STDERR "<Processing n='$file' p='OAIPlug'>\n";
    }
    if ($self->{'verbosity'} > 1) {
        my $outhandle = $self->{'outhandle'};
        print $outhandle "OAIPlug: processing $file\n";
    }

    my $cursection = $doc_obj->get_top_section();

    # Neither replacement contains an angle bracket, so one pass over the
    # text is enough to escape both characters.
    my %entity_for = ('<' => '&lt;', '>' => '&gt;');
    $$textref =~ s/([<>])/$entity_for{$1}/g;

    $doc_obj->add_utf8_text($cursection, $$textref);

    return 1;
}
279
280
# TODO: merge the pretty-print metadata table helpers below with the newer
# version in MetadataPass.
282
# Start a new pretty-printed metadata table: replaces any previous content
# of $self->{'ppmd_table'} with the opening <table> tag.
sub open_prettyprint_metadata_table
{
    my ($self) = @_;

    my @table_attributes = (
        "width=100% cellspacing=2",
        "style='border-bottom: 4px solid #000080'",
    );

    $self->{'ppmd_table'} = "\n<table " . join(" ", @table_attributes) . ">";
}
292
# Append one two-cell row (metadata name, metadata value) to the
# pretty-printed metadata table.
#
# Before the value is written, the first occurrence of "hdl.handle.net" in
# it is rewritten to a fixed DSpace host, and the value is then passed
# through util::hyperlink_text.
sub add_prettyprint_metadata_line
{
    my ($self, $metaname, $metavalue_utf8) = @_;

    $metavalue_utf8 =~ s{hdl\.handle\.net}{mcgonagall.cs.waikato.ac.nz:8080/dspace/handle};
    $metavalue_utf8 = util::hyperlink_text($metavalue_utf8);

    $self->{'ppmd_table'} .= join("",
        " <tr bgcolor=#b5d3cd>\n",
        " <td width=30%>\n",
        " $metaname\n",
        " </td>\n",
        " <td>\n",
        " $metavalue_utf8\n",
        " </td>\n",
        " </tr>\n",
    );
}
311
# Finish the pretty-printed metadata table by appending the closing tag.
sub close_prettyprint_metadata_table
{
    my ($self) = @_;

    $self->{'ppmd_table'} = $self->{'ppmd_table'} . "</table>\n";
}
318
319
320
321
# Extract metadata from the text of a <metadata> element.
#
# Parameters:
#   $textref  - reference to a string containing <metadata>...</metadata>
#   $metadata - hash ref; each extracted value is pushed onto the array
#               stored under its metadata name
#
# Only Dublin Core style content is handled: if a <dc> or <oai_dc:dc>
# wrapper is present it is stripped, then every <name>value</name> pair is
# recorded.  A leading "dc:" is removed from the name and its first letter
# is upper-cased; "Identifier" is stored as "URL".  Each pair is also added
# to the pretty-printed metadata table, which is opened and closed here
# whether or not anything was found.
sub extract_oai_metadata {
    my $self = shift (@_);
    my ($textref, $metadata) = @_;

    # Only handles DC metadata

    $self->open_prettyprint_metadata_table();

    if ($$textref =~ m/<metadata\s*>(.*?)<\/metadata\s*>/s)
    {
        # Lexical on purpose: this used to be an undeclared package global.
        my $metadata_text = $1;
        $metadata_text =~ s/^.*?<(oai_dc:)?dc.*?>(.*?)<\/(oai_dc:)?dc>.*?/$2/s;

        # Consume one element per iteration; $4 is the unparsed remainder.
        while ($metadata_text =~ m/<(.*?)>(.*?)<\/(.*?)>(.*)/s)
        {
            my $metaname = $1;
            my $metavalue = $2;
            $metadata_text = $4;

            $metaname =~ s/^(dc:)?(.)/\u$2/;

            if ($metaname eq "Identifier")
            {
                # name clashes with GSDL reserved metadata name for hash id
                $metaname = "URL";
            }

            if (defined $metadata->{$metaname})
            {
                push(@{$metadata->{$metaname}}, $metavalue);
            }
            else
            {
                $metadata->{$metaname} = [ $metavalue ];
            }

            $self->add_prettyprint_metadata_line($metaname, $metavalue);
        }
    }

    $self->close_prettyprint_metadata_table();
}
370
3711;
Note: See TracBrowser for help on using the repository browser.