source: trunk/gsdl/perllib/plugins/OAIPlug.pm@ 9403

Last change on this file since 9403 was 9352, checked in by davidb, 19 years ago

Processing lines for OAIPlug were not appearing in GLI (librarian mode).
This is because it doesn't "process" the files in the traditional manner
-> instead it passes the metadata on to a source document. An extra print
statement was added, active when $gli=1, to generate the message for GLI at
the point where the plugin reports that it is passing on the extracted metadata to ...

  • Property svn:keywords set to Author Date Id Revision
File size: 6.6 KB
Line 
###########################################################################
#
# OAIPlug.pm -- basic Open Archives Initiative (OAI) plugin
#
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################
26
27package OAIPlug;
28
29use BasPlug;
30use unicode;
31use util;
32use parsargv;
33
# Set up inheritance at compile time: OAIPlug derives from BasPlug.
sub BEGIN {
    @OAIPlug::ISA = ('BasPlug');
}
37
# Argument descriptions specific to this plugin.  The only argument is
# process_exp, whose default comes from get_default_process_exp().
# NOTE(review): the "{...}" desc strings look like resource-bundle keys
# that are resolved when usage text is printed -- confirm against BasPlug.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
];

# Option record for this plugin; new() appends it to the option list
# it receives from the BasPlug constructor.
my $options = {
    'name'     => "OAIPlug",
    'desc'     => "{OAIPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
51
# Constructor.
#
# Builds the object through the BasPlug constructor, tags it with the
# plugin type "OAIPlug", appends this plugin's option record to the
# object's option list, and then checks the remaining arguments with
# parsargv::parse (extra options are tolerated).
#
# Dies (after printing usage to STDERR) if argument parsing fails.
# Returns the object blessed into $class.
sub new {
    my $class = shift (@_);

    # Explicit class-method call instead of the indirect-object form
    # "new BasPlug (...)", whose parse depends on what names are in scope.
    my $self = BasPlug->new($class, @_);
    $self->{'plugin_type'} = "OAIPlug";

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push(@{$option_list}, $options);

    if (!parsargv::parse(\@_, "allow_extra_options")) {
        print STDERR "\nIncorrect options passed to OAIPlug, check your collect.cfg configuration file\n";
        $self->print_txt_usage("");  # Use default resource bundle
        die "\n";
    }

    return bless $self, $class;
}
70
# Return the default process_exp: a case-insensitive pattern matching
# file names that end in ".oai".
#
# The invocant is accepted but unused, so this works both as a method and
# as the plain function call used when the argument table is built.
sub get_default_process_exp {
    my $self = shift (@_);

    return q^(?i)(\.oai)$^;
}
76
77
# Read one OAI record file.
#
# Metadata is extracted from the record into $metadata.  If the record
# carries a URL value (taken from its identifier) that does not start
# with "http:", the metadata is handed on to that file, relative to the
# record's own directory, via plugin::read.  Otherwise the record itself
# is turned into a document and given to $processor.
#
# Arguments (after the invocant):
#   $pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli
#
# Returns:
#   0      if the path is a directory ending in "srcdocs", matches
#          block_exp, or the file yields no text
#   undef  if the path does not match process_exp or is not a plain file
#   the result of plugin::read when the metadata is passed on
#   -1     if process() returns undef
#   1      once the document has been handed to $processor
sub read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    my $filename = $file;
    $filename = util::filename_cat($base_dir, $file) if $base_dir =~ /\w/;

    return 0 if ((-d $filename) && ($filename =~ m/srcdocs$/));

    if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
        $self->{'num_blocked'}++;
        return 0;
    }
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return undef;
    }
    $file =~ s/^[\/\\]+//;  # $file often begins with / so we'll tidy it up

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding($filename);

    ####
    # Above code exactly the same as in BasPlug
    # => consider making supporting function?
    ###

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file($filename, $encoding, $language, \$text);

    if (!length($text)) {
        # The prefix used to be an undeclared $plugin_name, which
        # interpolated as an empty string; use the same literal as the
        # other messages in this plugin.
        print {$outhandle} "OAIPlug: ERROR: $file contains no text\n"
            if $self->{'verbosity'};
        return 0;
    }

    # The GLI progress line is emitted here.  $gli is not forwarded to
    # process() below, which would otherwise print the same line again.
    print STDERR "<Processing n='$file' p='OAIPlug'>\n" if ($gli);
    print {$outhandle} "OAIPlug: extracting metadata from $file\n"
        if ($self->{'verbosity'} > 1);

    $self->extract_oai_metadata(\$text, $metadata);

    my $url_array = $metadata->{'URL'};

    if (defined $url_array && ($url_array->[0] !~ m/^http:/)) {
        # The first URL value is treated as a file path relative to the
        # directory holding the record.
        my $url_base_dir = util::filename_head($filename);

        print {$outhandle} "OAIPlug: passing metadata on to $url_array->[0]\n"
            if ($self->{'verbosity'} > 1);

        return plugin::read($pluginfo, $url_base_dir, $url_array->[0],
                            $metadata, $processor, $maxdocs);
    }

    # No local source document: create a new document from the record.
    my $doc_obj = doc->new($filename, "indexed_doc");
    my $top_section = $doc_obj->get_top_section();

    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_metadata($top_section, "FileFormat", "OAI");
    # This call used to go through a misspelt, undefined $dob_obj and
    # died with "Can't call method ... on an undefined value".
    $doc_obj->add_metadata($top_section, "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata($doc_obj, $doc_obj->get_top_section(), $metadata);

    # do plugin specific processing of doc_obj
    return -1
        unless defined($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));

    # do any automatic metadata extraction
    $self->auto_extract_metadata($doc_obj);

    # add an OID
    $doc_obj->set_OID();

    # process the document
    $processor->process($doc_obj);

    return 1;  # processed the file
}
168
169
# Do plugin specific processing of doc_obj.
#
# Escapes every "<" and ">" in the text as "&lt;" / "&gt;" (the string
# behind $textref is modified in place) and adds the result as the text
# of the document's top section.
#
# Arguments (after the invocant):
#   $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli
# When $gli is true a "<Processing ...>" line is printed to STDERR.
#
# Always returns 1.
sub process {
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    print STDERR "<Processing n='$file' p='OAIPlug'>\n" if ($gli);
    print {$outhandle} "OAIPlug: processing $file\n"
        if $self->{'verbosity'} > 1;

    my $cursection = $doc_obj->get_top_section();

    # The markup is kept, but escaped, so it appears as literal text in
    # the document rather than being stripped out.
    $$textref =~ s/</&lt;/g;
    $$textref =~ s/>/&gt;/g;

    # add text to document object
    $doc_obj->add_utf8_text($cursection, $$textref);

    return 1;
}
196
197
198
# Pull the simple elements out of an OAI record's <metadata> section and
# record them in the $metadata hash.
#
# Arguments (after the invocant):
#   $textref  - reference to the record text
#   $metadata - hash reference; each value is an array reference of
#               strings, and new values are appended to existing ones
#
# Element names lose a leading "dc:" and get an upper-case first letter
# ("dc:title" becomes "Title").  "Identifier" is stored as "URL" because
# the name clashes with a reserved GSDL metadata name.
#
# Does nothing if the text has no <metadata>...</metadata> element.
sub extract_oai_metadata {
    my $self = shift (@_);
    my ($textref, $metadata) = @_;

    if ($$textref =~ m/<metadata>(.*?)<\/metadata>/s) {
        # Lexical: this used to be an undeclared package global, which
        # stayed set between calls.
        my $metadata_text = $1;

        # Reduce to the content of the (oai_dc:)dc wrapper element.
        $metadata_text =~ s/^.*?<(oai_dc:)?dc.*?>(.*?)<\/(oai_dc:)?dc>.*?/$2/s;

        # Consume one <name>value</name> element per iteration; the
        # remainder of the text ($4) becomes the input for the next pass.
        while ($metadata_text =~ m/<(.*?)>(.*?)<\/(.*?)>(.*)/s) {
            my $metaname  = $1;
            my $metavalue = $2;
            $metadata_text = $4;

            $metaname =~ s/^(dc:)?(.)/\u$2/;

            if ($metaname eq "Identifier") {
                # name clashes with GSDL reserved metadata name for hash id
                $metaname = "URL";
            }

            # Autovivifies the array on first use of a name.
            push(@{ $metadata->{$metaname} }, $metavalue);
        }
    }
}
240
2411;
Note: See TracBrowser for help on using the repository browser.