source: trunk/gsdl/perllib/plugins/OAIPlug.pm@ 9893

Last change on this file since 9893 was 9853, checked in by kjdon, 19 years ago

fixed up maxdocs - now pass an extra parameter to the read function

  • Property svn:keywords set to Author Date Id Revision
File size: 6.7 KB
Line 
1###########################################################################
2#
3# OAIPlug.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package OAIPlug;
28
29use BasPlug;
30use unicode;
31use util;
32use parsargv;
33
34sub BEGIN {
35 @OAIPlug::ISA = ('BasPlug');
36}
37
38my $arguments =
39 [ { 'name' => "process_exp",
40 'desc' => "{BasPlug.process_exp}",
41 'type' => "regexp",
42 'reqd' => "no",
43 'deft' => &get_default_process_exp() },
44 ];
45
46my $options = { 'name' => "OAIPlug",
47 'desc' => "{OAIPlug.desc}",
48 'abstract' => "no",
49 'inherits' => "yes",
50 'args' => $arguments };
51
52sub new {
53 my $class = shift (@_);
54 my $self = new BasPlug ($class, @_);
55 $self->{'plugin_type'} = "OAIPlug";
56 # 14-05-02 To allow for proper inheritance of arguments - John Thompson
57 my $option_list = $self->{'option_list'};
58 push( @{$option_list}, $options );
59
60 if (!parsargv::parse(\@_,
61 "allow_extra_options")) {
62
63 print STDERR "\nIncorrect options passed to OAIPlug, check your collect.cfg configuration file\n";
64 $self->print_txt_usage(""); # Use default resource bundle
65 die "\n";
66 }
67
68 return bless $self, $class;
69}
70
71sub get_default_process_exp {
72 my $self = shift (@_);
73
74 return q^(?i)(\.oai)$^;
75}
76
77
78sub read {
79 my $self = shift (@_);
80
81 my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
82
83 my $outhandle = $self->{'outhandle'};
84
85 my $filename = $file;
86 $filename = &util::filename_cat ($base_dir, $file) if $base_dir =~ /\w/;
87
88 return 0 if ((-d $filename) && ($filename =~ m/srcdocs$/));
89
90 if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
91 $self->{'num_blocked'} ++;
92 return 0;
93 }
94 if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
95 return undef;
96 }
97 $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
98
99 # Do encoding stuff
100 my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);
101
102 ####
103 # Above code exactly the same as in BasPlug
104 # => consider making supporting function?
105 ###
106
107 # read in file ($text will be in utf8)
108 my $text = "";
109 $self->read_file ($filename, $encoding, $language, \$text);
110
111 if (!length ($text)) {
112 print $outhandle "$plugin_name: ERROR: $file contains no text\n" if $self->{'verbosity'};
113 return 0;
114 }
115
116 print STDERR "<Processing n='$file' p='OAIPlug'>\n" if ($gli);
117 print $outhandle "OAIPlug: extracting metadata from $file\n"
118 if ($self->{'verbosity'}>1);
119
120 $self->extract_oai_metadata(\$text,$metadata);
121
122 my $url_array = $metadata->{'URL'};
123
124 if (defined $url_array && ($url_array->[0] !~ m/^http:/))
125 {
126 ## my $source_file = &util::filename_cat($base_dir, $file);
127
128 my $url_base_dir = &util::filename_head($filename);
129
130## print STDERR "*** url base dir = $url_base_dir/$url_array->[0]\n";
131 print $outhandle "OAIPlug: passing metadata on to $url_array->[0]\n"
132 if ($self->{'verbosity'}>1);
133
134 return &plugin::read ($pluginfo, $url_base_dir, $url_array->[0],
135 $metadata, $processor, $maxdocs, $total_count, $gli);
136 }
137 else
138 {
139 # create a new document
140 my $doc_obj = new doc ($filename, "indexed_doc");
141 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
142 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
143 $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Plugin", "$self->{'plugin_type'}");
144 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "OAI");
145 $doc_obj->add_metadata($doc_obj->get_top_section(), "FileSize", (-s $filename));
146
147
148 # include any metadata passed in from previous plugins
149 # note that this metadata is associated with the top level section
150 $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
151
152
153 # do plugin specific processing of doc_obj
154 unless (defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
155 print STDERR "<ProcessingError n='$file'>\n" if ($gli);
156 return -1;
157 }
158
159 # do any automatic metadata extraction
160 $self->auto_extract_metadata ($doc_obj);
161
162 # add an OID
163 $doc_obj->set_OID();
164
165 # process the document
166 $processor->process($doc_obj);
167
168 return 1; # processed the file
169 }
170}
171
172
173# do plugin specific processing of doc_obj
174sub process {
175 my $self = shift (@_);
176 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
177 my $outhandle = $self->{'outhandle'};
178
179 print STDERR "<Processing n='$file' p='OAIPlug'>\n" if ($gli);
180 print $outhandle "OAIPlug: processing $file\n"
181 if $self->{'verbosity'} > 1;
182
183 my $cursection = $doc_obj->get_top_section();
184
185## $self->extract_metadata ($textref, $metadata, $doc_obj, $cursection);
186
187 # add text to document object
188
189# $$textref =~ s/<(.*?)>/$1 /g;
190 $$textref =~ s/</&lt;/g;
191 $$textref =~ s/>/&gt;/g;
192
193## print STDERR "*** adding text: $$textref\n";
194
195 $doc_obj->add_utf8_text($cursection, $$textref);
196
197 return 1;
198}
199
200
201
202sub extract_oai_metadata {
203 my $self = shift (@_);
204 my ($textref, $metadata) = @_;
205 my $outhandle = $self->{'outhandle'};
206
207
208 if ($$textref =~ m/<metadata>(.*?)<\/metadata>/s)
209 {
210 $metadata_text = $1;
211 $metadata_text =~ s/^.*?<(oai_dc:)?dc.*?>(.*?)<\/(oai_dc:)?dc>.*?/$2/s;
212
213 while ($metadata_text =~ m/<(.*?)>(.*?)<\/(.*?)>(.*)/s)
214 {
215 # if URL given for document as identifier metadata, store it ...
216 # $doc_obj->add_utf8_metadata($cursection, "URL", $web_url);
217 my $metaname = $1;
218 my $metavalue = $2;
219 $metadata_text = $4;
220
221 $metaname =~ s/^(dc:)?(.)/\u$2/;
222
223 if ($metaname eq "Identifier")
224 {
225 # name clashes with GSDL reserved metadata name for hash id
226 $metaname = "URL";
227 }
228
229 if (defined $metadata->{$metaname})
230 {
231 push(@{$metadata->{$metaname}},$metavalue);
232
233 }
234 else
235 {
236 $metadata->{$metaname} = [ $metavalue ];
237 }
238
239
240 }
241 }
242}
243
2441;
Note: See TracBrowser for help on using the repository browser.