source: trunk/gsdl/perllib/plugins/OAIPlug.pm@ 7243

Last change on this file since 7243 was 6408, checked in by jmt12, 20 years ago

Added two new attributes for script arguments. HiddenGLI controls whether the argument will be visible at all in GLI, while ModeGLI defines the lowest detail mode under which the argument will be visible (only really for import and buildcol). Also ensured that the scripts were reporting their correct default process expressions, and further refined argument types by adding the category regexp for any regular expression (which can then be hidden under lower detail modes in GLI).

  • Property svn:keywords set to Author Date Id Revision
File size: 6.5 KB
Line 
1###########################################################################
2#
3# OAIPlug.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package OAIPlug;
28
29use BasPlug;
30use unicode;
31use util;
32use parsargv;
33
# Set up inheritance at compile time: OAIPlug derives from BasPlug.
BEGIN {
    our @ISA = ('BasPlug');
}
37
# Argument descriptors for this plugin. The only argument is process_exp,
# whose default value is whatever get_default_process_exp() returns.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
];

# Option record describing the plugin as a whole; the constructor pushes it
# onto the object's 'option_list'.
my $options = {
    'name'     => "OAIPlug",
    'desc'     => "{OAIPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
51
52# sub print_usage {
53# print STDERR "\n usage: plugin OAIPlug [options]\n\n";
54# print STDERR " currently no options:\n";
55# }
56
# Constructor.
#
# Builds the underlying BasPlug object, marks it as an OAIPlug, appends this
# plugin's option record to the object's 'option_list' and then checks the
# remaining constructor arguments with parsargv::parse.
#
# Arguments: the class name followed by the plugin options; the options are
#            forwarded unchanged to BasPlug's constructor.
# Returns:   the object, blessed into $class.
# Dies:      with "\n" (after printing a message and the usage text to
#            STDERR) when parsargv::parse rejects the arguments.
sub new {
    my $class = shift(@_);

    # Explicit class-method call; the indirect-object form ("new BasPlug ...")
    # is ambiguous to the parser and is best avoided.
    my $self = BasPlug->new($class, @_);
    $self->{'plugin_type'} = "OAIPlug";

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $option_list = $self->{'option_list'};
    push(@{$option_list}, $options);

    if (!parsargv::parse(\@_, "allow_extra_options")) {
        print STDERR "\nIncorrect options passed to OAIPlug, check your collect.cfg configuration file\n";
        $self->print_txt_usage("");    # Use default resource bundle
        die "\n";
    }

    return bless $self, $class;
}
75
# Return the default regular expression (as a string) used to decide which
# files this plugin handles: names ending in ".oai", matched
# case-insensitively. The optional invocant is accepted but not used, so the
# routine works both as a method and as a plain function call.
sub get_default_process_exp {
    my ($self) = @_;

    my $default_exp = q^(?i)(\.oai)$^;
    return $default_exp;
}
81
82
# Read one OAI record file and either turn it into a document or hand its
# metadata on to the file the record points at.
#
# Arguments (after the invocant):
#   $pluginfo  - passed through to plugin::read and to process()
#   $base_dir  - directory $file is relative to (ignored unless it contains
#                a word character)
#   $file      - file name to read
#   $metadata  - hash ref of metadata; extended in place with the values
#                found in the record
#   $processor - object whose process() method receives the new document
#   $maxdocs   - passed through to plugin::read
#
# Returns:
#   undef - the file does not match 'process_exp' or is not a plain file,
#           or process() returned undef
#   0     - the path is a "srcdocs" directory, the file matches 'block_exp',
#           or the file contained no text
#   1     - a document was created and passed to the processor
#   otherwise the return value of plugin::read when the record's URL is a
#   local (non http/https) reference.
sub read {
    my $self = shift(@_);

    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;

    my $outhandle = $self->{'outhandle'};

    my $filename = $file;
    $filename = &util::filename_cat($base_dir, $file) if $base_dir =~ /\w/;

    return 0 if ((-d $filename) && ($filename =~ m/srcdocs$/));

    if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
        $self->{'num_blocked'}++;
        return 0;
    }
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return undef;
    }
    $file =~ s/^[\/\\]+//;    # $file often begins with / so we'll tidy it up

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding($filename);

    ####
    # Above code exactly the same as in BasPlug
    # => consider making supporting function?
    ###

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file($filename, $encoding, $language, \$text);

    if (!length($text)) {
        # The original interpolated an undeclared $plugin_name here, which
        # produced a message with an empty plugin name.
        print $outhandle "OAIPlug: ERROR: $file contains no text\n" if $self->{'verbosity'};
        return 0;
    }

    print $outhandle "OAIPlug: extracting metadata from $file\n"
        if ($self->{'verbosity'} > 1);

    $self->extract_oai_metadata(\$text, $metadata);

    my $url_array = $metadata->{'URL'};

    # A URL that is not an http/https address is taken to name a file that
    # sits next to the record; the metadata is passed on to whichever plugin
    # handles that file. https is accepted alongside http so that secure URLs
    # are not mistaken for local file names.
    if (defined $url_array && ($url_array->[0] !~ m/^https?:/)) {
        # Directory part of $filename (everything before the last "/").
        my $url_base_dir = $filename;
        $url_base_dir =~ s/^(.*)\/(.*?)$/$1/;

        print $outhandle "OAIPlug: passing metadata on to $url_array->[0]\n"
            if ($self->{'verbosity'} > 1);

        return plugin::read($pluginfo, $url_base_dir, $url_array->[0],
                            $metadata, $processor, $maxdocs);
    }
    else {
        # create a new document
        my $doc_obj = doc->new($filename, "indexed_doc");
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Language", $language);
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "Encoding", $encoding);
        $doc_obj->add_utf8_metadata($doc_obj->get_top_section(), "$self->{'plugin_type'}", "1");

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata($doc_obj, $doc_obj->get_top_section(), $metadata);

        # do plugin specific processing of doc_obj
        return undef
            unless defined($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));

        # do any automatic metadata extraction
        $self->auto_extract_metadata($doc_obj);

        # add an OID
        $doc_obj->set_OID();

        # process the document
        $processor->process($doc_obj);

        return 1;    # processed the file
    }
}
169
170
# Plugin specific processing of doc_obj.
#
# Escapes the angle brackets in the record text (modifying the caller's
# string through $textref) and stores the result as the text of the
# document's top section.
#
# Arguments (after the invocant): $textref (ref to the record text),
# $pluginfo, $base_dir, $file, $metadata, $doc_obj, and an optional $gli
# flag that, when true, emits a <Processing> progress tag on STDERR.
# Returns 1.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    if ($gli) {
        print STDERR "<Processing n='$file' p='OAIPlug'>\n";
    }
    if ($self->{'verbosity'} > 1) {
        print $outhandle "OAIPlug: processing $file\n";
    }

    my $top_section = $doc_obj->get_top_section();

    # Make the markup displayable as plain text: "<" and ">" become entities.
    for my $markup ($$textref) {
        $markup =~ s/</&lt;/g;
        $markup =~ s/>/&gt;/g;
    }

    # add text to document object
    $doc_obj->add_utf8_text($top_section, $$textref);

    return 1;
}
197
198
199
# Pull the metadata elements out of an OAI record and add them to $metadata.
#
# Arguments (after the invocant):
#   $textref  - reference to the record text; not modified
#   $metadata - hash ref mapping a metadata name to an array ref of values;
#               new values are appended to any that are already present
#
# Only the first <metadata>...</metadata> section is examined. If it holds a
# <dc> or <oai_dc:dc> wrapper, the elements inside the wrapper are used.
# For every element the name is taken from the opening tag: attributes are
# dropped, a leading "dc:" prefix is removed and the first letter is
# upper-cased. "Identifier" is stored as "URL".
#
# The return value is not meaningful.
sub extract_oai_metadata {
    my $self = shift(@_);
    my ($textref, $metadata) = @_;

    return unless $$textref =~ m/<metadata>(.*?)<\/metadata>/s;

    # Lexical copy: the original stored this in an undeclared package global.
    my $metadata_text = $1;
    $metadata_text =~ s/^.*?<(oai_dc:)?dc.*?>(.*?)<\/(oai_dc:)?dc>.*?/$2/s;

    # Walk the elements left to right; /g resumes after the previous closing
    # tag, so the remaining text does not have to be copied on every pass.
    while ($metadata_text =~ m/<(.*?)>(.*?)<\/.*?>/sg) {
        my ($metaname, $metavalue) = ($1, $2);

        # Drop any attributes (e.g. xml:lang="en") so that they do not end
        # up as part of the metadata name.
        $metaname =~ s/^(\S+)\s.*\z/$1/s;

        $metaname =~ s/^(dc:)?(.)/\u$2/;

        # if URL given for document as identifier metadata, store it ...
        # name clashes with GSDL reserved metadata name for hash id
        $metaname = "URL" if $metaname eq "Identifier";

        push(@{ $metadata->{$metaname} }, $metavalue);
    }

    return;
}
240
2411;
Note: See TracBrowser for help on using the repository browser.