source: trunk/gsdl/perllib/plugins/OAIPlug.pm@ 8121

Last change on this file since 8121 was 8121, checked in by chi, 20 years ago

Add the "FileFormat" metadata to each of the Plugins.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.5 KB
Line 
1###########################################################################
2#
3# OAIPlug.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package OAIPlug;
28
29use BasPlug;
30use unicode;
31use util;
32use parsargv;
33
# Establish inheritance at compile time: OAIPlug is a subclass of BasPlug.
BEGIN {
    @ISA = qw(BasPlug);
}
37
# Descriptors for the command-line/collect.cfg arguments this plugin
# declares itself.  The only one is process_exp, whose default comes from
# get_default_process_exp().
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
];

# Option record describing this plugin; new() appends it to the option
# list held by the object.
my $options = {
    'name'     => "OAIPlug",
    'desc'     => "{OAIPlug.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
51
# Constructor.
#
# Builds the object through BasPlug's constructor, tags it as an OAIPlug,
# appends this plugin's option record to the object's option list and
# validates the remaining arguments with parsargv::parse.  Dies (after
# printing usage information) when the arguments are rejected.
#
# Returns the object blessed into $class.
sub new {
    my $class = shift;

    my $self = BasPlug->new($class, @_);
    $self->{'plugin_type'} = "OAIPlug";

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $inherited_options = $self->{'option_list'};
    push @{$inherited_options}, $options;

    my $args_ok = parsargv::parse(\@_, "allow_extra_options");
    unless ($args_ok) {
        print STDERR "\nIncorrect options passed to OAIPlug, check your collect.cfg configuration file\n";
        $self->print_txt_usage("");  # Use default resource bundle
        die "\n";
    }

    bless $self, $class;
    return $self;
}
70
# Default value for the process_exp argument: a case-insensitive regular
# expression (returned as a string) matching file names that end in ".oai".
sub get_default_process_exp {
    my ($self) = @_;

    return '(?i)(\.oai)$';
}
76
77
# Read one OAI record file and turn it into a document.
#
# Arguments (after $self):
#   $pluginfo  - passed on to plugin::read and to process()
#   $base_dir  - directory $file is relative to; only used when it
#                contains at least one word character
#   $file      - name of the record file
#   $metadata  - hash ref (name => array ref of values); metadata extracted
#                from the record is added to it in place
#   $processor - object whose process() method receives the finished document
#   $maxdocs   - passed through unchanged to plugin::read
#
# Returns:
#   undef - the file does not match process_exp or is not a plain file
#   0     - the file is a "srcdocs" directory, matched block_exp, or
#           contained no text
#   -1    - process() returned undef
#   1     - a document was built and handed to $processor
#   When the first URL value of the record does not look like a web
#   address, the result of plugin::read on that value is returned instead.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;

    my $outhandle = $self->{'outhandle'};

    my $filename = $file;
    $filename = util::filename_cat($base_dir, $file) if $base_dir =~ /\w/;

    return 0 if ((-d $filename) && ($filename =~ m/srcdocs$/));

    if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
        $self->{'num_blocked'}++;
        return 0;
    }
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        # undef (rather than 0) is returned deliberately here, as in the
        # original code, so callers can tell it apart from the 0 cases.
        return undef;
    }
    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding($filename);

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file($filename, $encoding, $language, \$text);

    if (!length($text)) {
        # The message used to interpolate an undeclared $plugin_name and so
        # was printed without any plugin prefix.
        print $outhandle "OAIPlug: ERROR: $file contains no text\n" if $self->{'verbosity'};
        return 0;
    }

    print $outhandle "OAIPlug: extracting metadata from $file\n"
        if ($self->{'verbosity'} > 1);

    $self->extract_oai_metadata(\$text, $metadata);

    my $url_array = $metadata->{'URL'};

    # A first URL value that is not an http/https address is treated as the
    # name of a file stored next to the record.
    if (defined $url_array && ($url_array->[0] !~ m/^https?:/i)) {
        # Directory part of the record's own path.  Either kind of path
        # separator is accepted; a path without any separator has an empty
        # directory part (previously the whole file name was passed on as
        # the "directory" in that case).
        my $url_base_dir = ($filename =~ m/^(.*)[\/\\][^\/\\]*$/s) ? $1 : "";

        print $outhandle "OAIPlug: passing metadata on to $url_array->[0]\n"
            if ($self->{'verbosity'} > 1);

        return plugin::read($pluginfo, $url_base_dir, $url_array->[0],
                            $metadata, $processor, $maxdocs);
    }

    # create a new document
    # (arrow call: "new doc (...)" can be parsed as a plain function call
    # in a package that defines its own new())
    my $doc_obj = doc->new($filename, "indexed_doc");
    my $top_section = $doc_obj->get_top_section();
    $doc_obj->add_utf8_metadata($top_section, "Language", $language);
    $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
    $doc_obj->add_utf8_metadata($top_section, "Plugin", "$self->{'plugin_type'}");
    $doc_obj->add_metadata($top_section, "FileFormat", "OAI");

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata($doc_obj, $top_section, $metadata);

    # do plugin specific processing of doc_obj
    return -1 unless defined ($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));

    # do any automatic metadata extraction
    $self->auto_extract_metadata($doc_obj);

    # add an OID
    $doc_obj->set_OID();

    # process the document
    $processor->process($doc_obj);

    return 1; # processed the file
}
166
167
168# do plugin specific processing of doc_obj
# Plugin-specific document processing.
#
# Replaces every "<" and ">" in the text (in place, through $textref) with
# "&lt;" and "&gt;" and adds the result as the text of the document's top
# section.  When $gli is true a "<Processing ...>" marker is written to
# STDERR; when verbosity is above 1 a progress line goes to the plugin's
# output handle.
#
# Always returns 1.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $out = $self->{'outhandle'};

    if ($gli) {
        print STDERR "<Processing n='$file' p='OAIPlug'>\n";
    }
    if ($self->{'verbosity'} > 1) {
        print $out "OAIPlug: processing $file\n";
    }

    my $top_section = $doc_obj->get_top_section();

    # Escape the angle brackets in a single pass.
    my %entity_for = ('<' => '&lt;', '>' => '&gt;');
    $$textref =~ s/([<>])/$entity_for{$1}/g;

    # add text to document object
    $doc_obj->add_utf8_text($top_section, $$textref);

    return 1;
}
194
195
196
# Pull the simple elements out of the <metadata> section of an OAI record
# and add them to $metadata.
#
# Arguments (after $self):
#   $textref  - reference to the record text (not modified)
#   $metadata - hash ref (name => array ref of values), updated in place
#
# For each element the name has an optional "dc:" prefix removed and its
# first character upper-cased; attributes on the opening tag are ignored.
# "Identifier" is stored under "URL" instead.  Repeated elements accumulate
# in the same array.  Self-closing elements carry no value and are skipped.
#
# Returns nothing useful.
sub extract_oai_metadata {
    my $self = shift (@_);
    my ($textref, $metadata) = @_;

    return unless $$textref =~ m/<metadata>(.*?)<\/metadata>/s;

    # Lexical, so the working copy no longer leaks into a package global.
    my $metadata_text = $1;

    # Keep only the content of the <dc> / <oai_dc:dc> wrapper when there is
    # one; the greedy tail also discards whatever follows the closing tag.
    $metadata_text =~ s/^.*?<(?:oai_dc:)?dc(?:\s[^>]*)?>(.*?)<\/(?:oai_dc:)?dc>.*\z/$1/s;

    # Match one element at a time: the name stops at the first whitespace
    # so attributes stay out of it, the (?<!\/) rejects self-closing tags,
    # and \1 ties the closing tag to the opening one.
    while ($metadata_text =~ m/<([^\s>\/]+)(?:\s[^>]*)?(?<!\/)>(.*?)<\/\1\s*>(.*)/s) {
        my ($metaname, $metavalue, $remainder) = ($1, $2, $3);
        $metadata_text = $remainder;

        $metaname =~ s/^(?:dc:)?(.)/\u$1/;

        # if URL given for document as identifier metadata, store it ...
        if ($metaname eq "Identifier") {
            # name clashes with GSDL reserved metadata name for hash id
            $metaname = "URL";
        }

        my $existing = $metadata->{$metaname};
        if (!defined $existing) {
            $metadata->{$metaname} = [ $metavalue ];
        }
        elsif (ref($existing) eq 'ARRAY') {
            push(@{$existing}, $metavalue);
        }
        else {
            # a plain scalar was stored earlier: keep it and add the new value
            $metadata->{$metaname} = [ $existing, $metavalue ];
        }
    }

    return;
}
238
2391;
Note: See TracBrowser for help on using the repository browser.