source: trunk/gsdl/perllib/plugins/OAIPlug.pm@ 5924

Last change on this file since 5924 was 5924, checked in by kjdon, 20 years ago

changed the new metadata to eg WordPlug instead of Word, cos a clash with Image

  • Property svn:keywords set to Author Date Id Revision
File size: 6.3 KB
Line 
1###########################################################################
2#
3# OAIPlug.pm -- basic Open Archives Initiative (OAI) plugin
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright (C) 1999 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package OAIPlug;
28
29use BasPlug;
30use unicode;
31use util;
32use parsargv;
33
# OAIPlug is a BasPlug subclass; the inheritance list is set at compile
# time so it is in place before any method of this package is resolved.
BEGIN {
    @ISA = qw(BasPlug);
}
37
# Option description for this plugin; new() appends it to the option list
# built up by the parent class.
#   name     - plugin name
#   desc     - description placeholder, written as a {key} in braces
#   inherits - "yes" for this plugin
my $options = {
    'name'     => "OAIPlug",
    'desc'     => "{OAIPlug.desc}",
    'inherits' => "yes",
};
41
42# sub print_usage {
43# print STDERR "\n usage: plugin OAIPlug [options]\n\n";
44# print STDERR " currently no options:\n";
45# }
46
# Constructor.
#
# Builds the parent BasPlug object, tags it with the plugin type
# "OAIPlug", appends this plugin's option description to the object's
# option list, and then checks the remaining arguments with parsargv.
#
# Arguments: the class name followed by the plugin's option arguments.
# Returns:   the plugin object, blessed into $class.
# Dies (after printing the usage text to STDERR) when parsargv::parse
# rejects the arguments.
sub new {
    my $class = shift;

    my $self = BasPlug->new($class, @_);
    $self->{'plugin_type'} = "OAIPlug";

    # 14-05-02 To allow for proper inheritance of arguments - John Thompson
    my $inherited_options = $self->{'option_list'};
    push @{$inherited_options}, $options;

    # This plugin defines no options of its own; anything extra is
    # tolerated because of "allow_extra_options".
    unless (parsargv::parse(\@_, "allow_extra_options")) {
        print STDERR "\nIncorrect options passed to OAIPlug, check your collect.cfg configuration file\n";
        $self->print_txt_usage("");    # Use default resource bundle
        die "\n";
    }

    return bless $self, $class;
}
65
# Default regular expression (as a string) selecting the files this plugin
# processes: any name ending in ".oai", matched case-insensitively.
sub get_default_process_exp {
    my ($self) = @_;

    return q{(?i)(\.oai)$};
}
71
72
# Read one file and, if it is an OAI record, import it.
#
# Arguments (after $self):
#   $pluginfo  - handed on to plugin::read when the record points at
#                another file, and to process() otherwise
#   $base_dir  - directory $file is relative to (ignored when it contains
#                no word characters)
#   $file      - name of the file to read
#   $metadata  - hash ref of metadata passed in from previous plugins; the
#                values extracted from the record are added to it in place
#   $processor - object whose process() method receives the new document
#   $maxdocs   - passed through unchanged to plugin::read
#
# Returns:
#   undef - the file does not match process_exp, is not a plain file, or
#           process() returned undef
#   0     - the file was skipped: a directory ending in "orig", a name
#           matching block_exp, or a file with no text
#   1     - a document was created and given to $processor
#   When the record's first URL value does not start with "http:", the
#   return value is whatever plugin::read returns for that file.
sub read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;

    my $outhandle = $self->{'outhandle'};

    my $filename = $file;
    $filename = util::filename_cat($base_dir, $file) if $base_dir =~ /\w/;

    # NOTE(review): the dot is unescaped, so any directory whose name ends
    # in "<char>orig" is skipped, not just "*.orig" - left as is because the
    # comment below says this preamble mirrors BasPlug; confirm before
    # tightening.
    return 0 if ((-d $filename) && ($filename =~ m/.orig$/));

    if ($self->{'block_exp'} ne "" && $filename =~ /$self->{'block_exp'}/) {
        $self->{'num_blocked'} ++;
        return 0;
    }
    if ($filename !~ /$self->{'process_exp'}/ || !-f $filename) {
        return undef;
    }
    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up

    # Do encoding stuff
    my ($language, $encoding) = $self->textcat_get_language_encoding ($filename);

    ####
    # Above code exactly the same as in BasPlug
    # => consider making supporting function?
    ###

    # read in file ($text will be in utf8)
    my $text = "";
    $self->read_file ($filename, $encoding, $language, \$text);

    if (!length ($text)) {
        # The original interpolated an undeclared $plugin_name here, which
        # produced a message starting with ": ERROR:"; name the plugin
        # explicitly, as the other messages in this sub do.
        print $outhandle "OAIPlug: ERROR: $file contains no text\n" if $self->{'verbosity'};
        return 0;
    }

    print $outhandle "OAIPlug: extracting metadata from $file\n"
        if ($self->{'verbosity'}>1);

    # Adds the record's Dublin Core values to $metadata; identifiers are
    # stored under "URL".
    $self->extract_oai_metadata(\$text,$metadata);

    my $url_array = $metadata->{'URL'};

    # NOTE(review): only "http:" is treated as a remote URL; any other
    # scheme (e.g. "https:") is taken to be a file name - confirm whether
    # that is still intended.
    if (defined $url_array && ($url_array->[0] !~ m/^http:/))
    {
        # The first identifier is taken to be a file name relative to the
        # directory holding the .oai file: hand that file, together with
        # the metadata gathered so far, back to the plugin pipeline.
        my $url_base_dir = $filename;
        $url_base_dir =~ s/^(.*)\/(.*?)$/$1/;   # drop the last path component

        print $outhandle "OAIPlug: passing metadata on to $url_array->[0]\n"
            if ($self->{'verbosity'}>1);

        return plugin::read($pluginfo, $url_base_dir, $url_array->[0],
                            $metadata, $processor, $maxdocs);
    }
    else
    {
        # No URL, or a remote one: the OAI record itself becomes the document.
        my $doc_obj = doc->new($filename, "indexed_doc");
        my $top_section = $doc_obj->get_top_section();
        $doc_obj->add_utf8_metadata($top_section, "Language", $language);
        $doc_obj->add_utf8_metadata($top_section, "Encoding", $encoding);
        $doc_obj->add_utf8_metadata($top_section, "$self->{'plugin_type'}", "1");

        # include any metadata passed in from previous plugins
        # note that this metadata is associated with the top level section
        $self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);

        # do plugin specific processing of doc_obj
        return undef unless defined ($self->process (\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj));

        # do any automatic metadata extraction
        $self->auto_extract_metadata ($doc_obj);

        # add an OID
        $doc_obj->set_OID();

        # process the document
        $processor->process($doc_obj);

        return 1; # processed the file
    }
}
159
160
# Plugin specific processing of doc_obj: the record text is escaped and
# stored as the text of the document's top section.
#
# Arguments (after $self):
#   $textref  - reference to the record text; it is modified in place
#               ("<" becomes "&lt;" and ">" becomes "&gt;")
#   $pluginfo, $base_dir, $metadata - accepted but not used here
#   $file     - file name, used only in the progress message
#   $doc_obj  - document object that receives the text
#
# Returns 1.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    my $log = $self->{'outhandle'};
    if ($self->{'verbosity'} > 1) {
        print $log "OAIPlug: processing $file\n";
    }

    my $top = $doc_obj->get_top_section();

    # Escape the angle brackets so the markup of the record is stored as
    # literal text. One pass is enough: neither replacement introduces
    # a new "<" or ">".
    my %entity_for = ('<' => '&lt;', '>' => '&gt;');
    $$textref =~ s/([<>])/$entity_for{$1}/g;

    # add text to document object
    $doc_obj->add_utf8_text($top, $$textref);

    return 1;
}
186
187
188
# Pull the Dublin Core values out of an OAI record and add them to a
# metadata hash.
#
# Arguments (after $self):
#   $textref  - reference to the text of the OAI record (not modified)
#   $metadata - hash ref mapping metadata name to an array ref of values;
#               extracted values are appended to it in place
#
# Only the first <metadata>...</metadata> section is examined. Each element
# inside it contributes one value under its tag name with any leading "dc:"
# removed and the first letter upper-cased (dc:title -> Title). Attributes
# on the opening tag (e.g. xml:lang) are ignored, and self-closing elements
# carry no value and are skipped. "Identifier" is stored as "URL".
#
# Returns nothing useful.
sub extract_oai_metadata {
    my $self = shift (@_);
    my ($textref, $metadata) = @_;

    # Nothing to extract unless the record has a <metadata> section.
    return unless $$textref =~ m/<metadata>(.*?)<\/metadata>/s;

    # Lexical: the original assigned a package global here, which outlived
    # the call.
    my $metadata_text = $1;

    # Reduce the text to the content of the <oai_dc:dc ...> (or <dc ...>)
    # wrapper element, when there is one.
    $metadata_text =~ s/^.*?<(?:oai_dc:)?dc.*?>(.*?)<\/(?:oai_dc:)?dc>.*?/$1/s;

    # One element per iteration:
    #   $1 - tag name only (attributes are matched but not captured)
    #   $2 - element content, up to the closing tag with the same name
    #   $3 - everything after the element, scanned on the next iteration
    # The (?<!\/) look-behind rejects self-closing tags such as <dc:x/>.
    while ($metadata_text =~ m/<([^\s>\/]+)[^>]*(?<!\/)>(.*?)<\/\1\s*>(.*)/s)
    {
        my ($metaname, $metavalue, $remainder) = ($1, $2, $3);
        $metadata_text = $remainder;

        # dc:title -> Title
        $metaname =~ s/^(dc:)?(.)/\u$2/;

        if ($metaname eq "Identifier")
        {
            # name clashes with GSDL reserved metadata name for hash id
            $metaname = "URL";
        }

        if (defined $metadata->{$metaname})
        {
            push(@{$metadata->{$metaname}}, $metavalue);
        }
        else
        {
            $metadata->{$metaname} = [ $metavalue ];
        }
    }

    return;
}
229
2301;
Note: See TracBrowser for help on using the repository browser.