source: trunk/gsdl/perllib/plugins/UnknownPlug.pm@ 11334

Last change on this file since 11334 was 11123, checked in by davidb, 18 years ago

Filename stored as URL metadata was missing from this plugin and didn't
occur through inheritance.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.6 KB
Line 
1###########################################################################
2#
3# UnknownPlug.pm -- Plugin for files you know about but Greenstone doesn't
4#
5# A component of the Greenstone digital library software from the New
6# Zealand Digital Library Project at the University of Waikato, New
7# Zealand.
8#
9# Copyright (C) 2001 Gordon W. Paynter
10# Copyright (C) 2001 New Zealand Digital Library Project
11#
12# This program is free software; you can redistribute it and/or modify
13# it under the terms of the GNU General Public License as published by
14# the Free Software Foundation; either version 2 of the License, or
15# (at your option) any later version.
16#
17# This program is distributed in the hope that it will be useful, but
18# WITHOUT ANY WARRANTY; without even the implied warranty of
19# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
20# General Public License for more details.
21#
22# You should have received a copy of the GNU General Public License
23# along with this program; if not, write to the Free Software
24# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
25#
26###########################################################################
27
28# UnknownPlug - a plugin for unknown files
29
30# This is a simple Plugin for importing files in formats that
31# Greenstone doesn't know anything about. A fictional document will
32# be created for every such file, and the file itself will be passed
33# to Greenstone as the "associated file" of the document.
34
35# Here's an example where it is useful: I have a collection of
36# pictures that include a couple of quicktime movie files with names
37# like DCP_0163.MOV. Rather than write a new plugin for quicktime
38# movies, I add this line to the collection configuration file:
39
40# plugin UnknownPlug -process_exp "*.MOV" -assoc_field "movie"
41
42# A document is created for each movie, with the associated movie
43# file's name in the "movie" metadata field. In the collection's
44# format strings, I use the {If} macro to output different text for
45# each type of file, like this:
46
47# {If}{[movie],<HTML for displaying movie>}{If}{[Image],<HTML for displaying image>}
48
49# You can also add extra metadata, such as the Title, Subject, and
50# Duration, with metadata.xml files and RecPlug. (If you want to use
51# UnknownPlug with more than one type of file, you will have to add
52# some sort of distinguishing metadata in this way.)
53
54
55
56package UnknownPlug;
57
58use BasPlug;
59
60use strict;
61no strict 'refs'; # allow filehandles to be variables and viceversa
62
# Compile-time hook: declare BasPlug as the parent class of UnknownPlug.
BEGIN {
    @UnknownPlug::ISA = qw(BasPlug);
}
66
# Descriptions of the options this plugin adds.  new() appends them to
# the "ArgList" it is given.  Every 'desc' is a "{UnknownPlug.key}"
# placeholder (presumably resolved through the string lookup tables --
# the lookup itself is not visible in this file).
my $arguments = [

    # Name of the metadata field that receives the file name;
    # associate_unknown_file() falls back to "unknown_file" when empty.
    {
        'name' => 'assoc_field',
        'desc' => '{UnknownPlug.assoc_field}',
        'type' => 'string',
        'deft' => '',
        'reqd' => 'no',
    },

    # Value stored as "FileFormat" metadata; falls back to "unknown".
    {
        'name' => 'file_format',
        'desc' => '{UnknownPlug.file_format}',
        'type' => 'string',
        'deft' => '',
        'reqd' => 'no',
    },

    # Value stored as "MimeType" metadata and passed to associate_file();
    # falls back to "unknown/unknown".
    {
        'name' => 'mime_type',
        'desc' => '{UnknownPlug.mime_type}',
        'type' => 'string',
        'deft' => '',
        'reqd' => 'no',
    },

    # Stored, wrapped in underscores, as "srcicon" metadata.
    {
        'name' => 'srcicon',
        'desc' => '{UnknownPlug.srcicon}',
        'type' => 'string',
        'deft' => 'iconunknown',
        'reqd' => 'no',
    },

    # Plain file extension; new() turns it into a process_exp pattern
    # when no -process_exp was supplied.
    {
        'name' => 'process_extension',
        'desc' => '{UnknownPlug.process_extension}',
        'type' => 'string',
        'deft' => '',
        'reqd' => 'no',
    },
];

# Plugin-level description; new() appends it to the "OptList" it is given.
my $options = {
    'name'     => 'UnknownPlug',
    'desc'     => '{UnknownPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
99
100
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - handed on to the BasPlug constructor unchanged
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays
#                      collect this plugin's argument/option descriptions
#
# Returns the object built by the BasPlug constructor, re-blessed into
# $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;
    push @{$pluginlist}, $class;

    # Register this plugin's own argument and option descriptions.
    if (defined $arguments) {
        push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    }
    if (defined $options) {
        push @{ $hashArgOptLists->{"OptList"} }, $options;
    }

    # Explicit Class->method calls replace the indirect-object form
    # "new BasPlug (...)", whose parsing depends on what is in scope.
    my $self = (defined $hashArgOptLists)
        ? BasPlug->new($pluginlist, $inputargs, $hashArgOptLists)
        : BasPlug->new($pluginlist, $inputargs);

    # "-process_extension" is a simpler alternative to -process_exp for
    # non-regexp people: "mov" becomes the pattern "\.mov$".
    my $extension = $self->{'process_extension'};
    if (!$self->{'process_exp'} && $extension) {
        # Accept ".mov" as well as "mov".  Without this the generated
        # pattern would be "\..mov$", which an ordinary "x.mov" file
        # name can never match.
        $extension =~ s/^\.//;

        # NOTE(review): the extension is still interpolated as a regexp
        # fragment (not quoted), exactly as before.
        $self->{'process_exp'} = "\\." . $extension . "\$"
            if $extension ne "";
    }

    return bless $self, $class;
}
118
# Returns this plugin's default process expression: the empty string.
# (Presumably consulted by BasPlug when no -process_exp option is
# supplied -- that code is not visible in this file.)
sub get_default_process_exp {
    return q{};
}
122
123
124# Associate the unknown file with the new document
125
# Attach the original file to $doc_obj and record the metadata that
# describes it, all on the document's top section.
#
#   $filename - filename with full path
#   $file     - filename without path
#   $doc_obj  - document object that receives the file and the metadata
#
# Returns 1 on success, 0 when either file name is missing or empty.
sub associate_unknown_file {
    my ($self, $filename, $file, $doc_obj) = @_;

    # check the filename is okay
    return 0 if !defined $file     || $file eq "";
    return 0 if !defined $filename || $filename eq "";

    # Plugin options, each with the fallback used when it is unset/empty.
    my $file_format = $self->{'file_format'} || "unknown";
    my $mime_type   = $self->{'mime_type'}   || "unknown/unknown";
    my $assoc_field = $self->{'assoc_field'} || "unknown_file";

    # Add the file as an associated file ...
    my $section = $doc_obj->get_top_section();
    $doc_obj->associate_file($filename, $file, $mime_type, $section);

    # ... and describe it; the file name goes into the configured field.
    $doc_obj->add_metadata($section, "FileFormat", $file_format);
    $doc_obj->add_metadata($section, "MimeType",   $mime_type);
    $doc_obj->add_metadata($section, $assoc_field, $file);

    # srclink / srcicon / /srclink: an opening anchor pointing at the
    # associated file, the icon macro name, and the closing anchor.
    my $anchor_open = '<a href="_httpcollection_/index/assoc/[assocfilepath]/['
        . $assoc_field . ']">';
    $doc_obj->add_metadata($section, "srclink",  $anchor_open);
    $doc_obj->add_metadata($section, "srcicon",  "_" . $self->{'srcicon'} . "_");
    $doc_obj->add_metadata($section, "/srclink", "</a>");

    return 1;
}
161
162
163
164# The UnknownPlug read() function. This function does all the right
165# things to make general options work for a given plugin. UnknownPlug
166# overrides read() because there is no need to read the actual text of
167# the file in, because the contents of the file is not text...
168#
169#
170# Return number of files processed, undef if can't process
171#
172# Note that $base_dir might be "" and that $file might include directories
173
# Build a placeholder document for one file and hand it to $processor.
#
#   $pluginfo, $base_dir - forwarded to read_block() and process()
#   $file                - file name, possibly with leading directories
#   $metadata            - metadata from earlier plugins, applied through
#                          extra_metadata()
#   $processor           - supplies 'OIDtype' and receives the finished
#                          document through its process() method
#   $maxdocs, $total_count - not used here beyond being forwarded to
#                          read_block() as part of @_
#   $gli                 - when true, <Processing>/<ProcessingError> tags
#                          are printed to STDERR
#
# Returns read_block()'s status unchanged when it is undef or 0, -1 if
# the file could not be processed, and 1 once the document was passed on.
sub read {
    # Shift (rather than copy) $self off @_ so that @_ holds exactly the
    # arguments forwarded to read_block() below.
    my $self = shift @_;
    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    my $outhandle = $self->{'outhandle'};

    # Make sure we're processing the correct file
    my ($block_status, $filename) = $self->read_block(@_);
    return $block_status if !defined $block_status || $block_status == 0;

    print STDERR "<Processing n='$file' p='UnknownPlug'>\n" if $gli;
    print {$outhandle} "UnknownPlug processing \"$filename\"\n"
        if $self->{'verbosity'} > 1;

    # if there's a leading directory name, eat it...
    $file =~ s/^.*[\/\\]//;

    # create a new document ("doc->new" instead of the indirect-object
    # form "new doc (...)")
    my $doc_obj = doc->new($filename, "indexed_doc");
    $doc_obj->set_OIDtype($processor->{'OIDtype'});

    # All metadata below goes on the top section; look it up once.
    my $section = $doc_obj->get_top_section();

    $doc_obj->add_utf8_metadata($section, "Plugin", "$self->{'plugin_type'}");
    # set the filename as Source metadata to be consistent with other plugins
    $doc_obj->add_metadata($section, "Source", &ghtml::dmsafe($file));
    $doc_obj->add_utf8_metadata($section, "FileSize", (-s $filename));

    # URL metadata (even invalid ones) are used to support internal
    # links, so even if 'file_is_url' is off, still need to store info
    my $web_url = "http://$file";
    $doc_obj->add_metadata($section, "URL", $web_url);

    # associate the file with the document; called as a method, like the
    # other helpers, so it goes through normal method dispatch
    if ($self->associate_unknown_file($filename, $file, $doc_obj) != 1) {
        print STDERR "<ProcessingError n='$file'>\n" if $gli;
        print {$outhandle} "UnknownPlug: couldn't process \"$filename\"\n";
        return -1;    # error during processing
    }

    # create a dummy text string so we don't break downstream plugins
    my $text = &gsprintf::lookup_string("{BasPlug.dummy_text}");

    # include any metadata passed in from previous plugins
    $self->extra_metadata($doc_obj, $section, $metadata);

    $self->title_fallback($doc_obj, $section, $file);

    # do plugin specific processing of doc_obj
    unless (defined($self->process(\$text, $pluginfo, $base_dir, $file, $metadata, $doc_obj))) {
        print STDERR "<ProcessingError n='$file'>\n" if $gli;
        return -1;
    }

    # do any automatic metadata extraction
    $self->auto_extract_metadata($doc_obj);

    # add an OID
    $doc_obj->set_OID();
    $doc_obj->add_text($section, $text);

    # process the document
    $processor->process($doc_obj);

    $self->{'num_processed'}++;
    return 1;
}
243
244
245# UnknownPlug processing of doc_obj. In practice we don't need to do
246# anything here because the read function takes care of everything.
247
# Plugin-specific processing hook called from read().  Nothing is done
# with any of the arguments; the sub always reports success by
# returning 1.
sub process {
    my ($self, $textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj) = @_;

    return 1;
}
255
256
2571;
258
259
260
261
262
263
264
265
266
267
268
Note: See TracBrowser for help on using the repository browser.