source: gsdl/trunk/perllib/plugins/ISISPlugin.pm@ 15872

Last change on this file since 15872 was 15872, checked in by kjdon, 16 years ago

plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Alos, I haven't really tested anything yet...

  • Property svn:keywords set to Author Date Id Revision
File size: 13.3 KB
Line 
1###########################################################################
2#
3# ISISPlugin.pm -- A plugin for CDS/ISIS databases
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 1999-2004 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package ISISPlugin;
28
29
30use multiread;
31use SplitTextFile;
32
33use strict;
34no strict 'refs'; # allow filehandles to be variables and viceversa
35
36# ISISPlugin is a sub-class of SplitTextFile.
37sub BEGIN {
38 @ISISPlugin::ISA = ('SplitTextFile');
39}
40
41
42my $arguments =
43 [ { 'name' => "process_exp",
44 'desc' => "{BasePlugin.process_exp}",
45 'type' => "regexp",
46 'reqd' => "no",
47 'deft' => &get_default_process_exp() },
48 { 'name' => "block_exp",
49 'desc' => "{BasePlugin.block_exp}",
50 'type' => "regexp",
51 'reqd' => "no",
52 'deft' => &get_default_block_exp(),
53 'hiddengli' => "yes" },
54 { 'name' => "split_exp",
55 'desc' => "{SplitTextFile.split_exp}",
56 'type' => "regexp",
57 'reqd' => "no",
58 'deft' => &get_default_split_exp(),
59 'hiddengli' => "yes" },
60
61 # The interesting options
62 { 'name' => "entry_separator",
63 'desc' => "{ISISPlugin.entry_separator}",
64 'type' => "string",
65 'reqd' => "no",
66 'deft' => "<br>" },
67 { 'name' => "subfield_separator",
68 'desc' => "{ISISPlugin.subfield_separator}",
69 'type' => "string",
70 'reqd' => "no",
71 'deft' => ", " }
72 ];
73
74my $options = { 'name' => "ISISPlugin",
75 'desc' => "{ISISPlugin.desc}",
76 'abstract' => "no",
77 'inherits' => "yes",
78 'explodes' => "yes",
79 'args' => $arguments };
80
81
82# This plugin processes files with the suffix ".mst"
83sub get_default_process_exp {
84 return q^(?i)(\.mst)$^;
85}
86
87
88# This plugin blocks files with the suffix ".fdt" and ".xrf"
89sub get_default_block_exp {
90 return q^(?i)(\.fdt|\.xrf)$^;
91}
92
93
94# This plugin splits the input text at the "----------" lines
95sub get_default_split_exp {
96 return q^\r?\n----------\r?\n^;
97}
98
99
100sub new
101{
102 my ($class) = shift (@_);
103 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
104 push(@$pluginlist, $class);
105
106 push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
107 push(@{$hashArgOptLists->{"OptList"}},$options);
108
109 my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
110
111 if ($self->{'info_only'}) {
112 # don't worry about any options etc
113 return bless $self, $class;
114 }
115
116 # isis plug doesn't care about encoding - it assumes ascii unless the user
117 # has specified an encoding
118 if ($self->{'input_encoding'} eq "auto") {
119 $self->{'input_encoding'} = "ascii";
120 }
121 return bless $self, $class;
122}
123
124
125sub read_file
126{
127 my $self = shift (@_);
128 my ($filename, $encoding, $language, $textref) = @_;
129 my $outhandle = $self->{'outhandle'};
130
131 my ($database_file_path_root) = ($filename =~ /(.*)\.mst$/i);
132 my $mst_file_path_relative = $filename;
133 $mst_file_path_relative =~ s/^.+import.(.*?)$/$1/;
134
135 # Check the associated .fdt and .xrf files exist
136 $self->{'fdt_file_path'} = $database_file_path_root . ".FDT";
137 if (!-e $self->{'fdt_file_path'}) {
138 $self->{'fdt_file_path'} = $database_file_path_root . ".fdt";
139 }
140 if (!-e $self->{'fdt_file_path'}) {
141 print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS FDT file $self->{'fdt_file_path'}'>\n" if ($self->{'gli'});
142 print $outhandle "Error: Could not find ISIS FDT file " . $self->{'fdt_file_path'} . ".\n";
143 return;
144 }
145 $self->{'xrf_file_path'} = $database_file_path_root . ".XRF";
146 if (!-e $self->{'xrf_file_path'}) {
147 $self->{'xrf_file_path'} = $database_file_path_root . ".xrf";
148 }
149 if (!-e $self->{'xrf_file_path'}) {
150 print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS XRF file $self->{'xrf_file_path'}'>\n" if ($self->{'gli'});
151 print $outhandle "Error: Could not find ISIS XRF file " . $self->{'xrf_file_path'} . ".\n";
152 return;
153 }
154
155 # The text to split is exported from the database by the IsisGdl program
156 open(FILE, "IsisGdl \"$filename\" |");
157
158 my $reader = new multiread();
159 $reader->set_handle('ISISPlugin::FILE');
160 $reader->set_encoding($encoding);
161 $reader->read_file($textref);
162
163 close(FILE);
164
165 # Parse the associated ISIS database Field Definition Table file (.fdt)
166 my %fdt_mapping = &parse_field_definition_table($self->{'fdt_file_path'}, $encoding);
167 $self->{'fdt_mapping'} = \%fdt_mapping;
168
169 # Remove the line at the start, and any blank lines, so the data is split and processed properly
170 $$textref =~ s/^----------\n//;
171 $$textref =~ s/\n\n/\n/g;
172}
173
174
175sub process
176{
177 my $self = shift (@_);
178 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
179 my $outhandle = $self->{'outhandle'};
180
181 my $section = $doc_obj->get_top_section();
182 my $fdt_mapping = $self->{'fdt_mapping'};
183 my $subfield_separator = $self->{'subfield_separator'};
184 my $entry_separator = $self->{'entry_separator'};
185 my $isis_record_html_metadata_value = "<table cellpadding=\"4\" cellspacing=\"0\">";
186
187 # Report that we're processing the file
188 print STDERR "\n<Processing n='$file' p='ISISPlugin'>\n" if ($gli);
189 print $outhandle "IsisPlug: processing $file\n" if ($self->{'verbosity'}) > 1;
190
191 # Process each line of the ISIS record, one at a time
192 foreach my $line (split(/\n/, $$textref)) {
193 $line =~ s/(\s*)$//; # Remove any nasty whitespace (very important for Windows)
194 $line =~ /^tag=(.*) data=(.+)$/;
195 my $tag = $1;
196 my $tag_data = $2;
197 # print STDERR "\nTag: $tag, Data: $tag_data\n";
198
199 # Convert the tag number into a name, and remove any invalid characters
200 my $raw_metadata_name = $fdt_mapping->{$tag}{'name'} || "";
201 $raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
202 next if ($raw_metadata_name eq "");
203
204 # Metadata field names: title case, then remove spaces
205 my $metadata_name = "";
206 foreach my $word (split(/\s+/, $raw_metadata_name)) {
207 substr($word, 0, 1) =~ tr/a-z/A-Z/;
208 $metadata_name .= $word;
209 }
210
211 my $all_metadata_name = $metadata_name . "^all";
212 my $all_metadata_value = "";
213
214 # Handle repeatable fields
215 if ($fdt_mapping->{$tag}{'repeatable'}) {
216 # Multiple values are separated using the '%' character
217 foreach my $raw_metadata_value (split(/%/, $tag_data)) {
218 my $metadata_value = "";
219
220 # Handle subfields
221 while ($raw_metadata_value ne "") {
222 # If there is a subfield specifier, parse it off
223 my $sub_metadata_name = $metadata_name;
224 if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
225 $sub_metadata_name .= "^$1";
226 }
227
228 # Parse the value off and add it as metadata
229 $raw_metadata_value =~ s/^([^\^]*)//;
230 my $sub_metadata_value = &escape_metadata_value($1);
231
232 # print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
233 if ($sub_metadata_name ne $metadata_name) {
234 $doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
235 }
236
237 # If this tag has subfields and this is the first, use the value for the CDS/ISIS ^* field
238 if ($fdt_mapping->{$tag}{'subfields'} ne "" && $metadata_value eq "") {
239 $doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
240 }
241
242 $metadata_value .= $subfield_separator unless ($metadata_value eq "");
243 $metadata_value .= $sub_metadata_value;
244 }
245
246 # Add the metadata value
247 # print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
248 $doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);
249
250 $all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
251 $all_metadata_value .= $metadata_value;
252 }
253 }
254
255 # Handle non-repeatable fields
256 else {
257 my $raw_metadata_value = $tag_data;
258 my $metadata_value = "";
259
260 # Handle subfields
261 while ($raw_metadata_value ne "") {
262 # If there is a subfield specifier, parse it off
263 my $sub_metadata_name = $metadata_name;
264 if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
265 $sub_metadata_name .= "^$1";
266 }
267
268 # Parse the value off and add it as metadata
269 $raw_metadata_value =~ s/^([^\^]*)//;
270 my $sub_metadata_value = $1;
271
272 # Deal with the case when multiple values are specified using <...>
273 if ($sub_metadata_value =~ /\<(.+)\>/) {
274 my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
275 my $tmp_sub_metadata_value = $sub_metadata_value;
276 while ($tmp_sub_metadata_value =~ s/\<(.+?)\>//) {
277 my $sub_sub_metadata_value = $1;
278 $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
279 }
280 }
281 # Deal with the legacy case when multiple values are specified using /.../
282 elsif ($sub_metadata_value =~ /\/(.+)\//) {
283 my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
284 my $tmp_sub_metadata_value = $sub_metadata_value;
285 while ($tmp_sub_metadata_value =~ s/\/(.+?)\///) {
286 my $sub_sub_metadata_value = $1;
287 $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
288 }
289 }
290
291 # Escape the metadata value so it appears correctly in the final collection
292 $sub_metadata_value = &escape_metadata_value($sub_metadata_value);
293
294 # print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
295 if ($sub_metadata_name ne $metadata_name) {
296 $doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
297 }
298
299 # If this tag has subfields and this is the first, use the value for the CDS/ISIS ^* field
300 if ($fdt_mapping->{$tag}{'subfields'} ne "" && $metadata_value eq "") {
301 $doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
302 }
303
304 $metadata_value .= $subfield_separator unless ($metadata_value eq "");
305 $metadata_value .= $sub_metadata_value;
306 }
307
308 # Add the metadata value
309 # print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
310 $doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);
311
312 $all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
313 $all_metadata_value .= $metadata_value;
314 }
315
316 # Add the "^all" metadata value
317 # print STDERR "All metadata name: $all_metadata_name, value: $all_metadata_value\n";
318 $doc_obj->add_utf8_metadata($section, $all_metadata_name, $all_metadata_value);
319
320 $isis_record_html_metadata_value .= "<tr><td valign=top><nobr><b>" . $fdt_mapping->{$tag}{'name'} . "</b></nobr></td><td valign=top>" . $all_metadata_value . "</td></tr>";
321 }
322
323 # Add a reasonably formatted HTML table view of the record as the document text
324 $isis_record_html_metadata_value .= "</table>";
325 $doc_obj->add_utf8_text($section, $isis_record_html_metadata_value);
326
327 # Add the full raw record as metadata
328 my $isis_raw_record_metadata_value = &escape_metadata_value($$textref);
329 $doc_obj->add_utf8_metadata($section, "ISISRawRecord", $isis_raw_record_metadata_value);
330
331 # Add FileFormat metadata
332 $doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");
333
334 # Record was processed successfully
335 return 1;
336}
337
338
339sub parse_field_definition_table
340{
341 my $fdtfilename = shift(@_);
342 my $encoding = shift(@_);
343
344 my %fdtmapping = ();
345
346 open(FDT_FILE, "<$fdtfilename") || die "Error: Could not open file $fdtfilename.\n";
347
348 my $fdtfiletext = "";
349 my $reader = new multiread();
350 $reader->set_handle('ISISPlugin::FDT_FILE');
351 $reader->set_encoding($encoding);
352 $reader->read_file($fdtfiletext);
353
354 my $amongstdefinitions = 0;
355 foreach my $fdtfileline (split(/\n/, $$fdtfiletext)) {
356 $fdtfileline =~ s/(\s*)$//; # Remove any nasty spaces at the end of the lines
357
358 if ($amongstdefinitions) {
359 my $fieldname = &unicode::substr($fdtfileline, 0, 30);
360 my $fieldsubfields = &unicode::substr($fdtfileline, 30, 20);
361 my $fieldspecs = &unicode::substr($fdtfileline, 50, 50);
362
363 # Remove extra spaces
364 $fieldname =~ s/(\s*)$//;
365 $fieldsubfields =~ s/(\s*)$//;
366 $fieldspecs =~ s/(\s*)$//;
367
368 # Map from tag number to metadata field title, subfields, and repeatability
369 my $fieldtag = (split(/ /, $fieldspecs))[0];
370 my $fieldrepeatable = (split(/ /, $fieldspecs))[3];
371 $fdtmapping{$fieldtag} = { 'name' => $fieldname,
372 'subfields' => $fieldsubfields,
373 'repeatable' => $fieldrepeatable };
374 }
375 elsif ($fdtfileline eq "***") {
376 $amongstdefinitions = 1;
377 }
378 }
379
380 close(FDT_FILE);
381
382 return %fdtmapping;
383}
384
385
386sub escape_metadata_value
387{
388 my $value = shift(@_);
389 $value =~ s/\</&lt;/g;
390 $value =~ s/\>/&gt;/g;
391 $value =~ s/\\/\\\\/g;
392 return $value;
393}
394
395
396sub clean_up_after_exploding
397{
398 my $self = shift(@_);
399
400 # Delete the FDT and XRF files too
401 &util::rm($self->{'fdt_file_path'});
402 &util::rm($self->{'xrf_file_path'});
403}
404
405
4061;
Note: See TracBrowser for help on using the repository browser.