source: trunk/gsdl/perllib/plugins/ISISPlug.pm@ 12969

Last change on this file since 12969 was 12833, checked in by kjdon, 18 years ago

We don't want textcat to be run for ISIS files. We assume ASCII encoding unless the user has specified an encoding.

  • Property svn:keywords set to Author Date Id Revision
File size: 13.0 KB
Line 
1###########################################################################
2#
3# ISISPlug.pm -- A plugin for CDS/ISIS databases
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 1999-2004 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package ISISPlug;
28
29
30use multiread;
31use SplitPlug;
32
33use strict;
34no strict 'refs'; # allow filehandles to be variables and viceversa
35
# Register SplitPlug as the parent class of ISISPlug at compile time.
BEGIN {
    @ISISPlug::ISA = qw(SplitPlug);
}
40
41
# Argument descriptions for this plugin. Each entry gives the argument's
# name, a description key, its type, whether it is required, and its
# default value.
my $arguments = [
    # Expressions whose defaults come from the get_default_*_exp subs
    {
        'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name'      => 'block_exp',
        'desc'      => '{BasPlug.block_exp}',
        'type'      => 'regexp',
        'reqd'      => 'no',
        'deft'      => get_default_block_exp(),
        'hiddengli' => 'yes',
    },
    {
        'name'      => 'split_exp',
        'desc'      => '{SplitPlug.split_exp}',
        'type'      => 'regexp',
        'reqd'      => 'no',
        'deft'      => get_default_split_exp(),
        'hiddengli' => 'yes',
    },

    # The interesting options
    {
        'name' => 'entry_separator',
        'desc' => '{ISISPlug.entry_separator}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => '<br>',
    },
    {
        'name' => 'subfield_separator',
        'desc' => '{ISISPlug.subfield_separator}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => ', ',
    },
];
73
# Plugin description record; 'args' points at the argument list above.
my $options = {
    'name'     => 'ISISPlug',
    'desc'     => '{ISISPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
80
81
# Default process expression: file names ending in ".mst" (any case).
sub get_default_process_exp {
    my $mst_suffix_exp = '(?i)(\.mst)$';
    return $mst_suffix_exp;
}
86
87
# Default block expression: file names ending in ".fdt" or ".xrf" (any case).
sub get_default_block_exp {
    my $companion_suffix_exp = '(?i)(\.fdt|\.xrf)$';
    return $companion_suffix_exp;
}
92
93
# Default split expression: a line consisting of exactly ten dashes,
# with either LF or CRLF line endings around it.
sub get_default_split_exp {
    my $record_separator_exp = '\r?\n----------\r?\n';
    return $record_separator_exp;
}
98
99
# Constructor.
#
# Appends this class to $pluginlist, adds this plugin's argument and option
# records to $hashArgOptLists, builds the object through SplitPlug's
# constructor and reblesses it into $class.
sub new
{
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);

    push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments}) if (defined $arguments);
    push(@{ $hashArgOptLists->{"OptList"} }, $options) if (defined $options);

    my $self = SplitPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    # ISIS files are assumed to be ascii unless the user asked for a
    # specific encoding, so "auto" is replaced before it is ever used.
    $self->{'input_encoding'} = "ascii" if ($self->{'input_encoding'} eq "auto");

    return bless($self, $class);
}
118
119
# Export the records of a CDS/ISIS master file (.mst) as text.
#
# Arguments:
#   $filename - path of the .mst file
#   $encoding - encoding given to multiread for the IsisGdl output and for
#               the .fdt file
#   $language - not used in this sub
#   $textref  - reference to the scalar that receives the exported text
#
# Side effects: stores 'fdt_file_path', 'xrf_file_path' and 'fdt_mapping'
# in $self. If the .fdt or .xrf file cannot be found, or the IsisGdl pipe
# cannot be opened, an error is reported and the sub returns early without
# touching $$textref.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;
    my $outhandle = $self->{'outhandle'};

    my ($database_file_path_root) = ($filename =~ /(.*)\.mst$/i);
    my $mst_file_path_relative = $filename;
    $mst_file_path_relative =~ s/^.+import.(.*?)$/$1/;

    # Check the associated .fdt and .xrf files exist (upper-case suffix is
    # tried first, then lower-case)
    $self->{'fdt_file_path'} = $database_file_path_root . ".FDT";
    if (!-e $self->{'fdt_file_path'}) {
        $self->{'fdt_file_path'} = $database_file_path_root . ".fdt";
    }
    if (!-e $self->{'fdt_file_path'}) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS FDT file $self->{'fdt_file_path'}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS FDT file " . $self->{'fdt_file_path'} . ".\n";
        return;
    }
    $self->{'xrf_file_path'} = $database_file_path_root . ".XRF";
    if (!-e $self->{'xrf_file_path'}) {
        $self->{'xrf_file_path'} = $database_file_path_root . ".xrf";
    }
    if (!-e $self->{'xrf_file_path'}) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS XRF file $self->{'xrf_file_path'}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS XRF file " . $self->{'xrf_file_path'} . ".\n";
        return;
    }

    # The text to split is exported from the database by the IsisGdl program.
    # NOTE(review): $filename is interpolated into a shell command line; a
    # file name containing double quotes or other shell metacharacters will
    # break (or alter) the command. Left as is because the list form of a
    # pipe open is not available on every platform/Perl combination.
    if (!open(FILE, "IsisGdl \"$filename\" |")) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not run IsisGdl on $filename'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not run IsisGdl on $filename: $!.\n";
        return;
    }

    # The bareword handle is kept because multiread is given the handle by name
    my $reader = multiread->new();
    $reader->set_handle('ISISPlug::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);

    close(FILE);

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my %fdt_mapping = parse_field_definition_table($self->{'fdt_file_path'}, $encoding);
    $self->{'fdt_mapping'} = \%fdt_mapping;

    # Remove the line at the start, and any blank lines, so the data is split
    # and processed properly. \n{2,} collapses runs of any length; replacing
    # only pairs would leave a blank line behind for three or more newlines.
    $$textref =~ s/^----------\n//;
    $$textref =~ s/\n{2,}/\n/g;
}
168
169
# Turn the text of one exported ISIS record into metadata and document text.
#
# Arguments:
#   $textref  - reference to the record text, one "tag=<tag> data=<value>"
#               entry per line
#   $pluginfo, $base_dir, $metadata - not used in this sub
#   $file     - file name, used only in progress messages
#   $doc_obj  - document object that receives the metadata and text
#   $gli      - when true, a <Processing> marker is written to STDERR
#
# For every line whose tag has a name in the field definition table
# ($self->{'fdt_mapping'}) the following metadata is added to the top section:
#   <Name>^<x>   value of subfield x (for each "^x" subfield specifier)
#   <Name>^*     value of the first subfield of each entry
#   <Name>       all subfield values joined with 'subfield_separator'
#   <Name>^all   all entries joined with 'entry_separator'
# and, for non-repeatable fields, <Name>[^x]^sub for each <...> or /.../
# delimited value. An HTML table of the record becomes the document text.
#
# Returns 1.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    my $section = $doc_obj->get_top_section();
    my $fdt_mapping = $self->{'fdt_mapping'};
    my $subfield_separator = $self->{'subfield_separator'};
    my $entry_separator = $self->{'entry_separator'};
    my $isis_record_html_metadata_value = "<table cellpadding=\"4\" cellspacing=\"0\">";

    # Report that we're processing the file
    print STDERR "\n<Processing n='$file' p='ISISPlug'>\n" if ($gli);
    print $outhandle "IsisPlug: processing $file\n" if ($self->{'verbosity'}) > 1;

    # Process each line of the ISIS record, one at a time
    foreach my $line (split(/\n/, $$textref)) {
        $line =~ s/(\s*)$//; # Remove any nasty whitespace (very important for Windows)

        # Skip anything that is not a "tag=... data=..." line. Without this
        # check $1/$2 would still hold the captures of the whitespace
        # substitution above and be mistaken for a tag and its data.
        next unless ($line =~ /^tag=(.*) data=(.+)$/);
        my $tag = $1;
        my $tag_data = $2;
        # print STDERR "\nTag: $tag, Data: $tag_data\n";

        # Convert the tag number into a name, and remove any invalid characters
        my $raw_metadata_name = $fdt_mapping->{$tag}{'name'} || "";
        $raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
        next if ($raw_metadata_name eq "");

        # Metadata field names: title case, then remove spaces
        my $metadata_name = "";
        foreach my $word (split(/\s+/, $raw_metadata_name)) {
            substr($word, 0, 1) =~ tr/a-z/A-Z/;
            $metadata_name .= $word;
        }

        my $all_metadata_name = $metadata_name . "^all";
        my $all_metadata_value = "";

        # Handle repeatable fields
        if ($fdt_mapping->{$tag}{'repeatable'}) {
            # Multiple values are separated using the '%' character
            foreach my $raw_metadata_value (split(/%/, $tag_data)) {
                my $metadata_value = "";

                # Handle subfields
                while ($raw_metadata_value ne "") {
                    # If there is a subfield specifier, parse it off
                    my $sub_metadata_name = $metadata_name;
                    if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
                        $sub_metadata_name .= "^$1";
                    }

                    # Parse the value off and add it as metadata
                    # (this substitution always matches, so $1 is fresh)
                    $raw_metadata_value =~ s/^([^\^]*)//;
                    my $sub_metadata_value = escape_metadata_value($1);

                    # print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
                    if ($sub_metadata_name ne $metadata_name) {
                        $doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
                    }

                    # If this is the first subfield then the value is used for the CDS/ISIS ^* field
                    if ($metadata_value eq "") {
                        $doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
                    }

                    $metadata_value .= $subfield_separator unless ($metadata_value eq "");
                    $metadata_value .= $sub_metadata_value;
                }

                # Add the metadata value
                # print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
                $doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);

                $all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
                $all_metadata_value .= $metadata_value;
            }
        }

        # Handle non-repeatable fields
        else {
            my $raw_metadata_value = $tag_data;
            my $metadata_value = "";

            # Handle subfields
            while ($raw_metadata_value ne "") {
                # If there is a subfield specifier, parse it off
                my $sub_metadata_name = $metadata_name;
                if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
                    $sub_metadata_name .= "^$1";
                }

                # Parse the value off and add it as metadata
                # (this substitution always matches, so $1 is fresh)
                $raw_metadata_value =~ s/^([^\^]*)//;
                my $sub_metadata_value = $1;

                # Deal with the case when multiple values are specified using <...>
                if ($sub_metadata_value =~ /\<(.+)\>/) {
                    my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
                    my $tmp_sub_metadata_value = $sub_metadata_value;
                    while ($tmp_sub_metadata_value =~ s/\<(.+?)\>//) {
                        my $sub_sub_metadata_value = $1;
                        $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
                    }
                }
                # Deal with the legacy case when multiple values are specified using /.../
                elsif ($sub_metadata_value =~ /\/(.+)\//) {
                    my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
                    my $tmp_sub_metadata_value = $sub_metadata_value;
                    while ($tmp_sub_metadata_value =~ s/\/(.+?)\///) {
                        my $sub_sub_metadata_value = $1;
                        $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
                    }
                }

                # Escape the metadata value so it appears correctly in the final collection
                $sub_metadata_value = escape_metadata_value($sub_metadata_value);

                # print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
                if ($sub_metadata_name ne $metadata_name) {
                    $doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
                }

                # If this is the first subfield then the value is used for the CDS/ISIS ^* field
                if ($metadata_value eq "") {
                    $doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
                }

                $metadata_value .= $subfield_separator unless ($metadata_value eq "");
                $metadata_value .= $sub_metadata_value;
            }

            # Add the metadata value
            # print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
            $doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);

            $all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
            $all_metadata_value .= $metadata_value;
        }

        # Add the "^all" metadata value
        # print STDERR "All metadata name: $all_metadata_name, value: $all_metadata_value\n";
        $doc_obj->add_utf8_metadata($section, $all_metadata_name, $all_metadata_value);

        # One table row per field: the field's name from the FDT, then its values
        $isis_record_html_metadata_value .= "<tr><td valign=top><nobr><b>" . $fdt_mapping->{$tag}{'name'} . "</b></nobr></td><td valign=top>" . $all_metadata_value . "</td></tr>";
    }

    # Add a reasonably formatted HTML table view of the record as the document text
    $isis_record_html_metadata_value .= "</table>";
    $doc_obj->add_utf8_text($section, $isis_record_html_metadata_value);

    # Add the full raw record as metadata
    my $isis_raw_record_metadata_value = escape_metadata_value($$textref);
    $doc_obj->add_utf8_metadata($section, "ISISRawRecord", $isis_raw_record_metadata_value);

    # Add FileFormat metadata
    $doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");

    # Record was processed successfully
    return 1;
}
332
333
# Parse a CDS/ISIS Field Definition Table (.fdt) file.
#
# Arguments:
#   $fdtfilename - path of the .fdt file
#   $encoding    - encoding given to multiread when reading the file
#
# Everything up to and including the first line that is exactly "***" is
# ignored. Each later line is read as fixed-width columns: characters 0-29
# hold the field name, 30-49 the subfields and the remainder the field
# specs, a space-separated list whose first item is the tag and whose
# fourth item is the repeatable flag.
#
# Returns a hash (as a list) mapping each tag to a hash ref with the keys
# 'name', 'subfields' and 'repeatable'. Dies if the file cannot be opened.
sub parse_field_definition_table
{
    my $fdtfilename = shift(@_);
    my $encoding = shift(@_);

    my %fdtmapping = ();

    # Three-argument open so the file name can never be read as an open mode.
    # The bareword handle is kept because multiread is given the handle by name.
    open(FDT_FILE, "<", $fdtfilename) || die "Error: Could not open file $fdtfilename.\n";

    my $fdtfiletext = "";
    my $reader = multiread->new();
    $reader->set_handle('ISISPlug::FDT_FILE');
    $reader->set_encoding($encoding);
    # Pass a real reference, as read_file() in this package does, instead of
    # relying on a symbolic reference through the empty string.
    $reader->read_file(\$fdtfiletext);

    my $amongstdefinitions = 0;
    foreach my $fdtfileline (split(/\n/, $fdtfiletext)) {
        $fdtfileline =~ s/(\s*)$//; # Remove any nasty spaces at the end of the lines

        # Header lines: wait for the "***" marker that starts the definitions
        if (!$amongstdefinitions) {
            $amongstdefinitions = 1 if ($fdtfileline eq "***");
            next;
        }

        # Short lines simply have empty trailing columns
        my $linelength = length($fdtfileline);
        my $fieldname = substr($fdtfileline, 0, 30);
        my $fieldsubfields = ($linelength > 30) ? substr($fdtfileline, 30, 20) : "";
        my $fieldspecs = ($linelength > 50) ? substr($fdtfileline, 50) : "";

        # Remove extra spaces
        $fieldname =~ s/(\s*)$//;
        $fieldsubfields =~ s/(\s*)$//;
        $fieldspecs =~ s/(\s*)$//;

        # Map from tag number to metadata field title, subfields, and repeatability
        my @fieldspecitems = split(/ /, $fieldspecs);
        my ($fieldtag, $fieldrepeatable) = @fieldspecitems[0, 3];

        # A line without a tag (blank or truncated) cannot be looked up, so
        # it must not create an entry under the empty key.
        next if (!defined($fieldtag) || $fieldtag eq "");

        $fdtmapping{$fieldtag} = { 'name' => $fieldname,
                                   'subfields' => $fieldsubfields,
                                   'repeatable' => $fieldrepeatable };
    }

    close(FDT_FILE);

    return %fdtmapping;
}
379
380
# Return a copy of the given value with "<" and ">" replaced by their HTML
# entities and every backslash doubled.
sub escape_metadata_value
{
    my ($text) = @_;

    my %replacement_for = (
        '<'  => '&lt;',
        '>'  => '&gt;',
        '\\' => '\\\\',
    );

    # One pass is enough: no replacement contains a character that is
    # itself escaped by a different rule.
    $text =~ s/([<>\\])/$replacement_for{$1}/g;

    return $text;
}
389
390
# Remove the companion files recorded in $self by read_file: first the
# FDT file, then the XRF file.
sub clean_up_after_exploding
{
    my ($self) = @_;

    # Delete the FDT and XRF files too
    foreach my $path_key ('fdt_file_path', 'xrf_file_path') {
        &util::rm($self->{$path_key});
    }
}
399
400
4011;
Note: See TracBrowser for help on using the repository browser.