source: trunk/gsdl/perllib/plugins/ISISPlug.pm@ 12705

Last change on this file since 12705 was 12705, checked in by mdewsnip, 18 years ago

Now escapes backslash characters in metadata values so these appear correctly in the final collection.

  • Property svn:keywords set to Author Date Id Revision
File size: 12.8 KB
Line 
1###########################################################################
2#
3# ISISPlug.pm -- A plugin for CDS/ISIS databases
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 1999-2004 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package ISISPlug;
28
29
30use multiread;
31use SplitPlug;
32
33use strict;
34no strict 'refs'; # allow filehandles to be variables and viceversa
35
36# ISISPlug is a sub-class of SplitPlug.
37sub BEGIN {
38 @ISISPlug::ISA = ('SplitPlug');
39}
40
41
42my $arguments =
43 [ { 'name' => "process_exp",
44 'desc' => "{BasPlug.process_exp}",
45 'type' => "regexp",
46 'reqd' => "no",
47 'deft' => &get_default_process_exp() },
48 { 'name' => "block_exp",
49 'desc' => "{BasPlug.block_exp}",
50 'type' => "regexp",
51 'reqd' => "no",
52 'deft' => &get_default_block_exp(),
53 'hiddengli' => "yes" },
54 { 'name' => "split_exp",
55 'desc' => "{SplitPlug.split_exp}",
56 'type' => "regexp",
57 'reqd' => "no",
58 'deft' => &get_default_split_exp(),
59 'hiddengli' => "yes" },
60
61 # The interesting options
62 { 'name' => "entry_separator",
63 'desc' => "{ISISPlug.entry_separator}",
64 'type' => "string",
65 'reqd' => "no",
66 'deft' => "<br>" },
67 { 'name' => "subfield_separator",
68 'desc' => "{ISISPlug.subfield_separator}",
69 'type' => "string",
70 'reqd' => "no",
71 'deft' => ", " }
72 ];
73
74my $options = { 'name' => "ISISPlug",
75 'desc' => "{ISISPlug.desc}",
76 'abstract' => "no",
77 'inherits' => "yes",
78 'explodes' => "yes",
79 'args' => $arguments };
80
81
82# This plugin processes files with the suffix ".mst"
83sub get_default_process_exp {
84 return q^(?i)(\.mst)$^;
85}
86
87
88# This plugin blocks files with the suffix ".fdt" and ".xrf"
89sub get_default_block_exp {
90 return q^(?i)(\.fdt|\.xrf)$^;
91}
92
93
94# This plugin splits the input text at the "----------" lines
95sub get_default_split_exp {
96 return q^\r?\n----------\r?\n^;
97}
98
99
100sub new
101{
102 my ($class) = shift (@_);
103 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
104 push(@$pluginlist, $class);
105
106 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
107 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
108
109 my $self = new SplitPlug($pluginlist, $inputargs, $hashArgOptLists);
110
111 return bless $self, $class;
112}
113
114
115sub read_file
116{
117 my $self = shift (@_);
118 my ($filename, $encoding, $language, $textref) = @_;
119 my $outhandle = $self->{'outhandle'};
120
121 my ($database_file_path_root) = ($filename =~ /(.*)\.mst$/i);
122 my $mst_file_path_relative = $filename;
123 $mst_file_path_relative =~ s/^.+import.(.*?)$/$1/;
124
125 # Check the associated .fdt and .xrf files exist
126 # These files must have a lowercase extension for the IsisGdl program to work
127 # Bailing out because of this is kind of crappy but it is only an issue on Unix
128 $self->{'fdt_file_path'} = $database_file_path_root . ".fdt";
129 if (!-e $self->{'fdt_file_path'}) {
130 print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS FDT file $self->{'fdt_file_path'}'>\n" if ($self->{'gli'});
131 print $outhandle "Error: Could not find ISIS FDT file " . $self->{'fdt_file_path'} . ".\n";
132 return;
133 }
134 $self->{'xrf_file_path'} = $database_file_path_root . ".xrf";
135 if (!-e $self->{'xrf_file_path'}) {
136 print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS XRF file $self->{'xrf_file_path'}'>\n" if ($self->{'gli'});
137 print $outhandle "Error: Could not find ISIS XRF file " . $self->{'xrf_file_path'} . ".\n";
138 return;
139 }
140
141 # The text to split is exported from the database by the IsisGdl program
142 open(FILE, "IsisGdl \"$filename\" |");
143
144 my $reader = new multiread();
145 $reader->set_handle('ISISPlug::FILE');
146 $reader->set_encoding($encoding);
147 $reader->read_file($textref);
148
149 close(FILE);
150
151 # Parse the associated ISIS database Field Definition Table file (.fdt)
152 my %fdt_mapping = &parse_field_definition_table($self->{'fdt_file_path'}, $encoding);
153 $self->{'fdt_mapping'} = \%fdt_mapping;
154
155 # Remove the line at the start, and any blank lines, so the data is split and processed properly
156 $$textref =~ s/^----------\n//;
157 $$textref =~ s/\n\n/\n/g;
158}
159
160
161sub process
162{
163 my $self = shift (@_);
164 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
165 my $outhandle = $self->{'outhandle'};
166
167 my $section = $doc_obj->get_top_section();
168 my $fdt_mapping = $self->{'fdt_mapping'};
169 my $subfield_separator = $self->{'subfield_separator'};
170 my $entry_separator = $self->{'entry_separator'};
171 my $isis_record_html_metadata_value = "<table cellpadding=\"4\" cellspacing=\"0\">";
172
173 # Report that we're processing the file
174 print STDERR "\n<Processing n='$file' p='ISISPlug'>\n" if ($gli);
175 print $outhandle "IsisPlug: processing $file\n" if ($self->{'verbosity'}) > 1;
176
177 # Process each line of the ISIS record, one at a time
178 foreach my $line (split(/\n/, $$textref)) {
179 $line =~ s/(\s*)$//; # Remove any nasty whitespace (very important for Windows)
180 $line =~ /^tag=(.*) data=(.+)$/;
181 my $tag = $1;
182 my $tag_data = $2;
183 # print STDERR "\nTag: $tag, Data: $tag_data\n";
184
185 # Convert the tag number into a name, and remove any invalid characters
186 my $raw_metadata_name = $fdt_mapping->{$tag}{'name'} || "";
187 $raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
188 next if ($raw_metadata_name eq "");
189
190 # Metadata field names: title case, then remove spaces
191 my $metadata_name = "";
192 foreach my $word (split(/\s+/, $raw_metadata_name)) {
193 substr($word, 0, 1) =~ tr/a-z/A-Z/;
194 $metadata_name .= $word;
195 }
196
197 my $all_metadata_name = $metadata_name . "^all";
198 my $all_metadata_value = "";
199
200 # Handle repeatable fields
201 if ($fdt_mapping->{$tag}{'repeatable'}) {
202 # Multiple values are separated using the '%' character
203 foreach my $raw_metadata_value (split(/%/, $tag_data)) {
204 my $metadata_value = "";
205
206 # Handle subfields
207 while ($raw_metadata_value ne "") {
208 # If there is a subfield specifier, parse it off
209 my $sub_metadata_name = $metadata_name;
210 if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
211 $sub_metadata_name .= "^$1";
212 }
213
214 # Parse the value off and add it as metadata
215 $raw_metadata_value =~ s/^([^\^]*)//;
216 my $sub_metadata_value = &escape_metadata_value($1);
217
218 # print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
219 if ($sub_metadata_name ne $metadata_name) {
220 $doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
221 }
222
223 # If this is the first subfield then the value is used for the CDS/ISIS ^* field
224 if ($metadata_value eq "") {
225 $doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
226 }
227
228 $metadata_value .= $subfield_separator unless ($metadata_value eq "");
229 $metadata_value .= $sub_metadata_value;
230 }
231
232 # Add the metadata value
233 # print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
234 $doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);
235
236 $all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
237 $all_metadata_value .= $metadata_value;
238 }
239 }
240
241 # Handle non-repeatable fields
242 else {
243 my $raw_metadata_value = $tag_data;
244 my $metadata_value = "";
245
246 # Handle subfields
247 while ($raw_metadata_value ne "") {
248 # If there is a subfield specifier, parse it off
249 my $sub_metadata_name = $metadata_name;
250 if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
251 $sub_metadata_name .= "^$1";
252 }
253
254 # Parse the value off and add it as metadata
255 $raw_metadata_value =~ s/^([^\^]*)//;
256 my $sub_metadata_value = $1;
257
258 # Deal with the case when multiple values are specified using <...>
259 if ($sub_metadata_value =~ /\<(.+)\>/) {
260 my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
261 my $tmp_sub_metadata_value = $sub_metadata_value;
262 while ($tmp_sub_metadata_value =~ s/\<(.+?)\>//) {
263 my $sub_sub_metadata_value = $1;
264 $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
265 }
266 }
267 # Deal with the legacy case when multiple values are specified using /.../
268 elsif ($sub_metadata_value =~ /\/(.+)\//) {
269 my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
270 my $tmp_sub_metadata_value = $sub_metadata_value;
271 while ($tmp_sub_metadata_value =~ s/\/(.+?)\///) {
272 my $sub_sub_metadata_value = $1;
273 $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
274 }
275 }
276
277 # Escape the metadata value so it appears correctly in the final collection
278 $sub_metadata_value = &escape_metadata_value($sub_metadata_value);
279
280 # print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
281 if ($sub_metadata_name ne $metadata_name) {
282 $doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
283 }
284
285 # If this is the first subfield then the value is used for the CDS/ISIS ^* field
286 if ($metadata_value eq "") {
287 $doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
288 }
289
290 $metadata_value .= $subfield_separator unless ($metadata_value eq "");
291 $metadata_value .= $sub_metadata_value;
292 }
293
294 # Add the metadata value
295 # print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
296 $doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);
297
298 $all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
299 $all_metadata_value .= $metadata_value;
300 }
301
302 # Add the "^all" metadata value
303 # print STDERR "All metadata name: $all_metadata_name, value: $all_metadata_value\n";
304 $doc_obj->add_utf8_metadata($section, $all_metadata_name, $all_metadata_value);
305
306 $isis_record_html_metadata_value .= "<tr><td valign=top><nobr><b>" . $fdt_mapping->{$tag}{'name'} . "</b></nobr></td><td valign=top>" . $all_metadata_value . "</td></tr>";
307 }
308
309 # Add a reasonably formatted HTML table view of the record as the document text
310 $isis_record_html_metadata_value .= "</table>";
311 $doc_obj->add_utf8_text($section, $isis_record_html_metadata_value);
312
313 # Add the full raw record as metadata
314 my $isis_raw_record_metadata_value = &escape_metadata_value($$textref);
315 $doc_obj->add_utf8_metadata($section, "ISISRawRecord", $isis_raw_record_metadata_value);
316
317 # Add FileFormat metadata
318 $doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");
319
320 # Record was processed successfully
321 return 1;
322}
323
324
325sub parse_field_definition_table
326{
327 my $fdtfilename = shift(@_);
328 my $encoding = shift(@_);
329
330 my %fdtmapping = ();
331
332 open(FDT_FILE, "<$fdtfilename") || die "Error: Could not open file $fdtfilename.\n";
333
334 my $fdtfiletext = "";
335 my $reader = new multiread();
336 $reader->set_handle('ISISPlug::FDT_FILE');
337 $reader->set_encoding($encoding);
338 $reader->read_file($fdtfiletext);
339
340 my $amongstdefinitions = 0;
341 foreach my $fdtfileline (split(/\n/, $$fdtfiletext)) {
342 $fdtfileline =~ s/(\s*)$//; # Remove any nasty spaces at the end of the lines
343
344 if ($amongstdefinitions) {
345 my $fieldname = substr($fdtfileline, 0, 30);
346 my $fieldsubfields = substr($fdtfileline, 30, 20);
347 my $fieldspecs = substr($fdtfileline, 50);
348
349 # Remove extra spaces
350 $fieldname =~ s/(\s*)$//;
351 $fieldsubfields =~ s/(\s*)$//;
352 $fieldspecs =~ s/(\s*)$//;
353
354 # Map from tag number to metadata field title, subfields, and repeatability
355 my $fieldtag = (split(/ /, $fieldspecs))[0];
356 my $fieldrepeatable = (split(/ /, $fieldspecs))[3];
357 $fdtmapping{$fieldtag} = { 'name' => $fieldname,
358 'subfields' => $fieldsubfields,
359 'repeatable' => $fieldrepeatable };
360 }
361 elsif ($fdtfileline eq "***") {
362 $amongstdefinitions = 1;
363 }
364 }
365
366 close(FDT_FILE);
367
368 return %fdtmapping;
369}
370
371
372sub escape_metadata_value
373{
374 my $value = shift(@_);
375 $value =~ s/\</&lt;/g;
376 $value =~ s/\>/&gt;/g;
377 $value =~ s/\\/\\\\/g;
378 return $value;
379}
380
381
382sub clean_up_after_exploding
383{
384 my $self = shift(@_);
385
386 # Delete the FDT and XRF files too
387 &util::rm($self->{'fdt_file_path'});
388 &util::rm($self->{'xrf_file_path'});
389}
390
391
3921;
Note: See TracBrowser for help on using the repository browser.