source: main/trunk/greenstone2/perllib/plugins/ISISPlugin.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 7 years ago

Renamed EncodingUtil to CommonUtil, and BasePlugin to BaseImporter. The idea is that only top-level plugins that you can specify in your collection get to have "plugin" in their name. Modified all other plugins to reflect these name changes.

  • Property svn:keywords set to Author Date Id Revision
File size: 15.0 KB
Line 
1###########################################################################
2#
3# ISISPlugin.pm -- A plugin for CDS/ISIS databases
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 1999-2004 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package ISISPlugin;
28
29use Encode;
30
31use multiread;
32use SplitTextFile;
33use MetadataRead;
34use FileUtils;
35
36use strict;
37no strict 'refs'; # allow filehandles to be variables and viceversa
38
# ISISPlugin inherits from both MetadataRead and SplitTextFile.
# When both parents provide a method with the same name, the class listed
# first in @ISA is the one whose method is used.
BEGIN {
    @ISISPlugin::ISA = qw(MetadataRead SplitTextFile);
}
44
45
# Argument descriptions for this plugin.  Each entry is a hash giving the
# argument name, a description key, its type, whether it is required and
# its default value.  The three *_exp arguments take their defaults from
# the get_default_*_exp subs defined further down in this file.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BaseImporter.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name'      => 'block_exp',
        'desc'      => '{BaseImporter.block_exp}',
        'type'      => 'regexp',
        'reqd'      => 'no',
        'deft'      => get_default_block_exp(),
        'hiddengli' => 'yes',
    },
    {
        'name'      => 'split_exp',
        'desc'      => '{SplitTextFile.split_exp}',
        'type'      => 'regexp',
        'reqd'      => 'no',
        'deft'      => get_default_split_exp(),
        'hiddengli' => 'yes',
    },

    # The interesting options
    {
        'name' => 'entry_separator',
        'desc' => '{ISISPlugin.entry_separator}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => '<br>',
    },
    {
        'name' => 'subfield_separator',
        'desc' => '{ISISPlugin.subfield_separator}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => ', ',
    },
];

# Plugin-level options; 'args' points at the argument list above.
my $options = {
    'name'     => 'ISISPlugin',
    'desc'     => '{ISISPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
84
85
# Default process expression: a case-insensitive match on file names that
# end in ".mst".
sub get_default_process_exp {
    my $mst_suffix_exp = '(?i)(\.mst)$';
    return $mst_suffix_exp;
}
90
91
# Default block expression: a case-insensitive match on file names that
# end in ".fdt" or ".xrf".
sub get_default_block_exp {
    my $auxiliary_suffix_exp = '(?i)(\.fdt|\.xrf)$';
    return $auxiliary_suffix_exp;
}
97
98
# Default split expression: a line consisting of ten hyphens, with either
# Unix or Windows line endings on both sides.
sub get_default_split_exp {
    my $record_separator_exp = '\r?\n----------\r?\n';
    return $record_separator_exp;
}
103
104
# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to SplitTextFile's constructor
#   $hashArgOptLists - hash ref whose "ArgList" and "OptList" arrays receive
#                      this plugin's argument and option descriptions
# Returns the object built by SplitTextFile, re-blessed into $class.
sub new
{
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}}, @$arguments);
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);
    bless $self, $class;

    # When only plugin information is wanted, skip any option handling
    unless ($self->{'info_only'}) {
        # The ISIS plugin does not try to detect the encoding: it assumes
        # ascii unless the user has specified an encoding
        $self->{'input_encoding'} = "ascii" if ($self->{'input_encoding'} eq "auto");
    }

    return $self;
}
128
# Blocks the .fdt and .xrf files that belong to the given .mst file.
#
# History, as recorded by earlier maintainers:
#  - This was a pain on Windows: it blocked xxx.FDT, but if the actual file
#    was xxx.fdt the build complained that no plugin could process it, so the
#    plugin was put back to using block_exp for the time being.
#  - It works now that blocking on Windows is case insensitive.  However, it
#    is a pain for GLI, which will not know what plugin processes the fdt and
#    xrf files; adding them to the process expression causes more problems.
#
#   $filename_full_path - path of the .mst file
#   $block_hash         - passed through to block_raw_filename
sub store_block_files_tmp {
    my ($self, $filename_full_path, $block_hash) = @_;

    print STDERR "in store block files\n";

    # Fills in $self->{'fdt_file_path'} and $self->{'xrf_file_path'}
    $self->check_auxiliary_files($filename_full_path);

    foreach my $path_key ('fdt_file_path', 'xrf_file_path') {
        my $auxiliary_file = $self->{$path_key};
        next unless (-e $auxiliary_file);

        print STDERR "$auxiliary_file\n";
        $self->block_raw_filename($block_hash, $auxiliary_file);
    }
}
155
# Works out where the .fdt and .xrf files that accompany an .mst file should
# be, and records the paths in $self->{'fdt_file_path'} and
# $self->{'xrf_file_path'}.  For each file the upper-case suffix is used if
# a file with that name exists; otherwise the lower-case suffix is recorded
# (whether or not that file exists).
#
#   $filename - path of the .mst file (suffix matched case-insensitively)
sub check_auxiliary_files {
    my ($self, $filename) = @_;

    # Everything in front of the ".mst" suffix
    my ($path_root) = ($filename =~ /(.*)\.mst$/i);

    foreach my $suffix ('fdt', 'xrf') {
        my $upper_case_path = $path_root . "." . uc($suffix);
        my $lower_case_path = $path_root . "." . $suffix;

        $self->{$suffix . '_file_path'} =
            (-e $upper_case_path) ? $upper_case_path : $lower_case_path;
    }
}
171
172
# Exports the records of an ISIS master file as text by running the external
# IsisGdl program, and loads the matching field definition table.
#
#   $filename - path of the .mst file
#   $encoding - encoding handed to the multiread reader (for both the
#               IsisGdl output and the .fdt file)
#   $language - unused here
#   $textref  - scalar ref that receives the exported text
#
# On success $$textref holds the decoded text with the leading separator
# line and blank lines removed, and $self->{'fdt_mapping'} holds the parsed
# field definition table.  If the .fdt or .xrf file is missing, or IsisGdl
# cannot be started, an error is reported and the sub returns early without
# setting $self->{'fdt_mapping'}.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;
    my $outhandle = $self->{'outhandle'};

    # Path used in the GLI error messages: everything after "import" plus
    # one separator character
    my $mst_file_path_relative = $filename;
    $mst_file_path_relative =~ s/^.+import.(.*?)$/$1/;

    # Check the associated .fdt and .xrf files exist
    $self->check_auxiliary_files($filename);

    if (!-e $self->{'fdt_file_path'}) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS FDT file $self->{'fdt_file_path'}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS FDT file " . $self->{'fdt_file_path'} . ".\n";
        return;
    }
    if (!-e $self->{'xrf_file_path'}) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS XRF file $self->{'xrf_file_path'}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS XRF file " . $self->{'xrf_file_path'} . ".\n";
        return;
    }

    # The text to split is exported from the database by the IsisGdl program.
    # The three-argument form keeps the command string from being parsed
    # for an open mode.
    # NOTE(review): the command still goes through the shell with $filename
    # interpolated inside double quotes, so a file name containing '"', '$'
    # or '`' is not safe here.
    if (!open(FILE, "-|", "IsisGdl \"$filename\"")) {
        my $open_error = $!;
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not run IsisGdl on $filename'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not run IsisGdl on " . $filename . " (" . $open_error . ").\n";
        return;
    }

    my $reader = multiread->new();
    $reader->set_handle('ISISPlugin::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);

    # Closing a pipe waits for the child; a false result means IsisGdl
    # could not be run or exited with a non-zero status
    if (!close(FILE)) {
        print $outhandle "Warning: IsisGdl did not finish cleanly for " . $filename . " (exit status " . $? . ").\n";
    }

    # At this point $$textref is a binary byte string
    # => turn it into a Unicode aware string, so full
    # Unicode aware pattern matching can be used.
    # For instance: 's/\x{0101}//g' or '[[:upper:]]'
    $$textref = decode("utf8", $$textref);

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my %fdt_mapping = parse_field_definition_table($self->{'fdt_file_path'}, $encoding);
    $self->{'fdt_mapping'} = \%fdt_mapping;

    # Remove the line at the start, and any blank lines, so the data is split and processed properly
    $$textref =~ s/^----------\r?\n//;
    $$textref =~ s/(\r|\n)\n/\n/g;
}
222
223
# Turns the text of one ISIS record into metadata and HTML on $doc_obj.
#
# Each line of $$textref is expected to look like "tag=<n> data=<value>".
# Lines that do not have that form, and tags that have no usable name in
# $self->{'fdt_mapping'}, are skipped.  For every remaining tag:
#   - the field name is converted to a metadata name (punctuation removed,
#     words capitalised and joined);
#   - repeatable fields are split into occurrences on '%';
#   - each occurrence is split into subfields (see _add_occurrence_metadata)
#     and added as metadata under the metadata name;
#   - all occurrences joined with entry_separator are added as "<name>^all"
#     and as one row of an HTML table.
# Finally the HTML table becomes the document text, and the raw record and
# a FileFormat value are added as metadata.
#
# Parameters: $textref (scalar ref to the record text) and $doc_obj are used;
# $pluginfo, $base_dir, $file, $metadata and $gli are accepted but not used.
# Returns 1.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    # store the auxiliary files so we know which ones were used
    # (mst file becomes the source file)
    $doc_obj->associate_source_file($self->{'fdt_file_path'});
    $doc_obj->associate_source_file($self->{'xrf_file_path'});

    my $section = $doc_obj->get_top_section();
    my $fdt_mapping = $self->{'fdt_mapping'} || {};
    my $subfield_separator = $self->{'subfield_separator'};
    my $entry_separator = $self->{'entry_separator'};
    my $isis_record_html_metadata_value = "<table cellpadding=\"4\" cellspacing=\"0\">";

    # Process each line of the ISIS record, one at a time
    foreach my $line (split(/\n/, $$textref)) {
        $line =~ s/(\s*)$//; # Remove any nasty whitespace (very important for Windows)

        # Only use the captures when the match succeeded; otherwise $1 and $2
        # would still hold values from the substitution above
        next unless ($line =~ /^tag=(.*) data=(.+)$/);
        my ($tag, $tag_data) = ($1, $2);

        # Look the tag up without creating an entry for unknown tags
        my $field = $fdt_mapping->{$tag};
        next unless (defined $field);

        # Convert the tag number into a name, and remove any invalid characters
        my $raw_metadata_name = $field->{'name'} || "";
        $raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
        next if ($raw_metadata_name eq "");

        # Metadata field names: title case, then remove spaces
        my $metadata_name = "";
        foreach my $word (split(/\s+/, $raw_metadata_name)) {
            substr($word, 0, 1) =~ tr/a-z/A-Z/;
            $metadata_name .= $word;
        }

        my $has_subfields = (defined $field->{'subfields'} && $field->{'subfields'} ne "");
        my $is_repeatable = $field->{'repeatable'};

        # Repeatable fields hold several occurrences separated by '%';
        # a non-repeatable field is a single occurrence
        my @occurrences = $is_repeatable ? split(/%/, $tag_data) : ($tag_data);

        my $all_metadata_name = $metadata_name . "^all";
        my $all_metadata_value = "";
        foreach my $occurrence (@occurrences) {
            # The <...> and /.../ multiple-value forms are only expanded
            # for non-repeatable fields
            my $metadata_value = _add_occurrence_metadata($doc_obj, $section, $metadata_name, $occurrence,
                                                          $has_subfields, $subfield_separator, !$is_repeatable);

            # Add the metadata value
            $doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);

            $all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
            $all_metadata_value .= $metadata_value;
        }

        # Add the "^all" metadata value
        $doc_obj->add_utf8_metadata($section, $all_metadata_name, $all_metadata_value);

        $isis_record_html_metadata_value .= "<tr><td valign=top><nobr><b>" . $field->{'name'} . "</b></nobr></td><td valign=top>" . $all_metadata_value . "</td></tr>";
    }

    # Add a reasonably formatted HTML table view of the record as the document text
    $isis_record_html_metadata_value .= "</table>";
    $doc_obj->add_utf8_text($section, $isis_record_html_metadata_value);

    # Add the full raw record as metadata
    my $isis_raw_record_metadata_value = escape_metadata_value($$textref);
    $doc_obj->add_utf8_metadata($section, "ISISRawRecord", $isis_raw_record_metadata_value);

    # Add FileFormat metadata
    $doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");

    # Record was processed successfully
    return 1;
}


# Private helper for process: splits one occurrence of a field into its
# subfields, adds the per-subfield metadata to $doc_obj, and returns the
# escaped subfield values joined with $subfield_separator.
#
#   $doc_obj, $section       - where the metadata is added
#   $metadata_name           - metadata name of the field
#   $raw_value               - text of this occurrence; subfields are
#                              introduced by '^' followed by a lower-case letter
#   $has_subfields           - true if the field definition lists subfields
#   $subfield_separator      - string placed between subfield values
#   $expand_multiple_values  - true to also add "<subfield name>^sub" metadata
#                              for values written as <...> or /.../
sub _add_occurrence_metadata
{
    my ($doc_obj, $section, $metadata_name, $raw_value,
        $has_subfields, $subfield_separator, $expand_multiple_values) = @_;

    my $metadata_value = "";

    while ($raw_value ne "") {
        # If there is a subfield specifier, parse it off
        my $sub_metadata_name = $metadata_name;
        if ($raw_value =~ s/^\^// && $raw_value =~ s/^([a-z])//) {
            $sub_metadata_name .= "^$1";
        }

        # Parse the value off (everything up to the next '^')
        $raw_value =~ s/^([^\^]*)//;
        my $sub_metadata_value = $1;

        if ($expand_multiple_values) {
            # Multiple values are specified using <...>, or using /.../ in
            # the legacy case.  Only the first form that matches is used.
            foreach my $multi_value_re (qr/\<(.+?)\>/, qr/\/(.+?)\//) {
                next unless ($sub_metadata_value =~ $multi_value_re);

                my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
                my $remaining = $sub_metadata_value;
                while ($remaining =~ s/$multi_value_re//) {
                    my $sub_sub_metadata_value = $1;
                    $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
                }
                last;
            }
        }

        # Escape the metadata value so it appears correctly in the final collection
        $sub_metadata_value = escape_metadata_value($sub_metadata_value);

        if ($sub_metadata_name ne $metadata_name) {
            $doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
        }

        # If this tag has subfields and this is the first, use the value for the CDS/ISIS ^* field
        if ($has_subfields && $metadata_value eq "") {
            $doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
        }

        $metadata_value .= $subfield_separator unless ($metadata_value eq "");
        $metadata_value .= $sub_metadata_value;
    }

    return $metadata_value;
}
387
388
# Parses an ISIS Field Definition Table (.fdt) file.
#
#   $fdtfilename - path of the .fdt file
#   $encoding    - encoding handed to the multiread reader
#
# Lines before the "***" marker are ignored.  Each later line is cut into
# three fixed-width columns: name (30 characters), subfields (20) and
# specification (50).  The specification is split on single spaces; its
# first item is the tag and its fourth item is the repeatable flag.
#
# Returns a hash (as a list) mapping each tag to a hash ref with the keys
# 'name', 'subfields' and 'repeatable'.  Lines without a tag are skipped.
# Dies if the file cannot be opened.
sub parse_field_definition_table
{
    my $fdtfilename = shift(@_);
    my $encoding = shift(@_);

    my %fdtmapping = ();

    open(FDT_FILE, "<", $fdtfilename) || die "Error: Could not open file $fdtfilename ($!).\n";

    # read_file is given a real reference, as read_file in this plugin does
    # with $textref, rather than relying on a symbolic reference
    my $fdtfiletext = "";
    my $reader = multiread->new();
    $reader->set_handle('ISISPlugin::FDT_FILE');
    $reader->set_encoding($encoding);
    $reader->read_file(\$fdtfiletext);

    close(FDT_FILE);

    my $amongstdefinitions = 0;
    foreach my $fdtfileline (split(/\n/, $fdtfiletext)) {
        $fdtfileline =~ s/(\s*)$//; # Remove any nasty spaces at the end of the lines

        if (!$amongstdefinitions) {
            # The field definitions start after the "***" line
            $amongstdefinitions = 1 if ($fdtfileline eq "***");
            next;
        }

        my $fieldname = &unicode::substr($fdtfileline, 0, 30);
        my $fieldsubfields = &unicode::substr($fdtfileline, 30, 20);
        my $fieldspecs = &unicode::substr($fdtfileline, 50, 50);

        # Remove extra spaces
        $fieldname =~ s/(\s*)$//;
        $fieldsubfields =~ s/(\s*)$//;
        $fieldspecs =~ s/(\s*)$//;

        # Map from tag number to metadata field title, subfields, and repeatability
        my ($fieldtag, $fieldrepeatable) = (split(/ /, $fieldspecs))[0, 3];

        # A line with no tag (blank or too short) cannot describe a field
        next unless (defined $fieldtag && $fieldtag ne "");

        $fdtmapping{$fieldtag} = { 'name' => $fieldname,
                                   'subfields' => $fieldsubfields,
                                   'repeatable' => $fieldrepeatable };
    }

    return %fdtmapping;
}
434
435
# Returns a copy of the given string with '<' replaced by "&lt;", '>' by
# "&gt;", and every backslash doubled.  No other characters are changed.
sub escape_metadata_value
{
    my ($text) = @_;

    my %replacement_for = (
        '<'  => '&lt;',
        '>'  => '&gt;',
        '\\' => '\\\\',
    );
    $text =~ s/([<>\\])/$replacement_for{$1}/g;

    return $text;
}
444
445
# Passes the recorded FDT and XRF file paths to FileUtils::removeFiles,
# FDT first.
sub clean_up_after_exploding
{
    my ($self) = @_;

    my ($fdt_path, $xrf_path) = @{$self}{'fdt_file_path', 'xrf_file_path'};

    # Delete the FDT and XRF files too
    FileUtils::removeFiles($fdt_path);
    return FileUtils::removeFiles($xrf_path);
}
454
455
4561;
Note: See TracBrowser for help on using the repository browser.