source: main/trunk/greenstone2/perllib/plugins/ISISPlugin.pm@ 22597

Last change on this file since 22597 was 20778, checked in by kjdon, 15 years ago

plugins now need to add any auxiliary source files as source assoc files, so we know when to reimport for incremental import. Have started this, but not finished and not tested :-)

  • Property svn:keywords set to Author Date Id Revision
File size: 14.3 KB
Line 
1###########################################################################
2#
3# ISISPlugin.pm -- A plugin for CDS/ISIS databases
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 1999-2004 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package ISISPlugin;
28
29
30use multiread;
31use SplitTextFile;
32
33use strict;
34no strict 'refs'; # allow filehandles to be variables and viceversa
35
# Set up inheritance at compile time: ISISPlugin derives from SplitTextFile.
BEGIN {
    @ISISPlugin::ISA = qw(SplitTextFile);
}
40
41
# Argument descriptions for this plugin. The three regexp arguments take
# their default values from the get_default_*_exp subs in this file; the
# two separator arguments are specific to ISISPlugin.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp(),
    },
    {
        'name'      => "block_exp",
        'desc'      => "{BasePlugin.block_exp}",
        'type'      => "regexp",
        'reqd'      => "no",
        'deft'      => get_default_block_exp(),
        'hiddengli' => "yes",
    },
    {
        'name'      => "split_exp",
        'desc'      => "{SplitTextFile.split_exp}",
        'type'      => "regexp",
        'reqd'      => "no",
        'deft'      => get_default_split_exp(),
        'hiddengli' => "yes",
    },

    # The ISIS-specific options: the strings placed between the entries of
    # a field and between the subfields of an entry (see process()).
    {
        'name' => "entry_separator",
        'desc' => "{ISISPlugin.entry_separator}",
        'type' => "string",
        'reqd' => "no",
        'deft' => "<br>",
    },
    {
        'name' => "subfield_separator",
        'desc' => "{ISISPlugin.subfield_separator}",
        'type' => "string",
        'reqd' => "no",
        'deft' => ", ",
    },
];

# Plugin description; new() pushes it onto the shared "OptList".
my $options = {
    'name'     => "ISISPlugin",
    'desc'     => "{ISISPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'explodes' => "yes",
    'args'     => $arguments,
};
80
81
# Default process_exp: matches, case-insensitively, file names that end
# with the suffix ".mst".
sub get_default_process_exp {
    my $mst_suffix_exp = '(?i)(\.mst)$';
    return $mst_suffix_exp;
}
86
87
# Default block_exp: matches, case-insensitively, file names that end with
# the suffix ".fdt" or ".xrf" (the auxiliary files of an ISIS database).
sub get_default_block_exp {
    my $auxiliary_suffix_exp = '(?i)(\.fdt|\.xrf)$';
    return $auxiliary_suffix_exp;
}
93
94
# Default split_exp: the input text is split at lines consisting of
# exactly ten dashes, with either LF or CRLF line endings.
sub get_default_split_exp {
    my $record_separator_exp = '\r?\n----------\r?\n';
    return $record_separator_exp;
}
99
100
# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to SplitTextFile's constructor
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to
#                      its "ArgList" entry and its option description to its
#                      "OptList" entry
#
# Returns the object built by SplitTextFile, re-blessed into $class.
sub new
{
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # When 'info_only' is set the options are left untouched. Otherwise the
    # ISIS plugin does not try to detect the encoding: it assumes ascii
    # unless the user has specified an encoding.
    unless ($self->{'info_only'}) {
        if ($self->{'input_encoding'} eq "auto") {
            $self->{'input_encoding'} = "ascii";
        }
    }

    return bless $self, $class;
}
124
# Block the .fdt and .xrf files that belong to the given .mst file by
# recording each one that exists under $block_hash->{'file_blocks'}.
#
# This approach is a pain on Windows: it blocks xxx.FDT, but if the actual
# file is xxx.fdt there is then a complaint that no plugin can process it.
# For now the plugin has gone back to using block_exp instead.
#
# Parameters:
#   $filename_full_path - path of the .mst file
#   $block_hash         - hash ref whose 'file_blocks' entry is updated
sub store_block_files_tmp {
    my $self = shift(@_);
    my ($filename_full_path, $block_hash) = @_;

    print STDERR "in store block files\n";

    # Work out where the auxiliary files should be
    $self->check_auxiliary_files($filename_full_path);

    foreach my $path_key ('fdt_file_path', 'xrf_file_path') {
        next unless (-e $self->{$path_key});

        print STDERR "$self->{$path_key}\n";
        my $auxiliary_file = $self->{$path_key};
        $block_hash->{'file_blocks'}->{$auxiliary_file} = 1;
    }
}
148
# Work out the paths of the .fdt and .xrf files that accompany an .mst file
# and store them in $self->{'fdt_file_path'} and $self->{'xrf_file_path'}.
#
# For each file the upper-case extension (.FDT / .XRF) is used if a file of
# that name exists; otherwise the lower-case extension is recorded, whether
# or not that file exists. Callers test for existence themselves.
#
# Parameters:
#   $filename - path of the .mst file (the suffix is matched case-insensitively)
sub check_auxiliary_files {
    my $self = shift(@_);
    my ($filename) = @_;

    my ($database_file_path_root) = ($filename =~ /(.*)\.mst$/i);

    foreach my $extension ("fdt", "xrf") {
        my $upper_case_path = $database_file_path_root . "." . uc($extension);
        my $lower_case_path = $database_file_path_root . "." . $extension;

        $self->{$extension . '_file_path'} =
            (-e $upper_case_path) ? $upper_case_path : $lower_case_path;
    }
}
164
165
# Export the records of an ISIS master file as text and load its field
# definition table.
#
# Parameters:
#   $filename - path of the .mst master file
#   $encoding - encoding given to the multiread readers used for both the
#               exported text and the .fdt file
#   $language - not used by this method
#   $textref  - scalar reference that receives the exported text
#
# On success $$textref holds the exported records (leading separator line
# and blank lines removed) and $self->{'fdt_mapping'} holds the parsed field
# definition table. If the .fdt or .xrf file is missing, or the IsisGdl
# export cannot be started, an error is reported and the method returns
# early without reading anything.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;
    my $outhandle = $self->{'outhandle'};

    # Name used in the GLI error messages: whatever follows "import" and one
    # more character (the directory separator) in the path
    my $mst_file_path_relative = $filename;
    $mst_file_path_relative =~ s/^.+import.(.*?)$/$1/;

    # Check the associated .fdt and .xrf files exist
    $self->check_auxiliary_files($filename);

    if (!-e $self->{'fdt_file_path'}) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS FDT file $self->{'fdt_file_path'}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS FDT file " . $self->{'fdt_file_path'} . ".\n";
        return;
    }
    if (!-e $self->{'xrf_file_path'}) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS XRF file $self->{'xrf_file_path'}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS XRF file " . $self->{'xrf_file_path'} . ".\n";
        return;
    }

    # The text to split is exported from the database by the IsisGdl program.
    # NOTE(review): the file name is interpolated into a shell command line;
    # a name containing a double quote (or other shell metacharacters) will
    # break the command -- confirm whether such names can reach this point.
    if (!open(FILE, "IsisGdl \"$filename\" |")) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not run IsisGdl on $filename'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not run IsisGdl on $filename: $!.\n";
        return;
    }

    my $reader = multiread->new();
    $reader->set_handle('ISISPlugin::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);

    close(FILE);

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my %fdt_mapping = &parse_field_definition_table($self->{'fdt_file_path'}, $encoding);
    $self->{'fdt_mapping'} = \%fdt_mapping;

    # Remove the separator line at the start, and any blank lines, so the
    # data is split and processed properly. Both LF and CRLF line endings
    # are accepted (as in the default split_exp), and a run of several blank
    # lines is removed completely rather than only halved.
    $$textref =~ s/^----------\r?\n//;
    $$textref =~ s/\n(?:\r?\n)+/\n/g;
}
208
209
# Turn the text of one exported ISIS record into document metadata and text.
#
# Each line of the form "tag=<number> data=<value>" whose tag is known to
# the field definition table ($self->{'fdt_mapping'}) produces:
#   - <Name>       one value per occurrence of the field (subfields joined
#                  with the subfield_separator option)
#   - <Name>^<x>   one value per subfield introduced by "^x" (x in a-z)
#   - <Name>^*     the first subfield value, for fields defined with subfields
#   - <Name>^all   all occurrences joined with the entry_separator option
# where <Name> is the field name from the table with the characters
# , & # . - / turned into spaces, each word given an upper-case first
# letter, and the spaces removed. Lines that do not have the tag/data form,
# and tags with no (or an empty) name in the table, are skipped.
#
# The document text is an HTML table of field names against "^all" values;
# the raw record is stored as "ISISRawRecord" and "FileFormat" is set to
# "CDS/ISIS". The .fdt and .xrf files are associated with the document as
# source files.
#
# Parameters:
#   $textref - scalar reference to the text of one record
#   $doc_obj - document object that receives the metadata and text
#   $pluginfo, $base_dir, $file, $metadata, $gli - not used by this method
#
# Returns 1 (record was processed successfully).
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    # store the auxiliary files so we know which ones were used
    # (mst file becomes the source file)
    $doc_obj->associate_source_file($self->{'fdt_file_path'});
    $doc_obj->associate_source_file($self->{'xrf_file_path'});

    my $section = $doc_obj->get_top_section();
    my $fdt_mapping = $self->{'fdt_mapping'};
    my $subfield_separator = $self->{'subfield_separator'};
    my $entry_separator = $self->{'entry_separator'};
    my $isis_record_html_metadata_value = "<table cellpadding=\"4\" cellspacing=\"0\">";

    # Process each line of the ISIS record, one at a time
    foreach my $line (split(/\n/, $$textref)) {
        $line =~ s/(\s*)$//; # Remove any nasty whitespace (very important for Windows)

        # Skip anything that is not a tag/data line. Without this check the
        # capture variables would still hold the result of the whitespace
        # substitution above rather than a tag and its data.
        next unless ($line =~ /^tag=(.*) data=(.+)$/);
        my ($tag, $tag_data) = ($1, $2);

        # Convert the tag number into a name, and remove any invalid characters
        my $raw_metadata_name = $fdt_mapping->{$tag}{'name'} || "";
        $raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
        next if ($raw_metadata_name eq "");

        # Metadata field names: title case, then remove spaces
        my $metadata_name = "";
        foreach my $word (split(/\s+/, $raw_metadata_name)) {
            substr($word, 0, 1) =~ tr/a-z/A-Z/;
            $metadata_name .= $word;
        }

        my $has_subfields = ($fdt_mapping->{$tag}{'subfields'} ne "");
        my $is_repeatable = $fdt_mapping->{$tag}{'repeatable'};

        # Repeatable fields hold several occurrences separated by the '%'
        # character; any other field is a single occurrence. Values written
        # as <...> or /.../ are only expanded for non-repeatable fields.
        my @raw_occurrences = ($is_repeatable) ? split(/%/, $tag_data) : ($tag_data);
        my $expand_multiple_values = ($is_repeatable) ? 0 : 1;

        my $all_metadata_name = $metadata_name . "^all";
        my $all_metadata_value = "";
        foreach my $raw_metadata_value (@raw_occurrences) {
            my $metadata_value = &_add_field_occurrence_metadata($doc_obj, $section, $metadata_name,
                                                                 $raw_metadata_value, $has_subfields,
                                                                 $subfield_separator, $expand_multiple_values);

            $all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
            $all_metadata_value .= $metadata_value;
        }

        # Add the "^all" metadata value
        $doc_obj->add_utf8_metadata($section, $all_metadata_name, $all_metadata_value);

        $isis_record_html_metadata_value .= "<tr><td valign=top><nobr><b>" . $fdt_mapping->{$tag}{'name'} . "</b></nobr></td><td valign=top>" . $all_metadata_value . "</td></tr>";
    }

    # Add a reasonably formatted HTML table view of the record as the document text
    $isis_record_html_metadata_value .= "</table>";
    $doc_obj->add_utf8_text($section, $isis_record_html_metadata_value);

    # Add the full raw record as metadata
    my $isis_raw_record_metadata_value = &escape_metadata_value($$textref);
    $doc_obj->add_utf8_metadata($section, "ISISRawRecord", $isis_raw_record_metadata_value);

    # Add FileFormat metadata
    $doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");

    # Record was processed successfully
    return 1;
}


# Private helper for process(): add the metadata for one occurrence of a
# field and return the occurrence's combined (escaped) value.
#
# Parameters:
#   $doc_obj, $section       - where the metadata is added
#   $metadata_name           - metadata name of the field
#   $raw_metadata_value      - raw text of this occurrence
#   $has_subfields           - true if the field is defined with subfields
#   $subfield_separator      - string placed between subfield values
#   $expand_multiple_values  - true to also add "<name>^sub" metadata for
#                              values written as <...> or /.../
sub _add_field_occurrence_metadata
{
    my ($doc_obj, $section, $metadata_name, $raw_metadata_value,
        $has_subfields, $subfield_separator, $expand_multiple_values) = @_;

    my $metadata_value = "";

    # Handle subfields: each pass consumes an optional "^x" specifier and
    # the text up to the next '^'
    while ($raw_metadata_value ne "") {
        # If there is a subfield specifier, parse it off
        my $sub_metadata_name = $metadata_name;
        if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
            $sub_metadata_name .= "^$1";
        }

        # Parse the value off
        $raw_metadata_value =~ s/^([^\^]*)//;
        my $sub_metadata_value = $1;

        if ($expand_multiple_values) {
            my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
            my $tmp_sub_metadata_value = $sub_metadata_value;

            # Deal with the case when multiple values are specified using <...>
            if ($sub_metadata_value =~ /\<(.+)\>/) {
                while ($tmp_sub_metadata_value =~ s/\<(.+?)\>//) {
                    my $sub_sub_metadata_value = $1;
                    $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
                }
            }
            # Deal with the legacy case when multiple values are specified using /.../
            elsif ($sub_metadata_value =~ /\/(.+)\//) {
                while ($tmp_sub_metadata_value =~ s/\/(.+?)\///) {
                    my $sub_sub_metadata_value = $1;
                    $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
                }
            }
        }

        # Escape the metadata value so it appears correctly in the final collection
        $sub_metadata_value = &escape_metadata_value($sub_metadata_value);

        # Only values that had a subfield specifier get their own metadata
        if ($sub_metadata_name ne $metadata_name) {
            $doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
        }

        # If this tag has subfields and this is the first, use the value for the CDS/ISIS ^* field
        if ($has_subfields && $metadata_value eq "") {
            $doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
        }

        $metadata_value .= $subfield_separator unless ($metadata_value eq "");
        $metadata_value .= $sub_metadata_value;
    }

    # Add the metadata value for the whole occurrence
    $doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);

    return $metadata_value;
}
373
374
# Parse an ISIS Field Definition Table (.fdt) file.
#
# Everything up to and including the line "***" is ignored. Each later line
# is cut into three fixed-width columns: the field name (30 characters),
# the subfield letters (20 characters) and the field specifications (the
# next 50 characters). The specifications are split on single spaces: the
# first item is the tag number and the fourth is the repeatable flag.
#
# Parameters:
#   $fdtfilename - path of the .fdt file
#   $encoding    - encoding given to the multiread reader
#
# Returns a hash (as a list) mapping tag number to a hash ref with the keys
# 'name', 'subfields' and 'repeatable'. Dies if the file cannot be opened.
sub parse_field_definition_table
{
    my $fdtfilename = shift(@_);
    my $encoding = shift(@_);

    my %fdtmapping = ();

    # Three-argument open, so the file name is never interpreted as a mode
    open(FDT_FILE, "<", $fdtfilename) || die "Error: Could not open file $fdtfilename.\n";

    # The reader fills in the scalar it is given a reference to (read_file
    # in this plugin passes its $textref the same way). Passing the plain
    # string and reading it back through $$fdtfiletext only worked as a
    # symbolic reference to a global variable.
    my $fdtfiletext = "";
    my $reader = multiread->new();
    $reader->set_handle('ISISPlugin::FDT_FILE');
    $reader->set_encoding($encoding);
    $reader->read_file(\$fdtfiletext);

    close(FDT_FILE);

    my $amongstdefinitions = 0;
    foreach my $fdtfileline (split(/\n/, $fdtfiletext)) {
        $fdtfileline =~ s/(\s*)$//; # Remove any nasty spaces at the end of the lines

        # The definitions start after the "***" line
        if (!$amongstdefinitions) {
            $amongstdefinitions = 1 if ($fdtfileline eq "***");
            next;
        }

        my $fieldname = &unicode::substr($fdtfileline, 0, 30);
        my $fieldsubfields = &unicode::substr($fdtfileline, 30, 20);
        my $fieldspecs = &unicode::substr($fdtfileline, 50, 50);

        # Remove extra spaces
        $fieldname =~ s/(\s*)$//;
        $fieldsubfields =~ s/(\s*)$//;
        $fieldspecs =~ s/(\s*)$//;

        # Map from tag number to metadata field title, subfields, and repeatability
        my @fieldspecitems = split(/ /, $fieldspecs);
        my $fieldtag = $fieldspecitems[0];
        my $fieldrepeatable = $fieldspecitems[3];

        # A line without a tag number (blank or too short) defines nothing
        next if (!defined($fieldtag) || $fieldtag eq "");

        $fdtmapping{$fieldtag} = { 'name' => $fieldname,
                                   'subfields' => $fieldsubfields,
                                   'repeatable' => $fieldrepeatable };
    }

    return %fdtmapping;
}
420
421
# Escape a metadata value: '<' becomes "&lt;", '>' becomes "&gt;" and each
# backslash is doubled. All other characters (including '&') are left as
# they are. Returns the escaped copy; the argument is not modified.
sub escape_metadata_value
{
    my ($value) = @_;

    my %replacement_for = (
        '<'  => '&lt;',
        '>'  => '&gt;',
        '\\' => '\\\\',
    );
    $value =~ s/([<>\\])/$replacement_for{$1}/g;

    return $value;
}
430
431
# Remove the auxiliary files of the database: the paths stored in
# $self->{'fdt_file_path'} and $self->{'xrf_file_path'} are handed to
# util::rm, the FDT file first.
sub clean_up_after_exploding
{
    my ($self) = @_;

    # Delete the FDT and XRF files too
    my ($fdt_file_path, $xrf_file_path) = @{$self}{'fdt_file_path', 'xrf_file_path'};
    &util::rm($fdt_file_path);
    return &util::rm($xrf_file_path);
}
440
441
4421;
Note: See TracBrowser for help on using the repository browser.