source: main/trunk/greenstone2/perllib/plugins/ISISPlugin.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 7 years ago

renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes

  • Property svn:keywords set to Author Date Id Revision
File size: 15.0 KB
RevLine 
[6107]1###########################################################################
2#
[15872]3# ISISPlugin.pm -- A plugin for CDS/ISIS databases
[6107]4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
[7686]9# Copyright 1999-2004 New Zealand Digital Library Project
[6107]10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
[15872]27package ISISPlugin;
[6107]28
[27502]29use Encode;
[6107]30
31use multiread;
[15872]32use SplitTextFile;
[24547]33use MetadataRead;
[28563]34use FileUtils;
[6107]35
[10254]36use strict;
37no strict 'refs'; # allow filehandles to be variables and viceversa
[6107]38
# ISISPlugin inherits from both MetadataRead and SplitTextFile.
# Where the two parents define a method with the same signature, the class
# listed first in @ISA takes precedence.
BEGIN {
    @ISISPlugin::ISA = qw(MetadataRead SplitTextFile);
}
44
45
# Argument descriptions for this plugin. The 'desc' values are lookup keys
# (in braces), not literal text.
my $arguments = [
    # Expressions whose defaults are supplied by the get_default_* subs below
    {
        'name' => 'process_exp',
        'desc' => '{BaseImporter.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name'      => 'block_exp',
        'desc'      => '{BaseImporter.block_exp}',
        'type'      => 'regexp',
        'reqd'      => 'no',
        'deft'      => get_default_block_exp(),
        'hiddengli' => 'yes',
    },
    {
        'name'      => 'split_exp',
        'desc'      => '{SplitTextFile.split_exp}',
        'type'      => 'regexp',
        'reqd'      => 'no',
        'deft'      => get_default_split_exp(),
        'hiddengli' => 'yes',
    },

    # The interesting options
    {
        'name' => 'entry_separator',
        'desc' => '{ISISPlugin.entry_separator}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => '<br>',
    },
    {
        'name' => 'subfield_separator',
        'desc' => '{ISISPlugin.subfield_separator}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => ', ',
    },
];

# Plugin-level description; 'args' points at the argument table above.
my $options = {
    'name'     => 'ISISPlugin',
    'desc'     => '{ISISPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
84
85
# Default process expression: file names ending in ".mst" (case-insensitive).
sub get_default_process_exp {
    my $mst_suffix_exp = '(?i)(\.mst)$';
    return $mst_suffix_exp;
}
90
91
# Default block expression: file names ending in ".fdt" or ".xrf"
# (case-insensitive), i.e. the auxiliary files that accompany a ".mst" file.
sub get_default_block_exp {
    my $aux_suffix_exp = '(?i)(\.fdt|\.xrf)$';
    return $aux_suffix_exp;
}
97
98
# Default split expression: a line consisting of exactly ten dashes, with
# either Unix or Windows line endings around it.
sub get_default_split_exp {
    my $record_separator_exp = '\r?\n----------\r?\n';
    return $record_separator_exp;
}
103
104
# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to SplitTextFile's constructor
#   $hashArgOptLists - hash ref; this plugin's argument and option tables are
#                      appended to its "ArgList" and "OptList" entries
# Returns the object built by SplitTextFile, re-blessed into $class.
sub new
{
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    my $self = bless SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists), $class;

    # When only plugin information is wanted, the options are left untouched.
    unless ($self->{'info_only'}) {
        # The ISIS plugin doesn't care about encoding - it assumes ascii
        # unless the user has specified an encoding explicitly.
        $self->{'input_encoding'} = "ascii" if ($self->{'input_encoding'} eq "auto");
    }

    return $self;
}
128
# Blocks the .fdt and .xrf files that belong to the given .mst file.
#
# History, as recorded by earlier maintainers:
#  - This was a pain on Windows: it blocked xxx.FDT, but if the actual file
#    was xxx.fdt there was a complaint that no plugin could process it, so
#    the plugin went back to using the block expression.
#  - It works now that blocking is case-insensitive on Windows. However, it
#    is a pain for GLI, which will not know what plugin processes the fdt and
#    xrf files; adding them to the process expression causes more problems.
sub store_block_files_tmp {
    my $self = shift;
    my ($filename_full_path, $block_hash) = @_;

    print STDERR "in store block files\n";

    # Fills in $self->{'fdt_file_path'} and $self->{'xrf_file_path'}
    $self->check_auxiliary_files($filename_full_path);

    my $fdt_file = $self->{'fdt_file_path'};
    if (-e $fdt_file) {
        print STDERR "$fdt_file\n";
        $self->block_raw_filename($block_hash, $fdt_file);
    }

    my $xrf_file = $self->{'xrf_file_path'};
    if (-e $xrf_file) {
        print STDERR "$xrf_file\n";
        $self->block_raw_filename($block_hash, $xrf_file);
    }
}
155
# Works out where the .fdt and .xrf companions of a .mst file should be and
# records the result in $self->{'fdt_file_path'} and $self->{'xrf_file_path'}.
# For each companion the upper-case extension is used if such a file exists;
# otherwise the lower-case extension is recorded (whether or not it exists).
sub check_auxiliary_files {
    my ($self, $filename) = @_;

    # Everything before the trailing ".mst" (matched case-insensitively)
    my ($path_root) = ($filename =~ /(.*)\.mst$/i);

    foreach my $extension ('fdt', 'xrf') {
        my $upper_case_path = $path_root . "." . uc($extension);
        $self->{$extension . '_file_path'} =
            (-e $upper_case_path) ? $upper_case_path : $path_root . "." . $extension;
    }
}
171
172
# Exports the records of an ISIS master (.mst) file as text, ready to be
# split into individual records.
#   $filename - path of the .mst file
#   $encoding - encoding handed to the multiread reader (and used again when
#               reading the .fdt file)
#   $language - not used here
#   $textref  - scalar ref that receives the exported text
# Also stores the parsed Field Definition Table in $self->{'fdt_mapping'}.
# Returns early, leaving $$textref untouched, when the .fdt or .xrf file is
# missing or the IsisGdl exporter cannot be started.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;
    my $outhandle = $self->{'outhandle'};

    # Path relative to the import directory, used only in GLI error messages
    my $mst_file_path_relative = $filename;
    $mst_file_path_relative =~ s/^.+import.(.*?)$/$1/;

    # Check the associated .fdt and .xrf files exist
    $self->check_auxiliary_files($filename);

    if (!-e $self->{'fdt_file_path'}) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS FDT file $self->{'fdt_file_path'}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS FDT file " . $self->{'fdt_file_path'} . ".\n";
        return;
    }
    if (!-e $self->{'xrf_file_path'}) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS XRF file $self->{'xrf_file_path'}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS XRF file " . $self->{'xrf_file_path'} . ".\n";
        return;
    }

    # The text to split is exported from the database by the IsisGdl program.
    # NOTE(review): the command goes through the shell with the file name
    # interpolated, so a name containing quotes or shell metacharacters would
    # break (or alter) the command. A list-form pipe open would avoid this but
    # needs confirming on every platform/perl version this plugin must run on.
    if (!open(FILE, "IsisGdl \"$filename\" |")) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not run IsisGdl on $filename'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not run IsisGdl on $filename: $!\n";
        return;
    }

    # The bareword handle is kept because the reader is given it by name
    my $reader = new multiread();
    $reader->set_handle('ISISPlugin::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);

    # Closing a pipe waits for the command; a false result means it failed or
    # exited with a non-zero status (e.g. IsisGdl is not on the PATH).
    if (!close(FILE)) {
        print $outhandle "Warning: IsisGdl did not exit cleanly for $filename (status $?).\n";
    }

    # At this point $$textref is a binary byte string
    # => turn it into a Unicode aware string, so full
    # Unicode aware pattern matching can be used.
    # For instance: 's/\x{0101}//g' or '[[:upper:]]'
    $$textref = decode("utf8", $$textref);

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my %fdt_mapping = &parse_field_definition_table($self->{'fdt_file_path'}, $encoding);
    $self->{'fdt_mapping'} = \%fdt_mapping;

    # Remove the line at the start, and any blank lines, so the data is split and processed properly
    $$textref =~ s/^----------\r?\n//;
    $$textref =~ s/(\r|\n)\n/\n/g;
}
222
223
# Turns the text of one exported ISIS record into metadata and text on
# $doc_obj.
#   $textref - scalar ref to the record text, one "tag=<n> data=<value>" per
#              line
#   $doc_obj - document object that receives the metadata and text
#   ($pluginfo, $base_dir, $file, $metadata and $gli are accepted but unused)
# For every tag found in the Field Definition Table this adds:
#   <Name>       one value per occurrence (subfields joined by the
#                subfield separator)
#   <Name>^<x>   one value per subfield ^x
#   <Name>^*     the first subfield value, when the field declares subfields
#   <Name>^all   all occurrences joined by the entry separator
# It also adds an HTML table of the record as the document text, the raw
# record as "ISISRawRecord" and "CDS/ISIS" as "FileFormat".
# Returns 1.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    # store the auxiliary files so we know which ones were used
    # (mst file becomes the source file)
    $doc_obj->associate_source_file($self->{'fdt_file_path'});
    $doc_obj->associate_source_file($self->{'xrf_file_path'});

    my $section = $doc_obj->get_top_section();
    my $fdt_mapping = $self->{'fdt_mapping'};
    my $subfield_separator = $self->{'subfield_separator'};
    my $entry_separator = $self->{'entry_separator'};
    my $isis_record_html_metadata_value = "<table cellpadding=\"4\" cellspacing=\"0\">";

    # Process each line of the ISIS record, one at a time
    foreach my $line (split(/\n/, $$textref)) {
        $line =~ s/(\s*)$//; # Remove any nasty whitespace (very important for Windows)

        # Skip anything that is not a "tag=... data=..." line. Without this
        # check the capture variables would still hold the result of the
        # substitution above.
        my ($tag, $tag_data) = ($line =~ /^tag=(.*) data=(.+)$/);
        next unless (defined $tag);
        # print STDERR "\nTag: $tag, Data: $tag_data\n";

        # Tags that are not in the Field Definition Table are ignored; looking
        # the entry up first avoids adding empty entries to the shared mapping.
        my $field = $fdt_mapping->{$tag};
        next unless (defined $field);

        # Convert the tag number into a name, and remove any invalid characters
        my $raw_metadata_name = $field->{'name'} || "";
        $raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
        next if ($raw_metadata_name eq "");

        # Metadata field names: title case, then remove spaces
        my $metadata_name = "";
        foreach my $word (split(/\s+/, $raw_metadata_name)) {
            substr($word, 0, 1) =~ tr/a-z/A-Z/;
            $metadata_name .= $word;
        }

        my $has_subfields = (defined $field->{'subfields'} && $field->{'subfields'} ne "");
        my $is_repeatable = $field->{'repeatable'};

        # Repeatable fields hold multiple values separated by the '%'
        # character; other fields hold exactly one value. The <...> and /.../
        # multiple-value forms are only recognised in non-repeatable fields.
        my @raw_metadata_values = $is_repeatable ? split(/%/, $tag_data) : ($tag_data);

        my $all_metadata_name = $metadata_name . "^all";
        my $all_metadata_value = "";
        foreach my $raw_metadata_value (@raw_metadata_values) {
            my $metadata_value = _add_isis_field_value($doc_obj, $section, $metadata_name,
                                                       $raw_metadata_value, $has_subfields,
                                                       $subfield_separator, !$is_repeatable);

            $all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
            $all_metadata_value .= $metadata_value;
        }

        # Add the "^all" metadata value
        # print STDERR "All metadata name: $all_metadata_name, value: $all_metadata_value\n";
        $doc_obj->add_utf8_metadata($section, $all_metadata_name, $all_metadata_value);

        $isis_record_html_metadata_value .= "<tr><td valign=top><nobr><b>" . $field->{'name'} . "</b></nobr></td><td valign=top>" . $all_metadata_value . "</td></tr>";
    }

    # Add a reasonably formatted HTML table view of the record as the document text
    $isis_record_html_metadata_value .= "</table>";
    $doc_obj->add_utf8_text($section, $isis_record_html_metadata_value);

    # Add the full raw record as metadata
    my $isis_raw_record_metadata_value = &escape_metadata_value($$textref);
    $doc_obj->add_utf8_metadata($section, "ISISRawRecord", $isis_raw_record_metadata_value);

    # Add FileFormat metadata
    $doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");

    # Record was processed successfully
    return 1;
}


# Private helper for process(): adds the metadata for ONE occurrence of a
# field and returns the combined (escaped) value of that occurrence.
#   $raw_metadata_value    - the occurrence, possibly made of "^x"-prefixed
#                            subfields
#   $has_subfields         - true when the field declares subfields, in which
#                            case the first value is also stored as <Name>^*
#   $parse_multiple_values - true to also store each <...> (or, failing that,
#                            legacy /.../) part as <subfield name>^sub
sub _add_isis_field_value
{
    my ($doc_obj, $section, $metadata_name, $raw_metadata_value,
        $has_subfields, $subfield_separator, $parse_multiple_values) = @_;

    my $metadata_value = "";

    # Handle subfields
    while ($raw_metadata_value ne "") {
        # If there is a subfield specifier, parse it off
        my $sub_metadata_name = $metadata_name;
        if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
            $sub_metadata_name .= "^$1";
        }

        # Parse the value off (everything up to the next '^')
        $raw_metadata_value =~ s/^([^\^]*)//;
        my $sub_metadata_value = $1;

        if ($parse_multiple_values) {
            my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
            my $tmp_sub_metadata_value = $sub_metadata_value;

            # Deal with the case when multiple values are specified using <...>
            if ($sub_metadata_value =~ /\<(.+)\>/) {
                while ($tmp_sub_metadata_value =~ s/\<(.+?)\>//) {
                    $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $1);
                }
            }
            # Deal with the legacy case when multiple values are specified using /.../
            elsif ($sub_metadata_value =~ /\/(.+)\//) {
                while ($tmp_sub_metadata_value =~ s/\/(.+?)\///) {
                    $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $1);
                }
            }
        }

        # Escape the metadata value so it appears correctly in the final collection
        $sub_metadata_value = escape_metadata_value($sub_metadata_value);

        # print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
        if ($sub_metadata_name ne $metadata_name) {
            $doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
        }

        # If this tag has subfields and this is the first, use the value for the CDS/ISIS ^* field
        if ($has_subfields && $metadata_value eq "") {
            $doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
        }

        $metadata_value .= $subfield_separator unless ($metadata_value eq "");
        $metadata_value .= $sub_metadata_value;
    }

    # Add the metadata value
    # print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
    $doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);

    return $metadata_value;
}
387
388
# Parses an ISIS Field Definition Table (.fdt) file.
#   $fdtfilename - path of the .fdt file
#   $encoding    - encoding handed to the multiread reader
# Returns a hash (as a list) mapping each tag number to a hash ref with:
#   'name'       - field name (columns 0-29 of the definition line)
#   'subfields'  - subfield letters (columns 30-49)
#   'repeatable' - fourth space-separated item of the specification
#                  (columns 50-99)
# Only lines after the "***" marker line are treated as definitions.
# Dies if the file cannot be opened.
sub parse_field_definition_table
{
    my $fdtfilename = shift(@_);
    my $encoding = shift(@_);

    my %fdtmapping = ();

    # Three-argument open so the file name is never interpreted as a mode.
    # The bareword handle is kept because the reader is given it by name.
    open(FDT_FILE, "<", $fdtfilename) || die "Error: Could not open file $fdtfilename.\n";

    # Hand the reader a real reference to the buffer. Passing the plain
    # string only worked through a symbolic reference to a global variable.
    my $fdtfiletext = "";
    my $reader = multiread->new();
    $reader->set_handle('ISISPlugin::FDT_FILE');
    $reader->set_encoding($encoding);
    $reader->read_file(\$fdtfiletext);

    close(FDT_FILE);

    my $amongstdefinitions = 0;
    foreach my $fdtfileline (split(/\n/, $fdtfiletext)) {
        $fdtfileline =~ s/(\s*)$//; # Remove any nasty spaces at the end of the lines

        if ($amongstdefinitions) {
            my $fieldname = &unicode::substr($fdtfileline, 0, 30);
            my $fieldsubfields = &unicode::substr($fdtfileline, 30, 20);
            my $fieldspecs = &unicode::substr($fdtfileline, 50, 50);

            # Remove extra spaces
            $fieldname =~ s/(\s*)$//;
            $fieldsubfields =~ s/(\s*)$//;
            $fieldspecs =~ s/(\s*)$//;

            # Map from tag number to metadata field title, subfields, and repeatability
            my ($fieldtag, $fieldrepeatable) = (split(/ /, $fieldspecs))[0, 3];

            # Blank or truncated lines carry no tag number; there is nothing to map
            next unless (defined $fieldtag && $fieldtag ne "");

            $fdtmapping{$fieldtag} = { 'name' => $fieldname,
                                       'subfields' => $fieldsubfields,
                                       'repeatable' => $fieldrepeatable };
        }
        elsif ($fdtfileline eq "***") {
            $amongstdefinitions = 1;
        }
    }

    return %fdtmapping;
}
434
435
# Returns a copy of the given value with '<' and '>' replaced by the HTML
# entities &lt; and &gt;, and every backslash doubled.
sub escape_metadata_value
{
    my ($value) = @_;

    my %replacement_for = (
        '<'  => '&lt;',
        '>'  => '&gt;',
        '\\' => '\\\\',
    );

    # None of the replacements contains a character that is itself escaped,
    # so one pass gives the same result as three separate substitutions.
    $value =~ s/([<>\\])/$replacement_for{$1}/g;

    return $value;
}
444
445
# Removes the FDT and XRF files recorded on this object (in that order),
# using the paths stored in 'fdt_file_path' and 'xrf_file_path'.
sub clean_up_after_exploding
{
    my ($self) = @_;

    my ($fdt_path, $xrf_path) = @{$self}{'fdt_file_path', 'xrf_file_path'};

    FileUtils::removeFiles($fdt_path);
    return FileUtils::removeFiles($xrf_path);
}
454
455
[6107]4561;
Note: See TracBrowser for help on using the repository browser.