source: main/trunk/greenstone2/perllib/plugins/ISISPlugin.pm@ 24547

Last change on this file since 24547 was 24547, checked in by ak19, 13 years ago

Added new abstract plugin MetadataRead that defines can_process_this_file_for_metadata that MetadataPlugin subclasses can inherit (if MetadataRead is listed first in the ISA inheritance list) and which will then override the one defined in BasePlugin. For now committing MARC, ISIS and OAIPlugins which now additionally inherit from MetadataRead. Other metadataPlugins also need to be committed.

  • Property svn:keywords set to Author Date Id Revision
File size: 14.7 KB
RevLine 
[6107]1###########################################################################
2#
[15872]3# ISISPlugin.pm -- A plugin for CDS/ISIS databases
[6107]4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
[7686]9# Copyright 1999-2004 New Zealand Digital Library Project
[6107]10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
[15872]27package ISISPlugin;
[6107]28
29
30use multiread;
[15872]31use SplitTextFile;
[24547]32use MetadataRead;
[6107]33
[10254]34use strict;
35no strict 'refs'; # allow filehandles to be variables and viceversa
[6107]36
# ISISPlugin inherits from both MetadataRead and SplitTextFile.
# Method lookup follows the order of the ISA list, so when both parents
# define a method with the same name the MetadataRead version is used.
BEGIN {
    @ISISPlugin::ISA = qw(MetadataRead SplitTextFile);
}
42
43
# Argument descriptions for this plugin. Each entry is a hash giving the
# option name, a description key, the value type, whether it is required
# and its default value.
my $arguments = do {
    # Options that carry plugin-specific defaults for the expressions
    # described under BasePlugin and SplitTextFile.
    my @expression_args = (
        {
            'name' => 'process_exp',
            'desc' => '{BasePlugin.process_exp}',
            'type' => 'regexp',
            'reqd' => 'no',
            'deft' => get_default_process_exp(),
        },
        {
            'name'      => 'block_exp',
            'desc'      => '{BasePlugin.block_exp}',
            'type'      => 'regexp',
            'reqd'      => 'no',
            'deft'      => get_default_block_exp(),
            'hiddengli' => 'yes',
        },
        {
            'name'      => 'split_exp',
            'desc'      => '{SplitTextFile.split_exp}',
            'type'      => 'regexp',
            'reqd'      => 'no',
            'deft'      => get_default_split_exp(),
            'hiddengli' => 'yes',
        },
    );

    # The interesting options: the separators used when joining values.
    my @separator_args = (
        {
            'name' => 'entry_separator',
            'desc' => '{ISISPlugin.entry_separator}',
            'type' => 'string',
            'reqd' => 'no',
            'deft' => '<br>',
        },
        {
            'name' => 'subfield_separator',
            'desc' => '{ISISPlugin.subfield_separator}',
            'type' => 'string',
            'reqd' => 'no',
            'deft' => ', ',
        },
    );

    [ @expression_args, @separator_args ];
};
[6107]75
# Plugin description record; 'args' points at the argument list above.
my $options = {
    'name'     => 'ISISPlugin',
    'desc'     => '{ISISPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
82
83
# Default process expression: matches file names ending in ".mst"
# (case-insensitively).
sub get_default_process_exp {
    my $mst_suffix_exp = '(?i)(\.mst)$';
    return $mst_suffix_exp;
}
88
89
# Default block expression: matches file names ending in ".fdt" or ".xrf"
# (case-insensitively).
sub get_default_block_exp {
    my $aux_suffix_exp = '(?i)(\.fdt|\.xrf)$';
    return $aux_suffix_exp;
}
95
96
# Default split expression: a line consisting of ten dashes, with either
# LF or CRLF line endings around it.
sub get_default_split_exp {
    my $record_separator_exp = '\r?\n----------\r?\n';
    return $record_separator_exp;
}
101
102
# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through to the SplitTextFile constructor
#   $hashArgOptLists - hash ref; this plugin's arguments and options are
#                      appended to its "ArgList" and "OptList" entries
# Returns the object created by SplitTextFile, blessed into $class.
sub new
{
    my $class = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # When only plugin information is wanted, no option handling is needed.
    unless ($self->{'info_only'}) {
        # The ISIS plugin doesn't care about encoding: it assumes ascii
        # unless the user has specified an encoding.
        if ($self->{'input_encoding'} eq "auto") {
            $self->{'input_encoding'} = "ascii";
        }
    }

    return bless $self, $class;
}
126
# Blocks the .fdt and .xrf files that belong to the given .mst file.
#
# History, as recorded by earlier maintainers:
#  - Blocking by file name was a pain on Windows: it blocked xxx.FDT, but
#    if the actual file was xxx.fdt there was a complaint that no plugin
#    could process it, so the plugin went back to using the block
#    expression.
#  - This works now that blocking is case-insensitive on Windows. However,
#    it is a pain for GLI, which will not know what plugin processes the
#    fdt and xrf files; adding them to the process expression causes more
#    problems.
#
#   $filename_full_path - path of the .mst file
#   $block_hash         - passed to util::block_filename for each file
sub store_block_files_tmp {
    my $self = shift(@_);
    my ($filename_full_path, $block_hash) = @_;

    print STDERR "in store block files\n";
    $self->check_auxiliary_files($filename_full_path);

    # Block whichever of the two auxiliary files actually exists.
    foreach my $path_key ('fdt_file_path', 'xrf_file_path') {
        my $auxiliary_file = $self->{$path_key};
        next unless (-e $auxiliary_file);

        print STDERR "$auxiliary_file\n";
        &util::block_filename($block_hash, $auxiliary_file);
    }
}
153
# Works out the paths of the .fdt and .xrf files that accompany an .mst
# file and stores them in $self->{'fdt_file_path'} and
# $self->{'xrf_file_path'}.
#
# For each file the upper-case extension is used if such a file exists;
# otherwise the lower-case extension is stored (whether or not that file
# exists -- callers check for existence themselves).
#
#   $filename - path of the .mst file (suffix matched case-insensitively)
sub check_auxiliary_files {
    my $self = shift(@_);
    my ($filename) = @_;

    my ($path_root) = ($filename =~ /(.*)\.mst$/i);

    foreach my $extension ('fdt', 'xrf') {
        my $upper_case_path = $path_root . "." . uc($extension);
        my $lower_case_path = $path_root . "." . $extension;

        $self->{$extension . '_file_path'} =
            (-e $upper_case_path) ? $upper_case_path : $lower_case_path;
    }
}
169
170
# Reads an ISIS database by running the external IsisGdl program on the
# .mst file and storing its output in $$textref, then parses the matching
# .fdt file into $self->{'fdt_mapping'}.
#
#   $filename - path of the .mst file
#   $encoding - encoding handed to the multiread readers
#   $language - unused here
#   $textref  - scalar ref that receives the exported text
#
# Returns early (leaving $$textref untouched) after reporting an error if
# the .fdt or .xrf file is missing or the IsisGdl pipe cannot be opened.
sub read_file
{
    my $self = shift(@_);
    my ($filename, $encoding, $language, $textref) = @_;
    my $outhandle = $self->{'outhandle'};

    # Path relative to the import directory, used in messages for GLI
    my $mst_file_path_relative = $filename;
    $mst_file_path_relative =~ s/^.+import.(.*?)$/$1/;

    # Check the associated .fdt and .xrf files exist
    $self->check_auxiliary_files($filename);

    if (!-e $self->{'fdt_file_path'}) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS FDT file $self->{'fdt_file_path'}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS FDT file " . $self->{'fdt_file_path'} . ".\n";
        return;
    }
    if (!-e $self->{'xrf_file_path'}) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS XRF file $self->{'xrf_file_path'}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS XRF file " . $self->{'xrf_file_path'} . ".\n";
        return;
    }

    # The text to split is exported from the database by the IsisGdl program.
    # NOTE(review): the file name is interpolated into a shell command line;
    # a name containing '"', '$' or '`' will be interpreted by the shell.
    # The list form of a piped open would avoid this but is not available
    # on every platform/perl combination, so the command is left as is.
    if (!open(FILE, "IsisGdl \"$filename\" |")) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not run IsisGdl on $filename'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not run IsisGdl on $filename: $!.\n";
        return;
    }

    my $reader = multiread->new();
    $reader->set_handle('ISISPlugin::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);

    close(FILE);

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my %fdt_mapping = parse_field_definition_table($self->{'fdt_file_path'}, $encoding);
    $self->{'fdt_mapping'} = \%fdt_mapping;

    # Remove the separator line at the start, and any blank lines, so the
    # data is split and processed properly. Both LF and CRLF line endings
    # are accepted (as in the default split expression), and a run of
    # several blank lines is collapsed completely rather than pairwise.
    $$textref =~ s/^----------\r?\n//;
    $$textref =~ s/(\r?\n)(?:\r?\n)+/$1/g;
}
213
214
# Turns one exported ISIS record into metadata and text on $doc_obj.
#
# Each line of $$textref is expected to look like "tag=<tag> data=<data>".
# The tag is looked up in $self->{'fdt_mapping'} (built by read_file) to
# get the field name, its subfield specification and whether the field is
# repeatable. For every known field the following metadata is added:
#   <Name>        - the field value (subfields joined by subfield_separator)
#   <Name>^<x>    - the value of subfield x
#   <Name>^*      - the first subfield value, for fields that have subfields
#   <Name>^<x>^sub / <Name>^sub
#                 - individual values given as <...> or /.../ (only for
#                   non-repeatable fields)
#   <Name>^all    - all values of the field joined by entry_separator
# An HTML table of the record becomes the document text, and the raw
# record is stored as "ISISRawRecord".
#
# Lines that do not have the expected shape, and tags that are not in the
# field definition table, are skipped.
#
# Always returns 1.
sub process
{
    my $self = shift(@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    # store the auxiliary files so we know which ones were used
    # (mst file becomes the source file)
    $doc_obj->associate_source_file($self->{'fdt_file_path'});
    $doc_obj->associate_source_file($self->{'xrf_file_path'});

    my $section = $doc_obj->get_top_section();
    my $fdt_mapping = $self->{'fdt_mapping'};
    my $subfield_separator = $self->{'subfield_separator'};
    my $entry_separator = $self->{'entry_separator'};
    my $isis_record_html_metadata_value = "<table cellpadding=\"4\" cellspacing=\"0\">";

    # Process each line of the ISIS record, one at a time
    foreach my $line (split(/\n/, $$textref)) {
        $line =~ s/\s+$//; # Remove any nasty whitespace (very important for Windows)

        # Skip anything that is not a "tag=... data=..." line. The tag is
        # matched non-greedily so data containing " data=" is not absorbed
        # into the tag.
        next unless ($line =~ /^tag=(.*?) data=(.+)$/);
        my ($tag, $tag_data) = ($1, $2);

        # Look the tag up without creating an entry for unknown tags
        my $field = $fdt_mapping->{$tag};
        next unless (defined $field);

        # Convert the tag number into a name, and remove any invalid characters
        my $raw_metadata_name = $field->{'name'} || "";
        $raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
        next if ($raw_metadata_name eq "");

        # Metadata field names: title case, then remove spaces
        my $metadata_name = "";
        foreach my $word (split(/\s+/, $raw_metadata_name)) {
            substr($word, 0, 1) =~ tr/a-z/A-Z/;
            $metadata_name .= $word;
        }

        my $has_subfields = (defined $field->{'subfields'} && $field->{'subfields'} ne "");

        # Repeatable fields hold several values separated by '%'; other
        # fields hold one value, which may itself list several values
        # using <...> or /.../.
        my $repeatable = $field->{'repeatable'};
        my @raw_metadata_values = $repeatable ? split(/%/, $tag_data) : ($tag_data);

        my $all_metadata_value = "";
        foreach my $raw_metadata_value (@raw_metadata_values) {
            my $metadata_value = _add_isis_subfield_metadata(
                $doc_obj, $section, $metadata_name, $has_subfields,
                $raw_metadata_value, $subfield_separator, !$repeatable);

            # Add the metadata value
            $doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);

            $all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
            $all_metadata_value .= $metadata_value;
        }

        # Add the "^all" metadata value
        $doc_obj->add_utf8_metadata($section, $metadata_name . "^all", $all_metadata_value);

        $isis_record_html_metadata_value .= "<tr><td valign=top><nobr><b>" . $field->{'name'} . "</b></nobr></td><td valign=top>" . $all_metadata_value . "</td></tr>";
    }

    # Add a reasonably formatted HTML table view of the record as the document text
    $isis_record_html_metadata_value .= "</table>";
    $doc_obj->add_utf8_text($section, $isis_record_html_metadata_value);

    # Add the full raw record as metadata
    my $isis_raw_record_metadata_value = escape_metadata_value($$textref);
    $doc_obj->add_utf8_metadata($section, "ISISRawRecord", $isis_raw_record_metadata_value);

    # Add FileFormat metadata
    $doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");

    # Record was processed successfully
    return 1;
}


# Private helper for process(): parses one field value into its subfields,
# adds the per-subfield metadata to $doc_obj, and returns the escaped
# subfield values joined by $subfield_separator.
#
#   $has_subfields       - true if the field definition lists subfields
#   $raw_metadata_value  - the value as exported, e.g. "^aFoo^bBar"
#   $expand_multi_values - true to also add "^sub" metadata for values
#                          written as <...> or (legacy) /.../
sub _add_isis_subfield_metadata
{
    my ($doc_obj, $section, $metadata_name, $has_subfields,
        $raw_metadata_value, $subfield_separator, $expand_multi_values) = @_;

    my $metadata_value = "";
    while ($raw_metadata_value ne "") {
        # If there is a subfield specifier ("^" plus a lower-case letter), parse it off
        my $sub_metadata_name = $metadata_name;
        if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
            $sub_metadata_name .= "^$1";
        }

        # Parse the value off: everything up to the next "^"
        $raw_metadata_value =~ s/^([^\^]*)//;
        my $sub_metadata_value = $1;

        if ($expand_multi_values) {
            # Multiple values are specified using <...>, or /.../ in the
            # legacy case; <...> takes precedence when both are present.
            my $multi_value_exp;
            if ($sub_metadata_value =~ /<.+>/) {
                $multi_value_exp = qr/<(.+?)>/;
            }
            elsif ($sub_metadata_value =~ /\/.+\//) {
                $multi_value_exp = qr/\/(.+?)\//;
            }

            if (defined $multi_value_exp) {
                # Work on a copy so the full value is still stored below
                my $tmp_sub_metadata_value = $sub_metadata_value;
                while ($tmp_sub_metadata_value =~ s/$multi_value_exp//) {
                    $doc_obj->add_utf8_metadata($section, $sub_metadata_name . "^sub", $1);
                }
            }
        }

        # Escape the metadata value so it appears correctly in the final collection
        $sub_metadata_value = escape_metadata_value($sub_metadata_value);

        if ($sub_metadata_name ne $metadata_name) {
            $doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
        }

        # If this tag has subfields and this is the first, use the value for the CDS/ISIS ^* field
        if ($has_subfields && $metadata_value eq "") {
            $doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
        }

        $metadata_value .= $subfield_separator unless ($metadata_value eq "");
        $metadata_value .= $sub_metadata_value;
    }

    return $metadata_value;
}
378
379
# Parses an ISIS Field Definition Table (.fdt) file.
#
#   $fdtfilename - path of the .fdt file
#   $encoding    - encoding handed to the multiread reader
#
# Returns a hash (as a list) mapping each tag to a hash ref with keys
#   'name'       - the field name (columns 0-29 of the definition line)
#   'subfields'  - the subfield specification (columns 30-49)
#   'repeatable' - the fourth space-separated item of the remaining columns
# Definitions start on the line after a line consisting of "***".
# Dies if the file cannot be opened.
sub parse_field_definition_table
{
    my $fdtfilename = shift(@_);
    my $encoding = shift(@_);

    my %fdtmapping = ();

    open(FDT_FILE, '<', $fdtfilename) || die "Error: Could not open file $fdtfilename.\n";

    # read_file fills the scalar it is given a reference to
    my $fdtfiletext = "";
    my $reader = multiread->new();
    $reader->set_handle('ISISPlugin::FDT_FILE');
    $reader->set_encoding($encoding);
    $reader->read_file(\$fdtfiletext);

    close(FDT_FILE);

    my $amongstdefinitions = 0;
    foreach my $fdtfileline (split(/\n/, $fdtfiletext)) {
        $fdtfileline =~ s/\s+$//; # Remove any nasty spaces at the end of the lines

        if ($amongstdefinitions) {
            # Definition lines are laid out in fixed-width columns
            my $fieldname = &unicode::substr($fdtfileline, 0, 30);
            my $fieldsubfields = &unicode::substr($fdtfileline, 30, 20);
            my $fieldspecs = &unicode::substr($fdtfileline, 50, 50);

            # Remove extra spaces (short lines may yield undefined pieces)
            foreach my $piece ($fieldname, $fieldsubfields, $fieldspecs) {
                $piece = "" unless (defined $piece);
                $piece =~ s/\s+$//;
            }

            # Map from tag number to metadata field title, subfields, and repeatability
            my @specs = split(/ /, $fieldspecs);
            my $fieldtag = $specs[0];
            my $fieldrepeatable = $specs[3];

            # A line too short to carry a tag cannot be looked up; skip it
            # rather than storing it under an empty key.
            next unless (defined $fieldtag && $fieldtag ne "");

            $fdtmapping{$fieldtag} = { 'name' => $fieldname,
                                       'subfields' => $fieldsubfields,
                                       'repeatable' => $fieldrepeatable };
        }
        elsif ($fdtfileline eq "***") {
            $amongstdefinitions = 1;
        }
    }

    return %fdtmapping;
}
425
426
# Returns a copy of the given string with "<" and ">" replaced by
# "&lt;" and "&gt;", and every backslash doubled.
sub escape_metadata_value
{
    my ($escaped) = @_;

    for ($escaped) {
        s/</&lt;/g;
        s/>/&gt;/g;
        s/\\/\\\\/g;
    }

    return $escaped;
}
435
436
# Called after exploding: removes the FDT and XRF files whose paths are
# stored in $self, using util::rm.
sub clean_up_after_exploding
{
    my ($self) = @_;

    # Delete the FDT and XRF files too
    my ($fdt_file, $xrf_file) = @{$self}{'fdt_file_path', 'xrf_file_path'};
    &util::rm($fdt_file);
    return &util::rm($xrf_file);
}
445
446
[6107]4471;
Note: See TracBrowser for help on using the repository browser.