root/main/trunk/greenstone2/perllib/plugins/ISISPlugin.pm @ 27502

Revision 27502, 14.9 KB (checked in by kjdon, 7 years ago)

Trying to fix the double-encoding issue for ISIS files; not sure it is fully resolved yet.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# ISISPlugin.pm -- A plugin for CDS/ISIS databases
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 1999-2004 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package ISISPlugin;
28
29use Encode;
30
31use multiread;
32use SplitTextFile;
33use MetadataRead;
34
35use strict;
36no strict 'refs'; # allow filehandles to be variables and viceversa
37
# ISISPlugin inherits from both MetadataRead and SplitTextFile.
# When both parents provide a method, the parent listed first in @ISA wins.
BEGIN {
    @ISISPlugin::ISA = qw(MetadataRead SplitTextFile);
}
43
44
# Argument table for this plugin. The first three entries reuse the
# descriptions of inherited options but supply defaults from this file's
# get_default_*_exp subs; the last two are specific to ISISPlugin.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name'      => 'block_exp',
        'desc'      => '{BasePlugin.block_exp}',
        'type'      => 'regexp',
        'reqd'      => 'no',
        'deft'      => get_default_block_exp(),
        'hiddengli' => 'yes',
    },
    {
        'name'      => 'split_exp',
        'desc'      => '{SplitTextFile.split_exp}',
        'type'      => 'regexp',
        'reqd'      => 'no',
        'deft'      => get_default_split_exp(),
        'hiddengli' => 'yes',
    },

    # The interesting options: the separators read by process() when it
    # joins repeated entries and subfields into a single value.
    {
        'name' => 'entry_separator',
        'desc' => '{ISISPlugin.entry_separator}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => '<br>',
    },
    {
        'name' => 'subfield_separator',
        'desc' => '{ISISPlugin.subfield_separator}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => ', ',
    },
];

# Plugin description record; new() pushes it onto the caller's "OptList".
my $options = {
    'name'     => 'ISISPlugin',
    'desc'     => '{ISISPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
83
84
# Default process expression: file names ending in ".mst", in any case.
sub get_default_process_exp {
    my $mst_suffix_pattern = '(?i)(\.mst)$';
    return $mst_suffix_pattern;
}
89
90
# Default block expression: file names ending in ".fdt" or ".xrf", in any case.
sub get_default_block_exp {
    my $auxiliary_suffix_pattern = '(?i)(\.fdt|\.xrf)$';
    return $auxiliary_suffix_pattern;
}
96
97   
# Default split expression: a line made of exactly ten dashes, with either
# Unix or Windows line endings around it.
sub get_default_split_exp {
    my $record_separator_pattern = '\r?\n----------\r?\n';
    return $record_separator_pattern;
}
102
103
# Constructor.
#
# Arguments:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the SplitTextFile constructor
#   $hashArgOptLists - hash ref; this plugin's argument table is appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object built by the SplitTextFile constructor, re-blessed into
# $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Explicit class-method call. The indirect-object form
    # "new SplitTextFile(...)" forces perl to guess whether "new" is a method
    # or a function, which is fragile inside a package that defines its own
    # new().
    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # When only plugin information is wanted, leave every option untouched.
    unless ($self->{'info_only'}) {
        # isis plug doesn't care about encoding - it assumes ascii unless the
        # user has specified an encoding
        if (defined $self->{'input_encoding'} && $self->{'input_encoding'} eq "auto") {
            $self->{'input_encoding'} = "ascii";
        }
    }

    return bless $self, $class;
}
127
# Explicitly block the .fdt and .xrf files that accompany an .mst file, as
# an alternative to relying on block_exp.
#
# Notes carried over from the original comment:
#  - This was a pain on Windows: xxx.FDT was blocked, but when the actual
#    file was xxx.fdt the build complained that no plugin could process it,
#    so the plugin was put back to using block_exp for the time being.
#  - It works now that blocking is case-insensitive on Windows, but it is a
#    pain for GLI, which will not know which plugin processes the .fdt and
#    .xrf files; adding them to the process expression causes more problems.
#
# NOTE(review): the "_tmp" suffix presumably keeps this from being picked up
# as the store_block_files hook - confirm before renaming.
sub store_block_files_tmp {

    my $self = shift (@_);
    my ($filename_full_path, $block_hash) = @_;
    print STDERR "in store block files\n";

    # Work out where the auxiliary files should be, then block each one
    # that is actually present.
    $self->check_auxiliary_files($filename_full_path);
    foreach my $path_key ('fdt_file_path', 'xrf_file_path') {
        next unless (-e $self->{$path_key});

        my $auxiliary_file = $self->{$path_key};
        print STDERR "$auxiliary_file\n";
        &util::block_filename($block_hash, $auxiliary_file);
    }
}
154
# Record where the .fdt and .xrf companions of an .mst file are expected.
#
# For each of the two, the upper-case extension (.FDT / .XRF) is used when a
# file of that name exists, otherwise the lower-case one. The results are
# stored in $self->{'fdt_file_path'} and $self->{'xrf_file_path'}; neither
# path is guaranteed to exist afterwards.
sub check_auxiliary_files {
    my $self = shift (@_);
    my ($filename) = @_;

    # Everything before the (case-insensitive) ".mst" suffix
    my ($path_root) = ($filename =~ /(.*)\.mst$/i);

    foreach my $extension ('fdt', 'xrf') {
        my $upper_case_path = $path_root . "." . uc($extension);
        my $lower_case_path = $path_root . "." . $extension;
        $self->{$extension . '_file_path'} =
            (-e $upper_case_path) ? $upper_case_path : $lower_case_path;
    }
}
170   
171
# Export the records of an ISIS master file as text and load the matching
# field definition table.
#
# Arguments:
#   $filename - path of the .mst file
#   $encoding - encoding given to multiread for the exported text and the .fdt
#   $language - accepted for interface compatibility; not used here
#   $textref  - reference to the scalar that receives the exported text
#
# Side effects: sets $self->{'fdt_file_path'} and $self->{'xrf_file_path'}
# (through check_auxiliary_files) and $self->{'fdt_mapping'}.
#
# Returns early, leaving $$textref untouched, when the .fdt or .xrf file is
# missing or when the IsisGdl export cannot be started. In those cases an
# error is printed to the plugin's outhandle, and a <ProcessingError> element
# is printed to STDERR when $self->{'gli'} is set.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;
    my $outhandle = $self->{'outhandle'};

    # Name used in GLI error reports: whatever follows "import" plus one
    # separator character in the path.
    my $mst_file_path_relative = $filename;
    $mst_file_path_relative =~ s/^.+import.(.*?)$/$1/;

    # Check the associated .fdt and .xrf files exist
    $self->check_auxiliary_files($filename);

    if (!-e $self->{'fdt_file_path'}) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS FDT file $self->{'fdt_file_path'}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS FDT file " . $self->{'fdt_file_path'} . ".\n";
        return;
    }
    if (!-e $self->{'xrf_file_path'}) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS XRF file $self->{'xrf_file_path'}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS XRF file " . $self->{'xrf_file_path'} . ".\n";
        return;
    }

    # The text to split is exported from the database by the IsisGdl program.
    # The bareword handle is kept because multiread is given the handle by
    # name below.
    if (!open(FILE, "IsisGdl \"$filename\" |")) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not run IsisGdl on $filename'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not run IsisGdl on $filename: $!\n";
        return;
    }

    my $reader = multiread->new();
    $reader->set_handle('ISISPlugin::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);

    # At this point $$textref is a binary byte string
    # => turn it into a Unicode aware string, so full
    # Unicode aware pattern matching can be used.
    # For instance: 's/\x{0101}//g' or '[[:upper:]]'
    $$textref = decode("utf8",$$textref);

    # For a pipe, close() reports a non-zero exit status of the child, which
    # is the only place a missing or failing IsisGdl shows up. Processing
    # continues with whatever text was read.
    # NOTE(review): assumes IsisGdl exits with status 0 on success - confirm.
    if (!close(FILE)) {
        print $outhandle "Warning: IsisGdl did not finish cleanly for $filename (exit status $?).\n";
    }

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my %fdt_mapping = &parse_field_definition_table($self->{'fdt_file_path'}, $encoding);
    $self->{'fdt_mapping'} = \%fdt_mapping;

    # Remove the line at the start, and any blank lines, so the data is split and processed properly
    $$textref =~ s/^----------\n//;
    $$textref =~ s/\n\n/\n/g;
}
221
222
# Turn one exported ISIS record into metadata and document text.
#
# Arguments (only $textref and $doc_obj are used here):
#   $textref - reference to the record text, one "tag=<n> data=<value>" line
#              per field
#   $doc_obj - document object that receives the metadata and text
#
# For every line whose tag has a non-empty name in $self->{'fdt_mapping'}:
#   - the field name is title-cased with separators removed ("Foo Bar" -> "FooBar")
#   - each value is added as <Name>, its subfields as <Name>^<letter>, and the
#     first subfield of a field that declares subfields as <Name>^*
#   - all values of the field, joined with the entry separator, are added as
#     <Name>^all and as a row of the HTML table used as the document text
# The raw record is added as ISISRawRecord and FileFormat is set to "CDS/ISIS".
#
# Always returns 1.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    # store the auxiliary files so we know which ones were used
    # (mst file becomes the source file)
    $doc_obj->associate_source_file($self->{'fdt_file_path'});
    $doc_obj->associate_source_file($self->{'xrf_file_path'});

    my $section = $doc_obj->get_top_section();
    my $fdt_mapping = $self->{'fdt_mapping'};
    my $subfield_separator = $self->{'subfield_separator'};
    my $entry_separator = $self->{'entry_separator'};
    my $isis_record_html_metadata_value = "<table cellpadding=\"4\" cellspacing=\"0\">";

    # Process each line of the ISIS record, one at a time
    foreach my $line (split(/\n/, $$textref)) {
        $line =~ s/(\s*)$//;  # Remove any nasty whitespace (very important for Windows)

        # Skip anything that is not a field line. Without this check a
        # failed match would leave $1 holding the whitespace captured just
        # above and that would be looked up as a tag.
        next unless ($line =~ /^tag=(.*) data=(.+)$/);
        my $tag = $1;
        my $tag_data = $2;

        # Look the tag up without creating an entry for unknown tags in the
        # shared mapping.
        my $field_definition = $fdt_mapping->{$tag} || {};

        # Convert the tag number into a name, and remove any invalid characters
        my $raw_metadata_name = $field_definition->{'name'} || "";
        $raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
        next if ($raw_metadata_name eq "");

        # Metadata field names: title case, then remove spaces
        my $metadata_name = "";
        foreach my $word (split(/\s+/, $raw_metadata_name)) {
            substr($word, 0, 1) =~ tr/a-z/A-Z/;
            $metadata_name .= $word;
        }

        my $has_subfields = (defined $field_definition->{'subfields'}
                             && $field_definition->{'subfields'} ne "");

        # Repeatable fields hold several values separated by '%'.
        # Non-repeatable fields hold one value, which may itself list
        # several items as <...> or /.../.
        my $repeatable = $field_definition->{'repeatable'};
        my @raw_metadata_values = $repeatable ? split(/%/, $tag_data) : ($tag_data);

        my $all_metadata_name = $metadata_name . "^all";
        my $all_metadata_value = "";
        foreach my $raw_metadata_value (@raw_metadata_values) {
            my $metadata_value = &_add_field_value_metadata(
                $doc_obj, $section, $metadata_name, $has_subfields,
                $subfield_separator, $raw_metadata_value, !$repeatable);

            $all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
            $all_metadata_value .= $metadata_value;
        }

        # Add the "^all" metadata value
        $doc_obj->add_utf8_metadata($section, $all_metadata_name, $all_metadata_value);

        $isis_record_html_metadata_value .= "<tr><td valign=top><nobr><b>" . $field_definition->{'name'} . "</b></nobr></td><td valign=top>" . $all_metadata_value . "</td></tr>";
    }

    # Add a reasonably formatted HTML table view of the record as the document text
    $isis_record_html_metadata_value .= "</table>";
    $doc_obj->add_utf8_text($section, $isis_record_html_metadata_value);

    # Add the full raw record as metadata
    my $isis_raw_record_metadata_value = &escape_metadata_value($$textref);
    $doc_obj->add_utf8_metadata($section, "ISISRawRecord", $isis_raw_record_metadata_value);

    # Add FileFormat metadata
    $doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");

    # Record was processed successfully
    return 1;
}


# Helper for process(): split one raw field value into its "^x" subfields,
# add the per-subfield and whole-value metadata to $doc_obj, and return the
# whole value (escaped subfield values joined with $subfield_separator).
#
# When $expand_multiple_values is true, items written as <...> (or, failing
# that, the legacy /.../) inside a subfield are also added, unescaped, as
# <subfield name>^sub.
sub _add_field_value_metadata
{
    my ($doc_obj, $section, $metadata_name, $has_subfields,
        $subfield_separator, $raw_metadata_value, $expand_multiple_values) = @_;

    my $metadata_value = "";
    while ($raw_metadata_value ne "") {
        # If there is a subfield specifier, parse it off
        my $sub_metadata_name = $metadata_name;
        if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
            $sub_metadata_name .= "^$1";
        }

        # Parse the value off: everything up to the next '^'
        $raw_metadata_value =~ s/^([^\^]*)//;
        my $sub_metadata_value = $1;

        if ($expand_multiple_values) {
            my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
            my $tmp_sub_metadata_value = $sub_metadata_value;

            # Deal with the case when multiple values are specified using <...>
            if ($sub_metadata_value =~ /\<(.+)\>/) {
                while ($tmp_sub_metadata_value =~ s/\<(.+?)\>//) {
                    $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $1);
                }
            }
            # Deal with the legacy case when multiple values are specified using /.../
            elsif ($sub_metadata_value =~ /\/(.+)\//) {
                while ($tmp_sub_metadata_value =~ s/\/(.+?)\///) {
                    $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $1);
                }
            }
        }

        # Escape the metadata value so it appears correctly in the final collection
        $sub_metadata_value = &escape_metadata_value($sub_metadata_value);

        if ($sub_metadata_name ne $metadata_name) {
            $doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
        }

        # If this tag has subfields and this is the first, use the value for the CDS/ISIS ^* field
        if ($has_subfields && $metadata_value eq "") {
            $doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
        }

        $metadata_value .= $subfield_separator unless ($metadata_value eq "");
        $metadata_value .= $sub_metadata_value;
    }

    # Add the metadata value
    $doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);

    return $metadata_value;
}
386
387
# Parse a CDS/ISIS Field Definition Table (.fdt) file.
#
# Arguments:
#   $fdtfilename - path of the .fdt file
#   $encoding    - encoding given to multiread when reading the file
#
# Returns a hash (as a list) mapping each field tag to a hash ref with:
#   'name'       - field name         (columns  0-29 of the definition line)
#   'subfields'  - subfield letters   (columns 30-49)
#   'repeatable' - fourth space-separated item of the specification column
# Only lines after the "***" marker line are treated as definitions.
#
# Dies if the file cannot be opened.
sub parse_field_definition_table
{
    my $fdtfilename = shift(@_);
    my $encoding = shift(@_);

    my %fdtmapping = ();

    # Bareword handle kept because multiread is given the handle by name.
    open(FDT_FILE, '<', $fdtfilename) || die "Error: Could not open file $fdtfilename.\n";

    # multiread's read_file is given a reference to the output scalar, as in
    # this plugin's own read_file. Passing the plain (empty) string only
    # worked as a symbolic reference to a global variable, which is shared by
    # every call.
    my $fdtfiletext = "";
    my $reader = multiread->new();
    $reader->set_handle('ISISPlugin::FDT_FILE');
    $reader->set_encoding($encoding);
    $reader->read_file(\$fdtfiletext);

    close(FDT_FILE);

    my $amongstdefinitions = 0;
    foreach my $fdtfileline (split(/\n/, $fdtfiletext)) {
        $fdtfileline =~ s/(\s*)$//;  # Remove any nasty spaces at the end of the lines

        if ($amongstdefinitions) {
            my $fieldname      = &unicode::substr($fdtfileline,  0, 30);
            my $fieldsubfields = &unicode::substr($fdtfileline, 30, 20);
            my $fieldspecs     = &unicode::substr($fdtfileline, 50, 50);

            # Remove extra spaces
            $fieldname =~ s/(\s*)$//;
            $fieldsubfields =~ s/(\s*)$//;
            $fieldspecs =~ s/(\s*)$//;

            # Map from tag number to metadata field title, subfields, and repeatability
            my @fieldspec_items = split(/ /, $fieldspecs);
            my $fieldtag = $fieldspec_items[0];
            my $fieldrepeatable = $fieldspec_items[3];

            # A blank or truncated line carries no tag; there is nothing to map.
            next unless (defined $fieldtag && $fieldtag ne "");

            $fdtmapping{$fieldtag} = { 'name' => $fieldname,
                                       'subfields' => $fieldsubfields,
                                       'repeatable' => $fieldrepeatable };
        }
        elsif ($fdtfileline eq "***") {
            $amongstdefinitions = 1;
        }
    }

    return %fdtmapping;
}
433
434
# Escape a metadata value: '<' becomes '&lt;', '>' becomes '&gt;' and every
# backslash is doubled. Returns the escaped copy; the argument is not modified.
sub escape_metadata_value
{
    my ($text) = @_;

    my %replacement_for = (
        '<'  => '&lt;',
        '>'  => '&gt;',
        '\\' => '\\\\',
    );
    $text =~ s/([<>\\])/$replacement_for{$1}/g;

    return $text;
}
443
444
# After the .mst file has been exploded, remove its two auxiliary files as
# well, using the paths recorded in $self->{'fdt_file_path'} and
# $self->{'xrf_file_path'}.
sub clean_up_after_exploding
{
    my ($self) = @_;

    my ($fdt_path, $xrf_path) = @{$self}{'fdt_file_path', 'xrf_file_path'};

    # Delete the FDT and XRF files too
    &util::rm($fdt_path);
    return &util::rm($xrf_path);
}
453
454
4551;
Note: See TracBrowser for help on using the browser.