root/main/trunk/greenstone2/perllib/plugins/ISISPlugin.pm @ 24547

Revision 24547, 14.7 KB (checked in by ak19, 9 years ago)

Added new abstract plugin MetadataRead that defines can_process_this_file_for_metadata, which MetadataPlugin subclasses can inherit (if MetadataRead is listed first in the ISA inheritance list) and which will then override the one defined in BasePlugin. For now committing the MARC, ISIS and OAI plugins, which now additionally inherit from MetadataRead. Other metadata plugins also need to be committed.

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# ISISPlugin.pm -- A plugin for CDS/ISIS databases
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 1999-2004 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package ISISPlugin;
28
29
30use multiread;
31use SplitTextFile;
32use MetadataRead;
33
34use strict;
35no strict 'refs'; # allow filehandles to be variables and viceversa
36
# ISISPlugin is a sub-class of SplitTextFile that also inherits from
# MetadataRead. Methods with identical signatures take precedence in the
# order given in the ISA list, so MetadataRead is listed first.
sub BEGIN {
    @ISISPlugin::ISA = ('MetadataRead', 'SplitTextFile');
}
42
43
# Plugin argument descriptions. Each 'desc' is a resource key in curly
# braces rather than literal text. The three regexp defaults come from the
# get_default_*_exp() subs defined further down in this package.
my $arguments =
    [ { 'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_process_exp() },
      { 'name' => "block_exp",
        'desc' => "{BasePlugin.block_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_block_exp(),
        'hiddengli' => "yes" },
      { 'name' => "split_exp",
        'desc' => "{SplitTextFile.split_exp}",
        'type' => "regexp",
        'reqd' => "no",
        'deft' => get_default_split_exp(),
        'hiddengli' => "yes" },

      # The interesting options
      { 'name' => "entry_separator",
        'desc' => "{ISISPlugin.entry_separator}",
        'type' => "string",
        'reqd' => "no",
        'deft' => "<br>" },
      { 'name' => "subfield_separator",
        'desc' => "{ISISPlugin.subfield_separator}",
        'type' => "string",
        'reqd' => "no",
        'deft' => ", " }
      ];

# Plugin description record; new() pushes it onto the caller's "OptList".
my $options = { 'name'     => "ISISPlugin",
                'desc'     => "{ISISPlugin.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'explodes' => "yes",
                'args'     => $arguments };
82
83
# This plugin processes files with the suffix ".mst" (matched
# case-insensitively).
# Returns the default process_exp as a regular expression string.
sub get_default_process_exp {
    return q{(?i)(\.mst)$};
}
88
89
# This plugin blocks files with the suffix ".fdt" and ".xrf" (matched
# case-insensitively).
# Returns the default block_exp as a regular expression string.
sub get_default_block_exp {
    return q{(?i)(\.fdt|\.xrf)$};
}
95
96   
# This plugin splits the input text at the "----------" lines; either
# LF or CRLF line endings are accepted around the separator.
# Returns the default split_exp as a regular expression string.
sub get_default_split_exp {
    return q{\r?\n----------\r?\n};
}
101
102
# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to SplitTextFile->new
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to
#                      its "ArgList" entry and its options to "OptList"
# Returns the object built by SplitTextFile->new, re-blessed into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Explicit class-method call rather than indirect-object syntax
    # ("new SplitTextFile(...)"), which perl can mis-parse.
    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);
    bless $self, $class;

    # When only plugin information is wanted, don't worry about any options.
    return $self if ($self->{'info_only'});

    # The ISIS plugin doesn't care about encoding - it assumes ascii unless
    # the user has specified an encoding.
    if ($self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'} = "ascii";
    }
    return $self;
}
126
# Block the .fdt and .xrf files that belong to an .mst database.
#
# History kept from the original notes: blocking by explicit filename was a
# pain on Windows (it blocked xxx.FDT, but if the actual file was xxx.fdt
# the build complained that no plugin could process it), so the plugin went
# back to using block_exp. Explicit blocking works now that blocking is
# case-insensitive on Windows, but it is still awkward for GLI, which will
# not know which plugin processes the fdt and xrf files; adding them to the
# process expression causes more problems.
#
#   $filename_full_path - path of the .mst file
#   $block_hash         - passed to util::block_filename for each
#                         auxiliary file that exists on disk
sub store_block_files_tmp {
    my $self = shift (@_);
    my ($filename_full_path, $block_hash) = @_;

    print STDERR "in store block files\n";
    $self->check_auxiliary_files($filename_full_path);

    # FDT first, then XRF; only files that actually exist are blocked.
    foreach my $path_key ('fdt_file_path', 'xrf_file_path') {
        my $auxiliary_file = $self->{$path_key};
        next unless (-e $auxiliary_file);
        print STDERR "$auxiliary_file\n";
        util::block_filename($block_hash, $auxiliary_file);
    }
}
153
# Work out where the .fdt and .xrf files that accompany an .mst database
# should be, and store the results in $self->{'fdt_file_path'} and
# $self->{'xrf_file_path'}.
#
# For each file the upper-case suffix is tried first; if no such file
# exists the lower-case suffix is recorded instead (whether or not that
# file exists - callers test with -e themselves).
#
#   $filename - path of the .mst file
sub check_auxiliary_files {
    my $self = shift (@_);
    my ($filename) = @_;

    # Drop a trailing ".mst" (any case). A name without that suffix is used
    # as the root unchanged; previously the root ended up undefined and the
    # paths degenerated to ".FDT" / ".XRF".
    my $database_file_path_root = defined($filename) ? $filename : "";
    $database_file_path_root =~ s/\.mst$//i;

    foreach my $suffix ('fdt', 'xrf') {
        my $auxiliary_path = $database_file_path_root . "." . uc($suffix);
        if (!-e $auxiliary_path) {
            $auxiliary_path = $database_file_path_root . "." . $suffix;
        }
        $self->{$suffix . '_file_path'} = $auxiliary_path;
    }
}
169   
170
# Export the records of an ISIS database as text and load its field
# definitions.
#
#   $filename - path of the .mst file
#   $encoding - encoding handed to multiread for both the exported text and
#               the .fdt file
#   $language - not used here
#   $textref  - scalar ref that receives the exported text
#
# Side effects: sets $self->{'fdt_file_path'}, $self->{'xrf_file_path'}
# (via check_auxiliary_files) and $self->{'fdt_mapping'}.
# Returns early, after reporting the problem, when an auxiliary file is
# missing or the export program cannot be started.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;
    my $outhandle = $self->{'outhandle'};

    # Path of the database relative to the import directory, for GLI messages
    my $mst_file_path_relative = $filename;
    $mst_file_path_relative =~ s/^.+import.(.*?)$/$1/;

    # Check the associated .fdt and .xrf files exist
    $self->check_auxiliary_files($filename);

    foreach my $auxiliary (['FDT', 'fdt_file_path'], ['XRF', 'xrf_file_path']) {
        my ($label, $path_key) = @$auxiliary;
        next if (-e $self->{$path_key});

        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS $label file $self->{$path_key}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS $label file " . $self->{$path_key} . ".\n";
        return;
    }

    # The text to split is exported from the database by the IsisGdl program.
    # NOTE(review): the filename is interpolated into a shell command line;
    # a name containing a double quote, backtick or "$" would be interpreted
    # by the shell. The string form is kept because multiread needs the named
    # FILE handle and list-form pipe opens are not available on every
    # platform/perl this may run on - confirm before changing.
    if (!open(FILE, "IsisGdl \"$filename\" |")) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not run IsisGdl'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not run IsisGdl on $filename: $!\n";
        return;
    }

    my $reader = multiread->new();
    $reader->set_handle('ISISPlugin::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);

    close(FILE);

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my %fdt_mapping = parse_field_definition_table($self->{'fdt_file_path'}, $encoding);
    $self->{'fdt_mapping'} = \%fdt_mapping;

    # Remove the line at the start, and any blank lines, so the data is split and processed properly
    $$textref =~ s/^----------\n//;
    $$textref =~ s/\n\n/\n/g;
}
213
214
# Convert one exported ISIS record into metadata and text on $doc_obj.
#
#   $textref - scalar ref to the record text, one "tag=N data=..." line per
#              field
#   $doc_obj - document object that receives the metadata and text
#   The remaining parameters ($pluginfo, $base_dir, $file, $metadata, $gli)
#   are accepted but not used here.
#
# For every field whose tag has a name in $self->{'fdt_mapping'} this adds:
#   <Name>      one value per entry, subfields joined by subfield_separator
#   <Name>^x    the value of subfield x
#   <Name>^*    the first subfield value, when the field declares subfields
#   <Name>^sub  (non-repeatable fields only) each <...> or /.../ delimited
#               value, unescaped
#   <Name>^all  all entries joined by entry_separator
# It also sets the document text to an HTML table of the record and adds
# "ISISRawRecord" and "FileFormat" metadata. Always returns 1.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    # store the auxiliary files so we know which ones were used
    # (mst file becomes the source file)
    $doc_obj->associate_source_file($self->{'fdt_file_path'});
    $doc_obj->associate_source_file($self->{'xrf_file_path'});

    my $section = $doc_obj->get_top_section();
    my $fdt_mapping = $self->{'fdt_mapping'};
    my $subfield_separator = $self->{'subfield_separator'};
    my $entry_separator = $self->{'entry_separator'};
    my $isis_record_html_metadata_value = "<table cellpadding=\"4\" cellspacing=\"0\">";

    # Process each line of the ISIS record, one at a time
    foreach my $line (split(/\n/, $$textref)) {
        $line =~ s/\s+$//;  # Remove any nasty whitespace (very important for Windows)

        # Skip lines that are not "tag=... data=..." instead of carrying on
        # with capture variables left over from an earlier match.
        next unless ($line =~ /^tag=(.*) data=(.+)$/);
        my ($tag, $tag_data) = ($1, $2);

        # Tags missing from the field definition table have no name; skip
        # them without creating an empty entry in the shared mapping.
        next unless (exists $fdt_mapping->{$tag});
        my $field = $fdt_mapping->{$tag};

        # Convert the tag number into a name, and remove any invalid characters
        my $raw_metadata_name = $field->{'name'} || "";
        $raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
        next if ($raw_metadata_name eq "");

        # Metadata field names: title case, then remove spaces
        my $metadata_name = "";
        foreach my $word (split(/\s+/, $raw_metadata_name)) {
            substr($word, 0, 1) =~ tr/a-z/A-Z/;
            $metadata_name .= $word;
        }

        # Repeatable fields hold several entries separated by '%'; any other
        # field is a single entry, and only those get the "^sub" expansion.
        my $is_repeatable = $field->{'repeatable'};
        my @raw_entries = $is_repeatable ? split(/%/, $tag_data) : ($tag_data);

        my $all_metadata_value = "";
        foreach my $raw_entry (@raw_entries) {
            my $metadata_value = _add_subfield_metadata($doc_obj, $section, $metadata_name, $raw_entry,
                                                        $field->{'subfields'}, $subfield_separator,
                                                        !$is_repeatable);

            # Add the metadata value
            $doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);

            $all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
            $all_metadata_value .= $metadata_value;
        }

        # Add the "^all" metadata value
        $doc_obj->add_utf8_metadata($section, $metadata_name . "^all", $all_metadata_value);

        $isis_record_html_metadata_value .= "<tr><td valign=top><nobr><b>" . $field->{'name'} . "</b></nobr></td><td valign=top>" . $all_metadata_value . "</td></tr>";
    }

    # Add a reasonably formatted HTML table view of the record as the document text
    $isis_record_html_metadata_value .= "</table>";
    $doc_obj->add_utf8_text($section, $isis_record_html_metadata_value);

    # Add the full raw record as metadata
    my $isis_raw_record_metadata_value = escape_metadata_value($$textref);
    $doc_obj->add_utf8_metadata($section, "ISISRawRecord", $isis_raw_record_metadata_value);

    # Add FileFormat metadata
    $doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");

    # Record was processed successfully
    return 1;
}


# Private helper for process(): split one field entry into its subfields,
# add the per-subfield metadata to $doc_obj, and return the escaped subfield
# values joined by $subfield_separator.
#
#   $raw_metadata_value     - the entry text; subfields are introduced by
#                             "^" followed by a lower-case letter
#   $subfields              - the field's subfield specification; when it is
#                             not empty the first value is also stored under
#                             "<name>^*"
#   $expand_multiple_values - when true, values wrapped in <...> (or, failing
#                             that, the legacy /.../) are additionally stored
#                             unescaped under "<subfield name>^sub"
sub _add_subfield_metadata
{
    my ($doc_obj, $section, $metadata_name, $raw_metadata_value,
        $subfields, $subfield_separator, $expand_multiple_values) = @_;

    my $metadata_value = "";
    while ($raw_metadata_value ne "") {
        # If there is a subfield specifier, parse it off
        my $sub_metadata_name = $metadata_name;
        if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
            $sub_metadata_name .= "^$1";
        }

        # Parse the value off (everything up to the next '^')
        $raw_metadata_value =~ s/^([^\^]*)//;
        my $sub_metadata_value = $1;

        if ($expand_multiple_values) {
            my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
            my $remaining_value = $sub_metadata_value;

            # Deal with the case when multiple values are specified using <...>
            if ($sub_metadata_value =~ /\<(.+)\>/) {
                while ($remaining_value =~ s/\<(.+?)\>//) {
                    my $sub_sub_metadata_value = $1;
                    $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
                }
            }
            # Deal with the legacy case when multiple values are specified using /.../
            elsif ($sub_metadata_value =~ /\/(.+)\//) {
                while ($remaining_value =~ s/\/(.+?)\///) {
                    my $sub_sub_metadata_value = $1;
                    $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
                }
            }
        }

        # Escape the metadata value so it appears correctly in the final collection
        $sub_metadata_value = escape_metadata_value($sub_metadata_value);

        if ($sub_metadata_name ne $metadata_name) {
            $doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
        }

        # If this tag has subfields and this is the first, use the value for the CDS/ISIS ^* field
        if ($subfields ne "" && $metadata_value eq "") {
            $doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
        }

        $metadata_value .= $subfield_separator unless ($metadata_value eq "");
        $metadata_value .= $sub_metadata_value;
    }

    return $metadata_value;
}
378
379
# Parse an ISIS Field Definition Table (.fdt) file.
#
#   $fdtfilename - path of the .fdt file
#   $encoding    - encoding handed to multiread when reading the file
#
# Lines before the "***" marker are ignored. Each later line is cut into
# three fixed-width columns: name (30 characters), subfields (20) and
# specifications (50). The specifications are split on single spaces; the
# first item is used as the tag and the fourth as the repeatable flag.
#
# Returns a hash (as a list) mapping tag => { 'name', 'subfields',
# 'repeatable' }. Dies if the file cannot be opened.
sub parse_field_definition_table
{
    my $fdtfilename = shift(@_);
    my $encoding = shift(@_);

    my %fdtmapping = ();

    # The handle stays a package bareword because multiread is given it by name.
    open(FDT_FILE, "<", $fdtfilename) || die "Error: Could not open file $fdtfilename.\n";

    # Pass a real reference, as read_file() in this package does with its
    # $textref. The original passed the string itself and then read
    # $$fdtfiletext, a symbolic dereference that only gets past perl
    # because of "no strict 'refs'".
    my $fdtfiletext = "";
    my $reader = multiread->new();
    $reader->set_handle('ISISPlugin::FDT_FILE');
    $reader->set_encoding($encoding);
    $reader->read_file(\$fdtfiletext);

    close(FDT_FILE);

    my $amongstdefinitions = 0;
    foreach my $fdtfileline (split(/\n/, $fdtfiletext)) {
        $fdtfileline =~ s/\s+$//;  # Remove any nasty spaces at the end of the lines

        if ($amongstdefinitions) {
            my $fieldname      = &unicode::substr($fdtfileline,  0, 30);
            my $fieldsubfields = &unicode::substr($fdtfileline, 30, 20);
            my $fieldspecs     = &unicode::substr($fdtfileline, 50, 50);

            # A blank or short line may leave columns empty; treat them as ""
            foreach my $column ($fieldname, $fieldsubfields, $fieldspecs) {
                $column = "" unless (defined $column);
                $column =~ s/\s+$//;  # Remove extra spaces
            }

            # Map from tag number to metadata field title, subfields, and repeatability
            my @fieldspecitems = split(/ /, $fieldspecs);
            my $fieldtag = $fieldspecitems[0];

            # A line without a tag cannot be looked up; do not store it under ""
            next unless (defined $fieldtag && $fieldtag ne "");

            $fdtmapping{$fieldtag} = { 'name' => $fieldname,
                                       'subfields' => $fieldsubfields,
                                       'repeatable' => $fieldspecitems[3] };
        }
        elsif ($fdtfileline eq "***") {
            $amongstdefinitions = 1;
        }
    }

    return %fdtmapping;
}
425
426
# Escape a metadata value so it appears correctly in the final collection:
# "<" becomes "&lt;", ">" becomes "&gt;" and every backslash is doubled.
# Returns the escaped copy; the caller's value is not modified.
sub escape_metadata_value
{
    my ($value) = @_;
    $value =~ s/</&lt;/g;
    $value =~ s/>/&gt;/g;
    $value =~ s/\\/\\\\/g;
    return $value;
}
435
436
# Remove the auxiliary files of the database with util::rm: the FDT file
# first, then the XRF file, using the paths recorded in
# $self->{'fdt_file_path'} and $self->{'xrf_file_path'}.
# A path that was never recorded is skipped rather than passed on as undef.
sub clean_up_after_exploding
{
    my $self = shift(@_);

    # Delete the FDT and XRF files too
    foreach my $path_key ('fdt_file_path', 'xrf_file_path') {
        my $auxiliary_file = $self->{$path_key};
        next unless (defined $auxiliary_file);
        util::rm($auxiliary_file);
    }
}
445
446
4471;
Note: See TracBrowser for help on using the browser.