source: trunk/gsdl/perllib/plugins/ISISPlug.pm@ 10229

Last change on this file since 10229 was 10218, checked in by kjdon, 19 years ago

Jeffrey's new parsing modifications, committed approx 6 July, 15.16

  • Property svn:keywords set to Author Date Id Revision
File size: 8.2 KB
RevLine 
[6107]1###########################################################################
2#
3# ISISPlug.pm -- A plugin for CDS/ISIS databases
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
[7686]9# Copyright 1999-2004 New Zealand Digital Library Project
[6107]10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package ISISPlug;
28
29
30use multiread;
31use SplitPlug;
32
33
34# ISISPlug is a sub-class of SplitPlug.
35sub BEGIN {
36 @ISA = ('SplitPlug');
37}
38
39
40my $arguments =
[7686]41 [ { 'name' => "process_exp",
[6408]42 'desc' => "{BasPlug.process_exp}",
43 'type' => "regexp",
44 'reqd' => "no",
[6107]45 'deft' => &get_default_process_exp() },
46 { 'name' => "block_exp",
47 'desc' => "{BasPlug.block_exp}",
[6408]48 'type' => "regexp",
[7686]49 'reqd' => "no",
[6107]50 'deft' => &get_default_block_exp() },
[7686]51 { 'name' => "split_exp",
52 'desc' => "{SplitPlug.split_exp}",
53 'type' => "regexp",
54 'reqd' => "no",
55 'deft' => &get_default_split_exp() },
56
57 # The interesting options
58 { 'name' => "entry_separator",
59 'desc' => "{ISISPlug.entry_separator}",
60 'type' => "string",
61 'reqd' => "no",
62 'deft' => "<br>" },
[6107]63 { 'name' => "subfield_separator",
64 'desc' => "{ISISPlug.subfield_separator}",
65 'type' => "string",
66 'reqd' => "no",
[8563]67 'deft' => ", " }
[6408]68 ];
[6107]69
70my $options = { 'name' => "ISISPlug",
71 'desc' => "{ISISPlug.desc}",
[6408]72 'abstract' => "no",
73 'inherits' => "yes",
[8762]74 'explodes' => "yes",
[6107]75 'args' => $arguments };
76
77
78# This plugin processes files with the suffix ".mst"
79sub get_default_process_exp {
80 return q^(?i)(\.mst)$^;
81}
82
83
84# This plugin blocks files with the suffix ".fdt" and ".xrf"
85sub get_default_block_exp {
86 return q^(?i)(\.fdt|\.xrf)$^;
87}
88
89
90# This plugin splits the input text at the "----------" lines
91sub get_default_split_exp {
[9998]92 return q^\r?\n----------\r?\n^;
[6107]93}
94
95
[8563]96sub new
97{
[10218]98 my ($class) = shift (@_);
99 my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
100 push(@$pluginlist, $class);
[6107]101
[10218]102 if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
103 if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
[6107]104
[10218]105 my $self = (defined $hashArgOptLists)? new SplitPlug($pluginlist,$inputargs,$hashArgOptLists): new SplitPlug($pluginlist,$inputargs);
[6107]106
107 return bless $self, $class;
108}
109
110
[8563]111sub read_file
[7686]112{
[6107]113 my $self = shift (@_);
114 my ($filename, $encoding, $language, $textref) = @_;
115
[8563]116 my ($databasename) = ($filename =~ /(.*)\.mst$/i);
[6107]117
[7048]118 # Check the associated .fdt and .xrf files exist
119 # These files must have a lowercase extension for the IsisGdl program to work
120 # Bailing out because of this is kind of crappy but it is only an issue on Unix
121 my $fdtfilename = $databasename . ".fdt";
122 if (! -e $fdtfilename) {
[7049]123 die "Error: Could not find ISIS FDT file $fdtfilename.\n";
[7048]124 }
125 my $xrffilename = $databasename . ".xrf";
126 if (! -e $xrffilename) {
[7049]127 die "Error: Could not find ISIS XRF file $xrffilename.\n";
[7048]128 }
129
[6107]130 # The text to split is exported from the database by the IsisGdl program
[7021]131 open(FILE, "IsisGdl \"$filename\" |");
[6107]132
133 my $reader = new multiread();
[8563]134 $reader->set_handle('ISISPlug::FILE');
135 $reader->set_encoding($encoding);
136 $reader->read_file($textref);
[6107]137
138 close(FILE);
139
140 # Parse the associated ISIS database Field Definition Table file (.fdt)
141 my %fdtmapping = &parse_field_definition_table($fdtfilename);
142
143 # Map the tag numbers to tag names, using the FDT mapping
[9998]144 $$textref =~ s/\r?\ntag=(\d+) /\ntag=$fdtmapping{$1}{'title'} /g;
[6107]145
[7686]146 # Remove the line at the start so it is split and processed properly
147 $$textref =~ s/^----------\n//;
[6107]148}
149
150
151sub process
152{
153 my $self = shift (@_);
[6332]154 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[6107]155 my $outhandle = $self->{'outhandle'};
156
157 my $subfield_separator = $self->{'subfield_separator'};
158 my $entry_separator = $self->{'entry_separator'};
159
160 # Report that we're processing the file
[6332]161 print STDERR "<Processing n='$file' p='ISISPlug'>\n" if ($gli);
[6107]162 print $outhandle "IsisPlug: processing $file\n"
163 if ($self->{'verbosity'}) > 1;
164
165 # Process each line of the ISIS record, one at a time
166 foreach $line (split(/\n/, $$textref)) {
[8646]167 $line =~ /^tag=(.*) data=(.+)$/;
[7686]168 my $rawtagname = $1;
169 my $rawtagdata = $2;
[8646]170 # print STDERR "Raw tag: $rawtagname, Raw data: $rawtagdata\n";
171 next if ($rawtagname eq "");
[6107]172
173 # Metadata field names: title case, then remove spaces
[7686]174 my $tagname = "";
[6107]175 foreach $word (split(/\s+/, $rawtagname)) {
176 substr($word, 0, 1) =~ tr/a-z/A-Z/;
177 $tagname .= $word;
178 }
179
[6123]180 # Make sure there is nothing bad in the tag names
181 $tagname =~ s/&//g;
182
[6107]183 # Handle each piece of metadata ('%' separated)
[7686]184 my $completetagvalue = "";
[6107]185 foreach $rawtagvalue (split(/%/, $rawtagdata)) {
186 $completetagvalue .= $entry_separator unless ($completetagvalue eq "");
187
188 # Metadata field values: take care with subfields
[7686]189 my $completeentryvalue = "";
[6107]190 while ($rawtagvalue ne "") {
191 # If there is a subfield specifier, parse it off
[7686]192 my $subfieldname = "";
[8646]193 if ($rawtagvalue =~ s/^\^// && $rawtagvalue =~ s/([a-z])//) {
[8563]194 $subfieldname = "^$1";
[6107]195 }
196
197 # Parse the metadata value off
198 $rawtagvalue =~ s/^([^\^]*)//;
[7686]199 my $metadatafieldname = $tagname . $subfieldname;
200 my $metadatafieldvalue = $1;
[8749]201
[6107]202 # Handle Keywords specially
203 if ($metadatafieldname eq "Keywords") {
[7686]204 my $keywordmetadatavalue = $metadatafieldvalue;
205 my $keywordlist = "";
[9998]206 while ($keywordmetadatavalue =~ s/\<(.+?)\>//) {
[7686]207 my $keyword = $1;
[6107]208 $doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $keyword);
209 $keywordlist .= ", " unless ($keywordlist eq "");
210 $keywordlist .= $keyword;
211 }
212
213 $metadatafieldvalue = $keywordlist;
214 }
215
[9998]216 # Escape any '<' and '>' characters so they appear correctly in the final collection
217 $metadatafieldvalue =~ s/\</&lt;/g;
218 $metadatafieldvalue =~ s/\>/&gt;/g;
219
220 # We have already added Keywords metadata above
221 unless ($metadatafieldname eq "Keywords") {
[6107]222 $doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $metadatafieldvalue);
223 }
224
225 $completeentryvalue .= $subfield_separator unless ($completeentryvalue eq "");
226 $completeentryvalue .= $metadatafieldvalue;
227 }
228
229 $completetagvalue .= $completeentryvalue;
230 }
231
[8563]232 $doc_obj->add_utf8_metadata($cursection, $tagname . "^all", $completetagvalue);
[6107]233 }
[8563]234
[6107]235 # Add the full record as the document text
236 $$textref =~ s/\</&lt;/g;
237 $$textref =~ s/\>/&gt;/g;
[8563]238 $doc_obj->add_utf8_text($cursection, $$textref);
[6107]239
[8563]240 # Add FileFormat metadata
241 $doc_obj->add_utf8_metadata($cursection, "FileFormat", "CDS/ISIS");
[7686]242
243 # Record was processed successfully (and there was no document obtained)
[6107]244 return 1;
245}
246
247
248sub parse_field_definition_table
249{
[7686]250 my $fdtfilename = shift(@_);
[6107]251
[7686]252 my %fdtmapping = ();
[6107]253
254 open(FDT_FILE, "<$fdtfilename") || die "Error: Could not open file $fdtfilename.\n";
255
[7686]256 my $amongstdefinitions = 0;
[6107]257 foreach $fdtfileline (<FDT_FILE>) {
258 $fdtfileline =~ s/(\s*)$//; # Remove any nasty spaces at the end of the lines
259
260 if ($amongstdefinitions) {
[7686]261 my $fieldtitle = substr($fdtfileline, 0, 30);
262 my $fieldsubfields = substr($fdtfileline, 30, 20);
263 my $fieldspecs = substr($fdtfileline, 50);
[6107]264
265 # Remove extra spaces
266 $fieldtitle =~ s/(\s*)$//;
267 $fieldsubfields =~ s/(\s*)$//;
268
269 # Map from tag number to metadata field title and subfields
[7686]270 my $fieldtag = (split(/ /, $fieldspecs))[0];
[6107]271 $fdtmapping{$fieldtag} = { 'title' => $fieldtitle,
272 'subfields' => $fieldsubfields };
273 }
274 elsif ($fdtfileline eq "***") {
275 $amongstdefinitions = 1;
276 }
277 }
278
279 close(FDT_FILE);
280
281 return %fdtmapping;
282}
283
284
2851;
Note: See TracBrowser for help on using the repository browser.