source: trunk/gsdl/perllib/plugins/ISISPlug.pm@ 7243

Last change on this file since 7243 was 7049, checked in by mdewsnip, 20 years ago

Reduced the error messages just added because they would be confusing on Windows.

  • Property svn:keywords set to Author Date Id Revision
File size: 8.0 KB
Line 
1###########################################################################
2#
3# ISISPlug.pm -- A plugin for CDS/ISIS databases
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 1999-2003 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package ISISPlug;
28
29
30use multiread;
31use SplitPlug;
32
33
34# ISISPlug is a sub-class of SplitPlug.
35sub BEGIN {
36 @ISA = ('SplitPlug');
37}
38
39
40my $arguments =
41 [ { 'name' => "entry_separator",
42 'desc' => "{ISISPlug.entry_separator}",
43 'type' => "string",
44 'reqd' => "no",
45 'deft' => "<br>" },
46 { 'name' => "process_exp",
47 'desc' => "{BasPlug.process_exp}",
48 'type' => "regexp",
49 'reqd' => "no",
50 'deft' => &get_default_process_exp() },
51 { 'name' => "block_exp",
52 'desc' => "{BasPlug.block_exp}",
53 'type' => "regexp",
54 'deft' => &get_default_block_exp() },
55 { 'name' => "subfield_separator",
56 'desc' => "{ISISPlug.subfield_separator}",
57 'type' => "string",
58 'reqd' => "no",
59 'deft' => ", " },
60 { 'name' => "split_exp",
61 'desc' => "{SplitPlug.split_exp}",
62 'type' => "regexp",
63 'deft' => &get_default_split_exp(),
64 'reqd' => "no" }
65 ];
66
67my $options = { 'name' => "ISISPlug",
68 'desc' => "{ISISPlug.desc}",
69 'abstract' => "no",
70 'inherits' => "yes",
71 'args' => $arguments };
72
73
74# This plugin processes files with the suffix ".mst"
75sub get_default_process_exp {
76 return q^(?i)(\.mst)$^;
77}
78
79
80# This plugin blocks files with the suffix ".fdt" and ".xrf"
81sub get_default_block_exp {
82 return q^(?i)(\.fdt|\.xrf)$^;
83}
84
85
86# This plugin splits the input text at the "----------" lines
87sub get_default_split_exp {
88 return q^\n----------\n^;
89}
90
91
92sub new {
93 my $class = shift(@_);
94
95 my $self = new SplitPlug($class, @_);
96 if (!parsargv::parse(\@_,
97 q^subfield_separator/.*/, ^, \$self->{'subfield_separator'},
98 q^entry_separator/.*/<br>^, \$self->{'entry_separator'},
99 "allow_extra_options")) {
100 print STDERR "\nIncorrect options passed to ISISPlug, check your collect.cfg configuration file\n";
101 die "\n";
102 }
103
104 # To allow for proper inheritance of arguments
105 my $option_list = $self->{'option_list'};
106 push(@{$option_list}, $options);
107 $self->{'plugin_type'} = "ISISPlug";
108
109 return bless $self, $class;
110}
111
112
113sub read_file {
114 my $self = shift (@_);
115 my ($filename, $encoding, $language, $textref) = @_;
116
117 my ($databasename) = ($filename =~ /([^\.]+)\.mst$/i);
118
119 # Check the associated .fdt and .xrf files exist
120 # These files must have a lowercase extension for the IsisGdl program to work
121 # Bailing out because of this is kind of crappy but it is only an issue on Unix
122 my $fdtfilename = $databasename . ".fdt";
123 if (! -e $fdtfilename) {
124 die "Error: Could not find ISIS FDT file $fdtfilename.\n";
125 }
126 my $xrffilename = $databasename . ".xrf";
127 if (! -e $xrffilename) {
128 die "Error: Could not find ISIS XRF file $xrffilename.\n";
129 }
130
131 # The text to split is exported from the database by the IsisGdl program
132 open(FILE, "IsisGdl \"$filename\" |");
133
134 my $reader = new multiread();
135 $reader->set_handle ('ISISPlug::FILE');
136 $reader->set_encoding ($encoding);
137 $reader->read_file ($textref);
138
139 close(FILE);
140
141 # Parse the associated ISIS database Field Definition Table file (.fdt)
142 my %fdtmapping = &parse_field_definition_table($fdtfilename);
143
144 # Map the tag numbers to tag names, using the FDT mapping
145 $$textref =~ s/\ntag=(\d+) /\ntag=$fdtmapping{$1}{'title'} /g;
146
147 # Add a newline at the start so it is split properly
148 $$textref = "\n" . $$textref;
149}
150
151
152sub process
153{
154 my $self = shift (@_);
155 my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
156 my $outhandle = $self->{'outhandle'};
157
158 my $subfield_separator = $self->{'subfield_separator'};
159 my $entry_separator = $self->{'entry_separator'};
160
161 # Report that we're processing the file
162 print STDERR "<Processing n='$file' p='ISISPlug'>\n" if ($gli);
163 print $outhandle "IsisPlug: processing $file\n"
164 if ($self->{'verbosity'}) > 1;
165
166 # Process each line of the ISIS record, one at a time
167 foreach $line (split(/\n/, $$textref)) {
168 $line =~ /^tag=(.+) data=(.+)$/;
169 local $rawtagname = $1;
170 local $rawtagdata = $2;
171 # print "Raw tag: $rawtagname, Raw data: $rawtagdata\n";
172
173 # Metadata field names: title case, then remove spaces
174 local $tagname = "";
175 foreach $word (split(/\s+/, $rawtagname)) {
176 substr($word, 0, 1) =~ tr/a-z/A-Z/;
177 $tagname .= $word;
178 }
179
180 # Make sure there is nothing bad in the tag names
181 $tagname =~ s/&//g;
182
183 # Handle each piece of metadata ('%' separated)
184 local $completetagvalue = "";
185 foreach $rawtagvalue (split(/%/, $rawtagdata)) {
186 $completetagvalue .= $entry_separator unless ($completetagvalue eq "");
187
188 # Metadata field values: take care with subfields
189 local $completeentryvalue = "";
190 while ($rawtagvalue ne "") {
191 # If there is a subfield specifier, parse it off
192 local $subfieldname = "";
193 if ($rawtagvalue =~ s/^\^([a-z])//) {
194 $subfieldname = "." . $1;
195 }
196
197 # Parse the metadata value off
198 $rawtagvalue =~ s/^([^\^]*)//;
199 local $metadatafieldname = $tagname . $subfieldname;
200 local $metadatafieldvalue = $1;
201 # print "Metadata: $metadatafieldname -> $metadatafieldvalue\n";
202
203 # Handle Keywords specially
204 if ($metadatafieldname eq "Keywords") {
205 local $keywordmetadatavalue = $metadatafieldvalue;
206 local $keywordlist = "";
207 while ($keywordmetadatavalue =~ s/\<([^\>]+)\>//) {
208 local $keyword = $1;
209 $doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $keyword);
210 $keywordlist .= ", " unless ($keywordlist eq "");
211 $keywordlist .= $keyword;
212 }
213
214 $metadatafieldvalue = $keywordlist;
215 }
216
217 else {
218 $doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $metadatafieldvalue);
219 }
220
221 $completeentryvalue .= $subfield_separator unless ($completeentryvalue eq "");
222 $completeentryvalue .= $metadatafieldvalue;
223 }
224
225 $completetagvalue .= $completeentryvalue;
226 }
227
228 # print "Metadata: $tagname.all -> $completetagvalue\n";
229 $doc_obj->add_utf8_metadata($cursection, $tagname . ".all", $completetagvalue);
230 }
231 # print "\n";
232
233 # Add the full record as the document text
234 $$textref =~ s/\</&lt;/g;
235 $$textref =~ s/\>/&gt;/g;
236 $doc_obj->add_utf8_text ($cursection, $$textref);
237
238 # Document was processed successfully
239 return 1;
240}
241
242
243sub parse_field_definition_table
244{
245 local $fdtfilename = shift(@_);
246
247 local %fdtmapping = ();
248
249 open(FDT_FILE, "<$fdtfilename") || die "Error: Could not open file $fdtfilename.\n";
250
251 local $amongstdefinitions = 0;
252 foreach $fdtfileline (<FDT_FILE>) {
253 $fdtfileline =~ s/(\s*)$//; # Remove any nasty spaces at the end of the lines
254
255 if ($amongstdefinitions) {
256 local $fieldtitle = substr($fdtfileline, 0, 30);
257 local $fieldsubfields = substr($fdtfileline, 30, 20);
258 local $fieldspecs = substr($fdtfileline, 50);
259
260 # Remove extra spaces
261 $fieldtitle =~ s/(\s*)$//;
262 $fieldsubfields =~ s/(\s*)$//;
263
264 # Map from tag number to metadata field title and subfields
265 local $fieldtag = (split(/ /, $fieldspecs))[0];
266 $fdtmapping{$fieldtag} = { 'title' => $fieldtitle,
267 'subfields' => $fieldsubfields };
268 }
269 elsif ($fdtfileline eq "***") {
270 $amongstdefinitions = 1;
271 }
272 }
273
274 close(FDT_FILE);
275
276 return %fdtmapping;
277}
278
279
2801;
Note: See TracBrowser for help on using the repository browser.