source: trunk/gsdl/perllib/plugins/ISISPlug.pm@ 11090

Last change on this file since 11090 was 10254, checked in by kjdon, 19 years ago

added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile

  • Property svn:keywords set to Author Date Id Revision
File size: 8.4 KB
Line 
1###########################################################################
2#
3# ISISPlug.pm -- A plugin for CDS/ISIS databases
4#
5# A component of the Greenstone digital library software
6# from the New Zealand Digital Library Project at the
7# University of Waikato, New Zealand.
8#
9# Copyright 1999-2004 New Zealand Digital Library Project
10#
11# This program is free software; you can redistribute it and/or modify
12# it under the terms of the GNU General Public License as published by
13# the Free Software Foundation; either version 2 of the License, or
14# (at your option) any later version.
15#
16# This program is distributed in the hope that it will be useful,
17# but WITHOUT ANY WARRANTY; without even the implied warranty of
18# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19# GNU General Public License for more details.
20#
21# You should have received a copy of the GNU General Public License
22# along with this program; if not, write to the Free Software
23# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24#
25###########################################################################
26
27package ISISPlug;
28
29
30use multiread;
31use SplitPlug;
32
33use strict;
34no strict 'refs'; # allow filehandles to be variables and viceversa
35
# Inheritance: ISISPlug derives from SplitPlug.  The @ISA assignment sits in
# a BEGIN block so it takes effect while the file is still being compiled.
BEGIN {
    @ISISPlug::ISA = qw(SplitPlug);
}
40
41
# Argument table for this plugin.  Each entry is a hash with the argument's
# 'name', 'desc', 'type', 'reqd' and 'deft' (default value) fields.
my $arguments = [
    # Expressions whose defaults come from the get_default_*_exp subs
    {
        'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name' => 'block_exp',
        'desc' => '{BasPlug.block_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_block_exp(),
    },
    {
        'name' => 'split_exp',
        'desc' => '{SplitPlug.split_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_split_exp(),
    },

    # The interesting options
    {
        'name' => 'entry_separator',
        'desc' => '{ISISPlug.entry_separator}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => '<br>',
    },
    {
        'name' => 'subfield_separator',
        'desc' => '{ISISPlug.subfield_separator}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => ', ',
    },
];
71
# Plugin description record; 'args' points at the argument table above.
my $options = {
    'name'     => 'ISISPlug',
    'desc'     => '{ISISPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
78
79
# Default process_exp: file names ending in ".mst" (case-insensitive) are
# the ones this plugin processes.
sub get_default_process_exp {
    my $mst_suffix_exp = q^(?i)(\.mst)$^;
    return $mst_suffix_exp;
}
84
85
# Default block_exp: file names ending in ".fdt" or ".xrf" (case-insensitive)
# are blocked by this plugin.
sub get_default_block_exp {
    my $companion_suffix_exp = q^(?i)(\.fdt|\.xrf)$^;
    return $companion_suffix_exp;
}
90
91
# Default split_exp: the input text is split at lines consisting of
# "----------" (with either LF or CRLF line endings).
sub get_default_split_exp {
    my $record_separator_exp = q^\r?\n----------\r?\n^;
    return $record_separator_exp;
}
96
97
# Constructor.
#
#   $pluginlist      - array ref; this class's name is appended to it
#   $inputargs       - handed on unchanged to the SplitPlug constructor
#   $hashArgOptLists - optional hash ref; this plugin's argument table and
#                      option record are appended under its "ArgList" and
#                      "OptList" keys before it is handed to SplitPlug
#
# Returns the object built by the SplitPlug constructor, blessed into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    # Pushing through an undefined $hashArgOptLists would autovivify it, so
    # the three-argument SplitPlug constructor always ends up being used.
    # Create the hash explicitly rather than relying on that side effect.
    $hashArgOptLists = {} unless (defined $hashArgOptLists);

    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # Arrow call rather than "new SplitPlug(...)": indirect object syntax is
    # ambiguous to the parser, all the more so inside a package that defines
    # its own new().
    my $self = SplitPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
111
112
# Export a CDS/ISIS database as text and leave it in $$textref, ready to be
# split into records.
#
#   $filename - path of the ISIS master file; must end in ".mst" (any case)
#   $encoding - encoding given to the multiread reader
#   $language - accepted but not used here
#   $textref  - scalar ref that receives the exported text
#
# The text is produced by running the external IsisGdl program on $filename.
# Tag numbers in its output are then replaced by the field titles found in
# the database's .fdt file, and a leading "----------" line is removed.
#
# Dies if $filename does not end in ".mst", if the matching .fdt or .xrf
# file is missing, or if IsisGdl cannot be started.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    my ($databasename) = ($filename =~ /(.*)\.mst$/i);
    if (!defined $databasename) {
        # Without this check the code below would go looking for ".fdt"
        # and report a misleading "could not find" error.
        die "Error: $filename is not an ISIS master (.mst) file.\n";
    }

    # Check the associated .fdt and .xrf files exist
    # These files must have a lowercase extension for the IsisGdl program to work
    # Bailing out because of this is kind of crappy but it is only an issue on Unix
    my $fdtfilename = $databasename . ".fdt";
    if (! -e $fdtfilename) {
        die "Error: Could not find ISIS FDT file $fdtfilename.\n";
    }
    my $xrffilename = $databasename . ".xrf";
    if (! -e $xrffilename) {
        die "Error: Could not find ISIS XRF file $xrffilename.\n";
    }

    # The text to split is exported from the database by the IsisGdl program.
    # NOTE(review): $filename is interpolated into a shell command line; a
    # name containing '"', '$' or '`' would be interpreted by the shell.
    # The bareword handle is kept because multiread is given the handle by
    # name below.
    open(FILE, "IsisGdl \"$filename\" |")
        || die "Error: Could not run IsisGdl on $filename: $!\n";

    my $reader = multiread->new();
    $reader->set_handle('ISISPlug::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);

    # close() on a pipe returns false when the child process failed; report
    # it instead of silently continuing with whatever text was read.
    if (!close(FILE)) {
        print STDERR "Warning: IsisGdl did not finish cleanly for $filename.\n";
    }

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my %fdtmapping = parse_field_definition_table($fdtfilename);

    # Map the tag numbers to tag names, using the FDT mapping.  A tag number
    # that is not in the FDT is replaced by an empty name; process() ignores
    # lines whose tag name is empty.
    $$textref =~ s/\r?\ntag=(\d+) /\ntag=$fdtmapping{$1}{'title'} /g;

    # Remove the line at the start so it is split and processed properly
    $$textref =~ s/^----------\n//;
}
151
152
# Convert one exported ISIS record into metadata and text on $doc_obj.
#
#   $textref  - scalar ref holding the record, one "tag=NAME data=VALUE"
#               entry per line; '<' and '>' in it are escaped in place
#   $pluginfo, $base_dir, $metadata - accepted but not used here
#   $file     - file name, used only in progress messages
#   $doc_obj  - object receiving the metadata and text; it must provide
#               get_top_section, add_utf8_metadata and add_utf8_text
#   $gli      - when true, a <Processing ...> line is printed to STDERR
#
# For every tag line the tag name becomes the metadata field name (each
# word capitalised, whitespace and '&' removed).  The data is split on '%'
# into entries and each entry on '^x' subfield markers; every piece is
# added as "Name" or "Name^x" metadata.  All pieces of a tag are also joined
# (subfield_separator within an entry, entry_separator between entries) and
# added as "Name^all".  Keywords values are split on <...> groups instead.
#
# Always returns 1.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    my $cursection = $doc_obj->get_top_section();
    my $subfield_separator = $self->{'subfield_separator'};
    my $entry_separator = $self->{'entry_separator'};

    # Report that we're processing the file
    print STDERR "<Processing n='$file' p='ISISPlug'>\n" if ($gli);
    print $outhandle "IsisPlug: processing $file\n"
        if ($self->{'verbosity'}) > 1;

    # Process each line of the ISIS record, one at a time
    foreach my $line (split(/\n/, $$textref)) {
        # Only use the captures when the match succeeded; after a failed
        # match $1 and $2 would still hold values from an earlier match.
        next unless ($line =~ /^tag=(.*) data=(.+)$/);
        my ($rawtagname, $rawtagdata) = ($1, $2);
        next if ($rawtagname eq "");

        # Metadata field names: title case, then remove spaces
        my $tagname = "";
        foreach my $word (split(/\s+/, $rawtagname)) {
            substr($word, 0, 1) =~ tr/a-z/A-Z/;
            $tagname .= $word;
        }

        # Make sure there is nothing bad in the tag names
        $tagname =~ s/&//g;

        # Handle each piece of metadata ('%' separated)
        my $completetagvalue = "";
        foreach my $rawtagvalue (split(/%/, $rawtagdata)) {
            $completetagvalue .= $entry_separator unless ($completetagvalue eq "");

            # Metadata field values: take care with subfields
            my $completeentryvalue = "";
            while ($rawtagvalue ne "") {
                # If there is a subfield specifier, parse it off.  The caret
                # is always consumed so that the loop makes progress; the
                # subfield letter must directly follow it (anchored match),
                # otherwise a letter from the middle of the value would be
                # removed and used as the subfield name.
                my $subfieldname = "";
                if ($rawtagvalue =~ s/^\^//) {
                    if ($rawtagvalue =~ s/^([a-z])//) {
                        $subfieldname = "^$1";
                    }
                }

                # Parse the metadata value off (everything up to the next
                # caret); this pattern can match the empty string, so it
                # always succeeds and $1 is always set here.
                $rawtagvalue =~ s/^([^\^]*)//;
                my $metadatafieldvalue = $1;
                my $metadatafieldname = $tagname . $subfieldname;

                # Handle Keywords specially
                if ($metadatafieldname eq "Keywords") {
                    my $keywordmetadatavalue = $metadatafieldvalue;
                    my $keywordlist = "";
                    while ($keywordmetadatavalue =~ s/\<(.+?)\>//) {
                        my $keyword = $1;
                        $doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $keyword);
                        $keywordlist .= ", " unless ($keywordlist eq "");
                        $keywordlist .= $keyword;
                    }

                    $metadatafieldvalue = $keywordlist;
                }

                # Escape any '<' and '>' characters so they appear correctly in the final collection
                $metadatafieldvalue =~ s/\</&lt;/g;
                $metadatafieldvalue =~ s/\>/&gt;/g;

                # We have already added Keywords metadata above
                unless ($metadatafieldname eq "Keywords") {
                    $doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $metadatafieldvalue);
                }

                $completeentryvalue .= $subfield_separator unless ($completeentryvalue eq "");
                $completeentryvalue .= $metadatafieldvalue;
            }

            $completetagvalue .= $completeentryvalue;
        }

        $doc_obj->add_utf8_metadata($cursection, $tagname . "^all", $completetagvalue);
    }

    # Add the full record as the document text
    $$textref =~ s/\</&lt;/g;
    $$textref =~ s/\>/&gt;/g;
    $doc_obj->add_utf8_text($cursection, $$textref);

    # Add FileFormat metadata
    $doc_obj->add_utf8_metadata($cursection, "FileFormat", "CDS/ISIS");

    # Record was processed successfully (and there was no document obtained)
    return 1;
}
249
250
# Read an ISIS Field Definition Table (.fdt) file.
#
#   $fdtfilename - path of the .fdt file
#
# Everything up to and including the "***" line is skipped.  Each later line
# is read as fixed-width columns: field title (columns 0-29), subfields
# (columns 30-49) and, from column 50 on, whitespace-separated specs whose
# first item is the field's tag number.  Lines too short to reach the spec
# column (blank lines, for instance) are ignored.
#
# Returns a hash (as a list) mapping tag number to
# { 'title' => ..., 'subfields' => ... }.  Dies if the file cannot be opened.
sub parse_field_definition_table
{
    my $fdtfilename = shift(@_);

    my %fdtmapping = ();

    # Three-argument open with a lexical handle: the mode can no longer be
    # affected by odd characters in the file name.
    open(my $fdtfile, "<", $fdtfilename)
        || die "Error: Could not open file $fdtfilename: $!\n";

    my $amongstdefinitions = 0;
    while (defined(my $fdtfileline = <$fdtfile>)) {
        $fdtfileline =~ s/\s+$//;  # Remove any nasty spaces at the end of the lines

        if (!$amongstdefinitions) {
            # Definitions start after the "***" marker line
            $amongstdefinitions = 1 if ($fdtfileline eq "***");
            next;
        }

        # No spec column means no tag number; skipping avoids storing an
        # entry under the empty key.
        next if (length($fdtfileline) <= 50);

        my $fieldtitle = substr($fdtfileline, 0, 30);
        my $fieldsubfields = substr($fdtfileline, 30, 20);
        my $fieldspecs = substr($fdtfileline, 50);

        # Remove extra spaces
        $fieldtitle =~ s/\s+$//;
        $fieldsubfields =~ s/\s+$//;

        # Map from tag number to metadata field title and subfields.
        # split(' ') ignores leading whitespace, so a spec column that
        # starts with a space still yields the tag number.
        my ($fieldtag) = split(' ', $fieldspecs);
        $fdtmapping{$fieldtag} = { 'title' => $fieldtitle,
                                   'subfields' => $fieldsubfields };
    }

    close($fdtfile);

    return %fdtmapping;
}
286
287
2881;
Note: See TracBrowser for help on using the repository browser.