Context Navigation

source: trunk/gsdl/perllib/plugins/ISISPlug.pm@ 11334

Last change on this file since 11334 was 11334, checked in by mdewsnip, 18 years ago
Errors reading the .fdt or .xrf files are now reported back to the GLI nicely.
Property svn:keywords set to `Author Date Id Revision`
File size: 11.3 KB

Line
1	###########################################################################
2	#
3	# ISISPlug.pm -- A plugin for CDS/ISIS databases
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 1999-2004 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package ISISPlug;
28
29
30	use multiread;
31	use SplitPlug;
32
33	use strict;
34	no strict 'refs'; # allow filehandles to be variables and viceversa
35
36	# ISISPlug is a sub-class of SplitPlug.
37	sub BEGIN {
38	@ISISPlug::ISA = ('SplitPlug');
39	}
40
41
42	my $arguments =
43	[ { 'name' => "process_exp",
44	'desc' => "{BasPlug.process_exp}",
45	'type' => "regexp",
46	'reqd' => "no",
47	'deft' => &get_default_process_exp() },
48	{ 'name' => "block_exp",
49	'desc' => "{BasPlug.block_exp}",
50	'type' => "regexp",
51	'reqd' => "no",
52	'deft' => &get_default_block_exp(),
53	'hiddengli' => "yes" },
54	{ 'name' => "split_exp",
55	'desc' => "{SplitPlug.split_exp}",
56	'type' => "regexp",
57	'reqd' => "no",
58	'deft' => &get_default_split_exp(),
59	'hiddengli' => "yes" },
60
61	# The interesting options
62	{ 'name' => "entry_separator",
63	'desc' => "{ISISPlug.entry_separator}",
64	'type' => "string",
65	'reqd' => "no",
66	'deft' => "<br>" },
67	{ 'name' => "subfield_separator",
68	'desc' => "{ISISPlug.subfield_separator}",
69	'type' => "string",
70	'reqd' => "no",
71	'deft' => ", " }
72	];
73
74	my $options = { 'name' => "ISISPlug",
75	'desc' => "{ISISPlug.desc}",
76	'abstract' => "no",
77	'inherits' => "yes",
78	'explodes' => "yes",
79	'args' => $arguments };
80
81
82	# This plugin processes files with the suffix ".mst"
83	sub get_default_process_exp {
84	return q^(?i)(\.mst)$^;
85	}
86
87
88	# This plugin blocks files with the suffix ".fdt" and ".xrf"
89	sub get_default_block_exp {
90	return q^(?i)(\.fdt\|\.xrf)$^;
91	}
92
93
94	# This plugin splits the input text at the "----------" lines
95	sub get_default_split_exp {
96	return q^\r?\n----------\r?\n^;
97	}
98
99
100	sub new
101	{
102	my ($class) = shift (@_);
103	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
104	push(@$pluginlist, $class);
105
106	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
107	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
108
109	my $self = (defined $hashArgOptLists)? new SplitPlug($pluginlist,$inputargs,$hashArgOptLists): new SplitPlug($pluginlist,$inputargs);
110
111	return bless $self, $class;
112	}
113
114
115	sub read_file
116	{
117	my $self = shift (@_);
118	my ($filename, $encoding, $language, $textref) = @_;
119	my $outhandle = $self->{'outhandle'};
120
121	my ($database_file_path_root) = ($filename =~ /(.*)\.mst$/i);
122	my $mst_file_path_relative = $filename;
123	$mst_file_path_relative =~ s/^.+import.(.*?)$/$1/;
124
125	# Check the associated .fdt and .xrf files exist
126	# These files must have a lowercase extension for the IsisGdl program to work
127	# Bailing out because of this is kind of crappy but it is only an issue on Unix
128	$self->{'fdt_file_path'} = $database_file_path_root . ".fdt";
129	if (!-e $self->{'fdt_file_path'}) {
130	print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS FDT file $self->{'fdt_file_path'}'>\n" if ($self->{'gli'});
131	print $outhandle "Error: Could not find ISIS FDT file " . $self->{'fdt_file_path'} . ".\n";
132	return;
133	}
134	$self->{'xrf_file_path'} = $database_file_path_root . ".xrf";
135	if (!-e $self->{'xrf_file_path'}) {
136	print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS XRF file $self->{'xrf_file_path'}'>\n" if ($self->{'gli'});
137	print $outhandle "Error: Could not find ISIS XRF file " . $self->{'xrf_file_path'} . ".\n";
138	return;
139	}
140
141	# The text to split is exported from the database by the IsisGdl program
142	open(FILE, "IsisGdl \"$filename\" \|");
143
144	my $reader = new multiread();
145	$reader->set_handle('ISISPlug::FILE');
146	$reader->set_encoding($encoding);
147	$reader->read_file($textref);
148
149	close(FILE);
150
151	# Parse the associated ISIS database Field Definition Table file (.fdt)
152	my %fdt_mapping = &parse_field_definition_table($self->{'fdt_file_path'}, $encoding);
153	$self->{'fdt_mapping'} = \%fdt_mapping;
154
155	# Remove the line at the start so it is split and processed properly
156	$$textref =~ s/^----------\n//;
157	}
158
159
160	sub process
161	{
162	my $self = shift (@_);
163	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
164	my $outhandle = $self->{'outhandle'};
165
166	my $section = $doc_obj->get_top_section();
167	my $fdt_mapping = $self->{'fdt_mapping'};
168	my $subfield_separator = $self->{'subfield_separator'};
169	my $entry_separator = $self->{'entry_separator'};
170
171	# Report that we're processing the file
172	print STDERR "<Processing n='$file' p='ISISPlug'>\n" if ($gli);
173	print $outhandle "IsisPlug: processing $file\n" if ($self->{'verbosity'}) > 1;
174
175	# Process each line of the ISIS record, one at a time
176	foreach my $line (split(/\n/, $$textref)) {
177	$line =~ /^tag=(.*) data=(.+)$/;
178	my $tag = $1;
179	my $tag_data = $2;
180	# print STDERR "\nTag: $tag, Data: $tag_data\n";
181
182	# Convert the tag number into a name, and remove any invalid characters
183	my $raw_metadata_name = $fdt_mapping->{$tag}{'name'} \|\| "";
184	$raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
185	next if ($raw_metadata_name eq "");
186
187	# Metadata field names: title case, then remove spaces
188	my $metadata_name = "";
189	foreach my $word (split(/\s+/, $raw_metadata_name)) {
190	substr($word, 0, 1) =~ tr/a-z/A-Z/;
191	$metadata_name .= $word;
192	}
193
194	my $all_metadata_name = $metadata_name . "^all";
195	my $all_metadata_value = "";
196
197	# Handle repeatable fields
198	if ($fdt_mapping->{$tag}{'repeatable'}) {
199	# Multiple values are separated using the '%' character
200	foreach my $raw_metadata_value (split(/%/, $tag_data)) {
201	my $metadata_value = "";
202
203	# Handle subfields
204	while ($raw_metadata_value ne "") {
205	# If there is a subfield specifier, parse it off
206	my $sub_metadata_name = $metadata_name;
207	if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
208	$sub_metadata_name .= "^$1";
209	}
210
211	# Parse the value off and add it as metadata
212	$raw_metadata_value =~ s/^([^\^]*)//;
213	my $sub_metadata_value = $1;
214
215	# Escape any '<' and '>' characters so they appear correctly in the final collection
216	$sub_metadata_value =~ s/\</</g;
217	$sub_metadata_value =~ s/\>/>/g;
218
219	# print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
220	if ($sub_metadata_name ne $metadata_name) {
221	$doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
222	}
223
224	$metadata_value .= $subfield_separator unless ($metadata_value eq "");
225	$metadata_value .= $sub_metadata_value;
226	}
227
228	# Add the metadata value
229	# print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
230	$doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);
231
232	$all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
233	$all_metadata_value .= $metadata_value;
234	}
235	}
236
237	# Handle non-repeatable fields
238	else {
239	my $raw_metadata_value = $tag_data;
240	my $metadata_value = "";
241
242	# Handle subfields
243	while ($raw_metadata_value ne "") {
244	# If there is a subfield specifier, parse it off
245	my $sub_metadata_name = $metadata_name;
246	if ($raw_metadata_value =~ s/^(\^[a-z])//) {
247	$sub_metadata_name .= $1;
248	}
249
250	# Parse the value off and add it as metadata
251	$raw_metadata_value =~ s/^([^\^]*)//;
252	my $sub_metadata_value = $1;
253
254	# Deal with the case when multiple values are specified using <...>
255	if ($sub_metadata_value =~ /\<(.*)\>$/) {
256	my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
257	my $tmp_sub_metadata_value = $sub_metadata_value;
258	while ($tmp_sub_metadata_value =~ s/\<(.*?)\>//) {
259	my $sub_sub_metadata_value = $1;
260	$doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
261	}
262	}
263
264	# Escape any '<' and '>' characters so they appear correctly in the final collection
265	$sub_metadata_value =~ s/\</</g;
266	$sub_metadata_value =~ s/\>/>/g;
267
268	# print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
269	if ($sub_metadata_name ne $metadata_name) {
270	$doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
271	}
272
273	$metadata_value .= $subfield_separator unless ($metadata_value eq "");
274	$metadata_value .= $sub_metadata_value;
275	}
276
277	# Add the metadata value
278	# print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
279	$doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);
280
281	$all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
282	$all_metadata_value .= $metadata_value;
283	}
284
285	# Add the "^all" metadata value
286	# print STDERR "All metadata name: $all_metadata_name, value: $all_metadata_value\n";
287	$doc_obj->add_utf8_metadata($section, $all_metadata_name, $all_metadata_value);
288	}
289
290	# Add the full record as the document text
291	$$textref =~ s/\</</g;
292	$$textref =~ s/\>/>/g;
293	$doc_obj->add_utf8_text($section, $$textref);
294
295	# Add FileFormat metadata
296	$doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");
297
298	# Record was processed successfully
299	return 1;
300	}
301
302
303	sub parse_field_definition_table
304	{
305	my $fdtfilename = shift(@_);
306	my $encoding = shift(@_);
307
308	my %fdtmapping = ();
309
310	open(FDT_FILE, "<$fdtfilename") \|\| die "Error: Could not open file $fdtfilename.\n";
311
312	my $fdtfiletext = "";
313	my $reader = new multiread();
314	$reader->set_handle('ISISPlug::FDT_FILE');
315	$reader->set_encoding($encoding);
316	$reader->read_file($fdtfiletext);
317
318	my $amongstdefinitions = 0;
319	foreach my $fdtfileline (split(/\n/, $$fdtfiletext)) {
320	$fdtfileline =~ s/(\s*)$//; # Remove any nasty spaces at the end of the lines
321
322	if ($amongstdefinitions) {
323	my $fieldname = substr($fdtfileline, 0, 30);
324	my $fieldsubfields = substr($fdtfileline, 30, 20);
325	my $fieldspecs = substr($fdtfileline, 50);
326
327	# Remove extra spaces
328	$fieldname =~ s/(\s*)$//;
329	$fieldsubfields =~ s/(\s*)$//;
330	$fieldspecs =~ s/(\s*)$//;
331
332	# Map from tag number to metadata field title, subfields, and repeatability
333	my $fieldtag = (split(/ /, $fieldspecs))[0];
334	my $fieldrepeatable = (split(/ /, $fieldspecs))[3];
335	$fdtmapping{$fieldtag} = { 'name' => $fieldname,
336	'subfields' => $fieldsubfields,
337	'repeatable' => $fieldrepeatable };
338	}
339	elsif ($fdtfileline eq "***") {
340	$amongstdefinitions = 1;
341	}
342	}
343
344	close(FDT_FILE);
345
346	return %fdtmapping;
347	}
348
349
350	sub clean_up_after_exploding
351	{
352	my $self = shift(@_);
353
354	# Delete the FDT and XRF files too
355	&util::rm($self->{'fdt_file_path'});
356	&util::rm($self->{'xrf_file_path'});
357	}
358
359
360	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: