Context Navigation

source: trunk/gsdl/perllib/plugins/ISISPlug.pm@ 11353

Last change on this file since 11353 was 11353, checked in by mdewsnip, 18 years ago
Added a bit of code to prevent infinite looping on bad data.
Property svn:keywords set to `Author Date Id Revision`
File size: 11.4 KB

Line
1	###########################################################################
2	#
3	# ISISPlug.pm -- A plugin for CDS/ISIS databases
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 1999-2004 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package ISISPlug;
28
29
30	use multiread;
31	use SplitPlug;
32
33	use strict;
34	no strict 'refs'; # allow filehandles to be variables and viceversa
35
# Inheritance: ISISPlug derives from SplitPlug. The assignment lives in a
# BEGIN block, so it is executed as soon as the block has been compiled.
BEGIN {
    @ISISPlug::ISA = qw(SplitPlug);
}
40
41
# Argument descriptions for this plugin, later appended to the "ArgList"
# passed to the constructor. Each entry is a hash with a name, a description
# key, a type, a required flag and a default value; some also carry a
# 'hiddengli' flag.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name'      => 'block_exp',
        'desc'      => '{BasPlug.block_exp}',
        'type'      => 'regexp',
        'reqd'      => 'no',
        'deft'      => get_default_block_exp(),
        'hiddengli' => 'yes',
    },
    {
        'name'      => 'split_exp',
        'desc'      => '{SplitPlug.split_exp}',
        'type'      => 'regexp',
        'reqd'      => 'no',
        'deft'      => get_default_split_exp(),
        'hiddengli' => 'yes',
    },

    # The ISIS-specific options: the strings used to join repeated entries
    # and subfields when building combined metadata values
    {
        'name' => 'entry_separator',
        'desc' => '{ISISPlug.entry_separator}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => '<br>',
    },
    {
        'name' => 'subfield_separator',
        'desc' => '{ISISPlug.subfield_separator}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => ', ',
    },
];
73
# Plugin description record, later appended to the "OptList" passed to the
# constructor. 'args' points at the argument list declared for this plugin.
my $options = {
    'name'     => 'ISISPlug',
    'desc'     => '{ISISPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
80
81
# Default process expression: file names ending in ".mst", matched
# case-insensitively.
sub get_default_process_exp
{
    my $mst_suffix_pattern = '(?i)(\.mst)$';
    return $mst_suffix_pattern;
}
86
87
# Default block expression: file names ending in ".fdt" or ".xrf", matched
# case-insensitively.
#
# The alternation bar must be unescaped: written as "\|" it would be a
# literal pipe character, and the expression would only match names ending
# in the text ".fdt|.xrf" instead of either suffix.
sub get_default_block_exp
{
    return q^(?i)(\.fdt|\.xrf)$^;
}
92
93
# Default split expression: a line made of ten '-' characters, with either
# "\n" or "\r\n" line endings on both sides.
sub get_default_split_exp
{
    my $separator_line_pattern = '\r?\n----------\r?\n';
    return $separator_line_pattern;
}
98
99
# Constructor.
#
# Arguments: the class name, a reference to the list of plugin names (this
# class is appended to it), the input arguments, and an optional hash
# reference holding the "ArgList" and "OptList" arrays, to which this
# plugin's arguments and options are appended when they are defined.
#
# Returns the object built by SplitPlug's constructor, blessed into $class.
sub new
{
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}) if (defined $arguments);
    push(@{$hashArgOptLists->{"OptList"}}, $options) if (defined $options);

    # The third argument is only handed to the parent when there is one
    my @parent_args = ($pluginlist, $inputargs);
    push(@parent_args, $hashArgOptLists) if (defined $hashArgOptLists);
    my $self = SplitPlug->new(@parent_args);

    return bless $self, $class;
}
113
114
# Exports the records of an ISIS master file (.mst) as text by running the
# external IsisGdl program, and loads the matching field definition table.
#
# Arguments:
#   $filename - path of the .mst file
#   $encoding - encoding handed to the multiread reader and the FDT parser
#   $language - not used by this method
#   $textref  - scalar reference that receives the exported text
#
# Side effects: stores 'fdt_file_path', 'xrf_file_path' and 'fdt_mapping' on
# $self. When the companion .fdt or .xrf file is missing, or the IsisGdl pipe
# cannot be opened, an error is reported and the method returns early without
# touching $$textref.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;
    my $outhandle = $self->{'outhandle'};

    my ($database_file_path_root) = ($filename =~ /(.*)\.mst$/i);
    my $mst_file_path_relative = $filename;
    $mst_file_path_relative =~ s/^.+import.(.*?)$/$1/;

    # Check the associated .fdt and .xrf files exist
    # These files must have a lowercase extension for the IsisGdl program to work
    # Bailing out because of this is not ideal, but it is only an issue on Unix
    $self->{'fdt_file_path'} = $database_file_path_root . ".fdt";
    if (!-e $self->{'fdt_file_path'}) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS FDT file $self->{'fdt_file_path'}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS FDT file " . $self->{'fdt_file_path'} . ".\n";
        return;
    }
    $self->{'xrf_file_path'} = $database_file_path_root . ".xrf";
    if (!-e $self->{'xrf_file_path'}) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS XRF file $self->{'xrf_file_path'}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS XRF file " . $self->{'xrf_file_path'} . ".\n";
        return;
    }

    # The text to split is exported from the database by the IsisGdl program.
    # NOTE(review): the file name is interpolated into a shell command line, so
    # a name containing '"', '$' or '`' will break (or alter) the command. The
    # list form of a piped open would avoid the shell -- confirm it is
    # available on every supported platform before switching.
    if (!open(FILE, "IsisGdl \"$filename\" |")) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not run IsisGdl on $filename'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not run IsisGdl on $filename: $!.\n";
        return;
    }

    # The reader is given the handle by name, so a package filehandle is used
    my $reader = multiread->new();
    $reader->set_handle('ISISPlug::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);

    close(FILE);

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my %fdt_mapping = &parse_field_definition_table($self->{'fdt_file_path'}, $encoding);
    $self->{'fdt_mapping'} = \%fdt_mapping;

    # Remove the separator line at the start so the text is split and
    # processed properly; like the split expression, accept "\r\n" endings
    $$textref =~ s/^----------\r?\n//;
}
158
159
# Turns one exported ISIS record into metadata and text on $doc_obj.
#
# Arguments:
#   $textref - scalar reference to the record text, one "tag=N data=..."
#              line per field
#   $file    - file name, used only in the progress messages
#   $doc_obj - document object; metadata and text are added to its top section
#   $gli     - when true, a <Processing> line is written to STDERR
#   $pluginfo, $base_dir and $metadata are accepted but not used here.
#
# For every field whose tag has a name in the field definition table:
#   Name        - one value per entry, subfields joined with the subfield
#                 separator
#   Name^x      - one value per subfield carrying the code 'x'
#   Name^x^sub  - (non-repeatable fields only) each <...> delimited value
#   Name^all    - all entries joined with the entry separator
# The whole record, with '<' and '>' escaped, becomes the document text, and
# "FileFormat" metadata is set to "CDS/ISIS". Note that $$textref is modified.
#
# Returns 1.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    my $section = $doc_obj->get_top_section();
    my $fdt_mapping = $self->{'fdt_mapping'};
    my $subfield_separator = $self->{'subfield_separator'};
    my $entry_separator = $self->{'entry_separator'};

    # Report that we're processing the file
    print STDERR "\n<Processing n='$file' p='ISISPlug'>\n" if ($gli);
    print $outhandle "IsisPlug: processing $file\n" if ($self->{'verbosity'}) > 1;

    # Process each line of the ISIS record, one at a time
    foreach my $line (split(/\n/, $$textref)) {
        # Skip lines that are not "tag=... data=..." fields; without this check
        # the capture variables of an earlier match would be reused
        next unless ($line =~ /^tag=(.*) data=(.+)$/);
        my $tag = $1;
        my $tag_data = $2;

        # Convert the tag number into a name, and remove any invalid characters
        my $raw_metadata_name = $fdt_mapping->{$tag}{'name'} || "";
        $raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
        next if ($raw_metadata_name eq "");

        # Metadata field names: title case, then remove spaces
        my $metadata_name = "";
        foreach my $word (split(/\s+/, $raw_metadata_name)) {
            substr($word, 0, 1) =~ tr/a-z/A-Z/;
            $metadata_name .= $word;
        }

        # Repeatable fields hold several entries separated by the '%'
        # character; any other field is a single entry. Only non-repeatable
        # fields get their <...> delimited values extracted.
        my $repeatable = $fdt_mapping->{$tag}{'repeatable'};
        my @raw_entries = $repeatable ? split(/%/, $tag_data) : ($tag_data);

        my $all_metadata_value = "";
        foreach my $raw_metadata_value (@raw_entries) {
            my $metadata_value = _add_subfield_metadata($doc_obj, $section, $metadata_name,
                                                        $raw_metadata_value, $subfield_separator,
                                                        !$repeatable);

            # Add the metadata value
            $doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);

            $all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
            $all_metadata_value .= $metadata_value;
        }

        # Add the "^all" metadata value
        $doc_obj->add_utf8_metadata($section, $metadata_name . "^all", $all_metadata_value);
    }

    # Add the full record as the document text
    $$textref = _escape_angle_brackets($$textref);
    $doc_obj->add_utf8_text($section, $$textref);

    # Add FileFormat metadata
    $doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");

    # Record was processed successfully
    return 1;
}


# Splits one field entry into its subfields, adds "Name^x" metadata for every
# subfield that carries a code, and returns the escaped subfield values joined
# with $subfield_separator.
#
# A subfield starts with '^' followed by a lowercase letter; text before the
# first '^' (or after a '^' that is not followed by a lowercase letter) counts
# as a subfield without a code. Subfields are named "Name^x" for repeatable
# and non-repeatable fields alike.
#
# When $extract_bracketed_values is true and a subfield value ends in '>',
# every <...> delimited value in it is also added as "Name^x^sub" metadata.
sub _add_subfield_metadata
{
    my ($doc_obj, $section, $metadata_name, $raw_metadata_value,
        $subfield_separator, $extract_bracketed_values) = @_;

    my $metadata_value = "";
    while ($raw_metadata_value ne "") {
        # If there is a subfield specifier, parse it off. The '^' is removed
        # even when no code follows, so bad data cannot cause an endless loop.
        my $sub_metadata_name = $metadata_name;
        if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
            $sub_metadata_name .= "^$1";
        }

        # Parse the value off (everything up to the next '^'); this pattern
        # always matches, possibly with an empty value
        $raw_metadata_value =~ s/^([^\^]*)//;
        my $sub_metadata_value = $1;

        # Deal with the case when multiple values are specified using <...>
        if ($extract_bracketed_values && $sub_metadata_value =~ /\<(.*)\>$/) {
            my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
            my $remaining_value = $sub_metadata_value;
            while ($remaining_value =~ s/\<(.*?)\>//) {
                my $sub_sub_metadata_value = $1;
                $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
            }
        }

        # Escape any '<' and '>' characters so they appear correctly in the final collection
        $sub_metadata_value = _escape_angle_brackets($sub_metadata_value);

        if ($sub_metadata_name ne $metadata_name) {
            $doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
        }

        $metadata_value .= $subfield_separator unless ($metadata_value eq "");
        $metadata_value .= $sub_metadata_value;
    }

    return $metadata_value;
}


# Returns a copy of $text with '<' and '>' replaced by the HTML entities
# "&lt;" and "&gt;".
sub _escape_angle_brackets
{
    my ($text) = @_;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    return $text;
}
301
302
# Parses an ISIS Field Definition Table (.fdt) file.
#
# Arguments: the path of the .fdt file and the encoding handed to the
# multiread reader.
#
# Returns a hash (as a list) mapping each field tag to a hash with the keys
# 'name', 'subfields' and 'repeatable'. Dies if the file cannot be opened.
#
# Field definitions are the lines following the "***" line. Each one is read
# as fixed-width columns: 30 characters of field name, 20 characters of
# subfield codes, then space-separated values of which the first is the tag
# and the fourth the repeatable flag.
sub parse_field_definition_table
{
    my ($fdtfilename, $encoding) = @_;

    my %fdtmapping = ();

    open(FDT_FILE, '<', $fdtfilename) || die "Error: Could not open file $fdtfilename.\n";

    # The reader is given the handle by its package-qualified name, and a real
    # reference to the scalar it has to fill
    my $fdtfiletext = "";
    my $reader = multiread->new();
    $reader->set_handle(__PACKAGE__ . '::FDT_FILE');
    $reader->set_encoding($encoding);
    $reader->read_file(\$fdtfiletext);

    close(FDT_FILE);

    my $amongstdefinitions = 0;
    foreach my $fdtfileline (split(/\n/, $fdtfiletext)) {
        $fdtfileline =~ s/(\s*)$//;  # Remove any nasty spaces at the end of the lines

        if (!$amongstdefinitions) {
            $amongstdefinitions = 1 if ($fdtfileline eq "***");
            next;
        }

        # A definition needs something in the third column; blank or
        # truncated lines would otherwise be stored under an undefined tag
        next if (length($fdtfileline) <= 50);

        my $fieldname = substr($fdtfileline, 0, 30);
        my $fieldsubfields = substr($fdtfileline, 30, 20);
        my $fieldspecs = substr($fdtfileline, 50);

        # Remove extra spaces
        $fieldname =~ s/(\s*)$//;
        $fieldsubfields =~ s/(\s*)$//;
        $fieldspecs =~ s/(\s*)$//;

        # Map from tag number to metadata field title, subfields, and repeatability
        my ($fieldtag, $fieldrepeatable) = (split(/ /, $fieldspecs))[0, 3];
        next unless (defined $fieldtag && $fieldtag ne "");

        $fdtmapping{$fieldtag} = { 'name' => $fieldname,
                                   'subfields' => $fieldsubfields,
                                   'repeatable' => $fieldrepeatable };
    }

    return %fdtmapping;
}
348
349
# Removes the .fdt and .xrf files recorded on the object, using util::rm.
# The FDT file is removed first; the result of the second removal is returned.
sub clean_up_after_exploding
{
    my ($self) = @_;

    my ($fdt_path, $xrf_path) = @{$self}{'fdt_file_path', 'xrf_file_path'};
    util::rm($fdt_path);
    return util::rm($xrf_path);
}
358
359
360	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: