Context Navigation

source: trunk/gsdl/perllib/plugins/ISISPlug.pm@ 11332

Last change on this file since 11332 was 11332, checked in by mdewsnip, 18 years ago
Added a mechanism for plugins to do tidying up after exploding. ISISPlug uses this to delete the associated .fdt and .xrf files.
Property svn:keywords set to `Author Date Id Revision`
File size: 10.8 KB

Line
1	###########################################################################
2	#
3	# ISISPlug.pm -- A plugin for CDS/ISIS databases
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 1999-2004 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package ISISPlug;
28
29
30	use multiread;
31	use SplitPlug;
32
33	use strict;
34	no strict 'refs'; # allow filehandles to be variables and viceversa
35
# Inheritance: ISISPlug derives from SplitPlug. The parent list is assigned
# inside a BEGIN block, so it is set as soon as this block has been compiled.
BEGIN {
    @ISISPlug::ISA = qw(SplitPlug);
}
40
41
# Descriptors for the command-line options this plugin contributes.
# Each entry is a hash with the option name, a description key, its type,
# whether it is required, and its default value.
my $arguments = [
    {
        name => 'process_exp',
        desc => '{BasPlug.process_exp}',
        type => 'regexp',
        reqd => 'no',
        deft => get_default_process_exp(),
    },
    {
        name      => 'block_exp',
        desc      => '{BasPlug.block_exp}',
        type      => 'regexp',
        reqd      => 'no',
        deft      => get_default_block_exp(),
        hiddengli => 'yes',
    },
    {
        name      => 'split_exp',
        desc      => '{SplitPlug.split_exp}',
        type      => 'regexp',
        reqd      => 'no',
        deft      => get_default_split_exp(),
        hiddengli => 'yes',
    },

    # The interesting options
    {
        name => 'entry_separator',
        desc => '{ISISPlug.entry_separator}',
        type => 'string',
        reqd => 'no',
        deft => '<br>',
    },
    {
        name => 'subfield_separator',
        desc => '{ISISPlug.subfield_separator}',
        type => 'string',
        reqd => 'no',
        deft => ', ',
    },
];
73
# Plugin descriptor: identifies the plugin, flags it as a concrete plugin that
# inherits options and supports exploding, and attaches the argument list.
my $options = {
    name     => 'ISISPlug',
    desc     => '{ISISPlug.desc}',
    abstract => 'no',
    inherits => 'yes',
    explodes => 'yes',
    args     => $arguments,
};
80
81
# Default "process_exp": a case-insensitive pattern matching names that end
# in ".mst", the files this plugin processes.
sub get_default_process_exp {
    my $mst_suffix_pattern = '(?i)(\.mst)$';
    return $mst_suffix_pattern;
}
86
87
# Default "block_exp": a case-insensitive pattern matching names that end in
# ".fdt" or ".xrf", the companion files of an ISIS database that this plugin
# blocks.
#
# The alternation bar must be unescaped: an escaped bar matches a literal "|"
# character, which would make the pattern match only names ending in
# ".fdt|.xrf" and so block nothing in practice.
sub get_default_block_exp {
    return q^(?i)(\.fdt|\.xrf)$^;
}
92
93
# Default "split_exp": records are separated by a line of ten hyphens; the
# line breaks on either side may be LF or CRLF.
sub get_default_split_exp {
    my $record_separator_pattern = '\r?\n----------\r?\n';
    return $record_separator_pattern;
}
98
99
# Constructor.
#
# Arguments:
#   $class           - class name to bless the new object into
#   $pluginlist      - array reference; $class is appended to it
#   $inputargs       - passed through unchanged to the SplitPlug constructor
#   $hashArgOptLists - hash reference; this plugin's argument descriptors are
#                      appended to its "ArgList" entry and its option
#                      descriptor to its "OptList" entry
#
# Returns the object built by the SplitPlug constructor, re-blessed into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;

    # Register this plugin's own descriptors alongside any collected so far
    if (defined $arguments) {
        push @{ $hashArgOptLists->{'ArgList'} }, @{$arguments};
    }
    if (defined $options) {
        push @{ $hashArgOptLists->{'OptList'} }, $options;
    }

    my $self;
    if (defined $hashArgOptLists) {
        $self = SplitPlug->new($pluginlist, $inputargs, $hashArgOptLists);
    }
    else {
        $self = SplitPlug->new($pluginlist, $inputargs);
    }

    return bless $self, $class;
}
113
114
# Load the text of an ISIS database by running the external IsisGdl program
# on the master file and capturing its output.
#
# Arguments (after $self):
#   $filename - path of the master file; must end in ".mst" (any case)
#   $encoding - encoding name handed to the multiread readers
#   $language - accepted but not used here
#   $textref  - scalar reference that receives the exported text
#
# Side effects: stores the companion file paths in $self->{'fdt_filename'} and
# $self->{'xrf_filename'}, and the parsed field definition table in
# $self->{'fdt_mapping'}.
#
# Dies if the file name has no ".mst" suffix, if either companion file is
# missing, or if the IsisGdl pipe cannot be opened. Warns if IsisGdl does not
# exit cleanly.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    my ($databasename) = ($filename =~ /(.*)\.mst$/i);
    if (!defined $databasename) {
        # Without this check the companion paths below would be built from undef
        die "Error: ISIS master file name $filename does not end in .mst.\n";
    }

    # Check the associated .fdt and .xrf files exist
    # These files must have a lowercase extension for the IsisGdl program to work
    # Bailing out because of this is kind of crappy but it is only an issue on Unix
    $self->{'fdt_filename'} = $databasename . ".fdt";
    if (!-e $self->{'fdt_filename'}) {
        die "Error: Could not find ISIS FDT file " . $self->{'fdt_filename'} . ".\n";
    }
    $self->{'xrf_filename'} = $databasename . ".xrf";
    if (!-e $self->{'xrf_filename'}) {
        die "Error: Could not find ISIS XRF file " . $self->{'xrf_filename'} . ".\n";
    }

    # The text to split is exported from the database by the IsisGdl program.
    # NOTE(review): the file name is interpolated into a shell command line, so
    # names containing quotes or other shell metacharacters are not safe here.
    open(FILE, "IsisGdl \"$filename\" |")
        or die "Error: Could not run IsisGdl on $filename: $!\n";

    my $reader = new multiread();
    $reader->set_handle('ISISPlug::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);

    # For a pipe, close() is where a failing command becomes visible
    if (!close(FILE)) {
        warn "Warning: IsisGdl did not exit cleanly for $filename (status $?).\n";
    }

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my %fdt_mapping = &parse_field_definition_table($self->{'fdt_filename'}, $encoding);
    $self->{'fdt_mapping'} = \%fdt_mapping;

    # Remove the line at the start so it is split and processed properly.
    # The line ending may be LF or CRLF, as in the default split expression.
    $$textref =~ s/^----------\r?\n//;
}
151
152
# Turn one exported ISIS record into metadata and text on a document object.
#
# Arguments (after $self):
#   $textref  - scalar reference to the record text, one "tag=N data=..." entry
#               per line; it is modified in place ('<' and '>' are escaped)
#   $pluginfo, $base_dir, $metadata - accepted but not used here
#   $file     - file name, used only in progress messages
#   $doc_obj  - document object that receives the metadata and the text
#   $gli      - when true, a "<Processing ...>" line is printed to STDERR
#
# Reads $self->{'fdt_mapping'} (tag number => field definition),
# $self->{'subfield_separator'}, $self->{'entry_separator'},
# $self->{'outhandle'} and $self->{'verbosity'}.
#
# For every line whose tag has a named field definition this adds:
#   <Name>        once per value (repeatable fields hold several values
#                 separated by '%')
#   <Name>^<x>    once per subfield introduced by "^x" (x a lowercase letter)
#   <Name>^all    all values of the line joined with the entry separator
# Lines that are not of the form "tag=N data=..." are skipped.
#
# Always returns 1.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    my $section = $doc_obj->get_top_section();
    my $fdt_mapping = $self->{'fdt_mapping'};
    my $subfield_separator = $self->{'subfield_separator'};
    my $entry_separator = $self->{'entry_separator'};

    # Report that we're processing the file
    print STDERR "<Processing n='$file' p='ISISPlug'>\n" if ($gli);
    print $outhandle "IsisPlug: processing $file\n" if ($self->{'verbosity'}) > 1;

    # Process each line of the ISIS record, one at a time
    foreach my $line (split(/\n/, $$textref)) {
        # Only use the captures when the match succeeded; after a failed match
        # $1 and $2 would still hold values from some earlier match
        next unless ($line =~ /^tag=(.*) data=(.+)$/);
        my ($tag, $tag_data) = ($1, $2);

        # Look the tag up without creating an entry for unknown tags
        my $field = $fdt_mapping->{$tag};
        next unless (defined $field);

        # Convert the tag number into a name, and remove any invalid characters
        my $raw_metadata_name = $field->{'name'} || "";
        $raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
        next if ($raw_metadata_name eq "");

        # Metadata field names: title case, then remove spaces
        my $metadata_name = "";
        foreach my $word (split(/\s+/, $raw_metadata_name)) {
            substr($word, 0, 1) =~ tr/a-z/A-Z/;
            $metadata_name .= $word;
        }

        # Repeatable fields hold multiple values separated by the '%' character;
        # other fields hold one value, which may list several items as <...>
        my $repeatable = $field->{'repeatable'};
        my @raw_metadata_values = $repeatable ? split(/%/, $tag_data) : ($tag_data);

        my $all_metadata_value = "";
        foreach my $raw_metadata_value (@raw_metadata_values) {
            my $metadata_value = _add_isis_subfields($doc_obj, $section, $metadata_name,
                                                     $raw_metadata_value, $subfield_separator,
                                                     !$repeatable);

            # Add the metadata value
            $doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);

            $all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
            $all_metadata_value .= $metadata_value;
        }

        # Add the "^all" metadata value
        $doc_obj->add_utf8_metadata($section, $metadata_name . "^all", $all_metadata_value);
    }

    # Add the full record as the document text, with '<' and '>' escaped
    $$textref =~ s/</&lt;/g;
    $$textref =~ s/>/&gt;/g;
    $doc_obj->add_utf8_text($section, $$textref);

    # Add FileFormat metadata
    $doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");

    # Record was processed successfully
    return 1;
}


# Private helper for process(): split one field value into its subfields, add
# each named subfield as "<name>^<letter>" metadata, and return the subfield
# values joined with $subfield_separator ('<' and '>' escaped as entities).
#
# When $expand_angle_lists is true, a subfield value ending in "<...>" also
# has each "<item>" added, unescaped, as "<subfield name>^sub" metadata.
sub _add_isis_subfields
{
    my ($doc_obj, $section, $metadata_name, $raw_metadata_value,
        $subfield_separator, $expand_angle_lists) = @_;

    my $metadata_value = "";
    while ($raw_metadata_value ne "") {
        # If there is a subfield specifier, parse it off. The caret is always
        # consumed, even when no lowercase letter follows it, so every pass
        # shortens the remaining text and the loop is guaranteed to end.
        my $sub_metadata_name = $metadata_name;
        if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
            $sub_metadata_name .= "^$1";
        }

        # Parse the value off (everything up to the next caret)
        $raw_metadata_value =~ s/^([^\^]*)//;
        my $sub_metadata_value = $1;

        # Deal with the case when multiple values are specified using <...>
        if ($expand_angle_lists && $sub_metadata_value =~ /\<(.*)\>$/) {
            my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
            my $tmp_sub_metadata_value = $sub_metadata_value;
            while ($tmp_sub_metadata_value =~ s/\<(.*?)\>//) {
                my $sub_sub_metadata_value = $1;
                $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
            }
        }

        # Escape any '<' and '>' characters so they appear correctly in the final collection
        $sub_metadata_value =~ s/</&lt;/g;
        $sub_metadata_value =~ s/>/&gt;/g;

        # Only values introduced by a subfield specifier get their own metadata
        if ($sub_metadata_name ne $metadata_name) {
            $doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
        }

        $metadata_value .= $subfield_separator unless ($metadata_value eq "");
        $metadata_value .= $sub_metadata_value;
    }

    return $metadata_value;
}
294
295
# Parse an ISIS Field Definition Table (.fdt) file.
#
# Arguments:
#   $fdtfilename - path of the .fdt file
#   $encoding    - encoding name handed to the multiread reader
#
# Returns a hash (as a list) mapping each field tag to a hash reference with
# the keys 'name', 'subfields' and 'repeatable'.
#
# Layout as read by this code: everything up to and including a line that
# consists of "***" is skipped. Each following line is fixed-width:
#   characters 0-29   field name
#   characters 30-49  subfields
#   characters 50-    space-separated specs; the first item is the tag and
#                     the fourth is the repeatable flag
# Lines too short to contain a specs column are ignored.
#
# Dies if the file cannot be opened.
sub parse_field_definition_table
{
    my $fdtfilename = shift(@_);
    my $encoding = shift(@_);

    my %fdtmapping = ();

    # Three-argument open, so the file name is never interpreted as a mode
    open(FDT_FILE, '<', $fdtfilename) or die "Error: Could not open file $fdtfilename.\n";

    # The reader is given a reference to the lexical buffer. Passing the plain
    # string instead only works via a symbolic reference to a global variable,
    # which is shared between calls.
    my $fdtfiletext = "";
    my $reader = multiread->new();
    $reader->set_handle('ISISPlug::FDT_FILE');
    $reader->set_encoding($encoding);
    $reader->read_file(\$fdtfiletext);

    close(FDT_FILE);

    my $amongstdefinitions = 0;
    foreach my $fdtfileline (split(/\n/, $fdtfiletext)) {
        $fdtfileline =~ s/(\s*)$//;  # Remove any nasty spaces at the end of the lines

        if (!$amongstdefinitions) {
            # Definitions start after the "***" marker line
            $amongstdefinitions = 1 if ($fdtfileline eq "***");
            next;
        }

        # A definition needs a specs column; skip blank or truncated lines
        # rather than recording an entry with an empty tag
        next if (length($fdtfileline) <= 50);

        my $fieldname = substr($fdtfileline, 0, 30);
        my $fieldsubfields = substr($fdtfileline, 30, 20);
        my $fieldspecs = substr($fdtfileline, 50);

        # Remove extra spaces
        $fieldname =~ s/(\s*)$//;
        $fieldsubfields =~ s/(\s*)$//;
        $fieldspecs =~ s/(\s*)$//;

        # Map from tag number to metadata field title, subfields, and repeatability
        my @fieldspecitems = split(/ /, $fieldspecs);
        my $fieldtag = $fieldspecitems[0];
        next unless (defined $fieldtag && $fieldtag ne "");
        my $fieldrepeatable = $fieldspecitems[3];

        $fdtmapping{$fieldtag} = { 'name' => $fieldname,
                                   'subfields' => $fieldsubfields,
                                   'repeatable' => $fieldrepeatable };
    }

    return %fdtmapping;
}
341
342
# Tidy up after the database has been exploded: remove the companion FDT and
# XRF files whose paths are stored in $self->{'fdt_filename'} and
# $self->{'xrf_filename'}. Returns whatever the second removal call returns.
sub clean_up_after_exploding
{
    my ($self) = @_;

    my ($fdt_path, $xrf_path) = @{$self}{'fdt_filename', 'xrf_filename'};

    &util::rm($fdt_path);
    return &util::rm($xrf_path);
}
351
352
353	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: