Context Navigation

source: trunk/gsdl/perllib/plugins/ISISPlug.pm@ 11300

Last change on this file since 11300 was 11300, checked in by mdewsnip, 18 years ago
Now also removes '#' characters from field names.
Property svn:keywords set to `Author Date Id Revision`
File size: 10.6 KB

Line
1	###########################################################################
2	#
3	# ISISPlug.pm -- A plugin for CDS/ISIS databases
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 1999-2004 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package ISISPlug;
28
29
30	use multiread;
31	use SplitPlug;
32
33	use strict;
34	no strict 'refs'; # allow filehandles to be variables and viceversa
35
# Declare SplitPlug as the parent class of ISISPlug. This is done in a BEGIN
# block so the inheritance is in place at compile time.
BEGIN {
    @ISISPlug::ISA = qw(SplitPlug);
}
40
41
# Argument descriptions for this plugin. The first three entries supply this
# plugin's defaults for the process/block/split regular expressions (their
# descriptions refer to BasPlug and SplitPlug); the last two are the options
# specific to ISISPlug.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name' => 'block_exp',
        'desc' => '{BasPlug.block_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_block_exp(),
    },
    {
        'name'      => 'split_exp',
        'desc'      => '{SplitPlug.split_exp}',
        'type'      => 'regexp',
        'reqd'      => 'no',
        'deft'      => get_default_split_exp(),
        'hiddengli' => 'yes',
    },

    # The interesting options
    {
        'name' => 'entry_separator',
        'desc' => '{ISISPlug.entry_separator}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => '<br>',
    },
    {
        'name' => 'subfield_separator',
        'desc' => '{ISISPlug.subfield_separator}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => ', ',
    },
];

# Top-level description of the plugin; 'args' points at the list above.
my $options = {
    'name'     => 'ISISPlug',
    'desc'     => '{ISISPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
79
80
# Returns the default "process_exp" regular expression (as a string): it
# matches file names ending in ".mst", ignoring case.
sub get_default_process_exp
{
    my $mst_suffix_pattern = q^(?i)(\.mst)$^;
    return $mst_suffix_pattern;
}
85
86
# Returns the default "block_exp" regular expression (as a string): it
# matches file names ending in ".fdt" or ".xrf", ignoring case.
sub get_default_block_exp {
    # The alternation bar must be unescaped: "\|" would match a literal '|'
    # character, so neither suffix on its own would ever be matched.
    return q^(?i)(\.fdt|\.xrf)$^;
}
91
92
# Returns the default "split_exp" regular expression (as a string): the input
# text is split at lines consisting of ten '-' characters, with either LF or
# CRLF line endings.
sub get_default_split_exp
{
    my $record_separator_pattern = q^\r?\n----------\r?\n^;
    return $record_separator_pattern;
}
97
98
# Constructor.
#
#   $pluginlist      - array reference; this class name is appended to it
#   $inputargs       - passed through unchanged to the SplitPlug constructor
#   $hashArgOptLists - hash reference; this plugin's argument list is appended
#                      to its "ArgList" entry and its option description to
#                      its "OptList" entry
#
# Returns the object built by the SplitPlug constructor, re-blessed into
# $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) {
        push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    }
    if (defined $options) {
        push(@{$hashArgOptLists->{"OptList"}}, $options);
    }

    # Call the parent constructor with explicit Class->method syntax; the
    # indirect-object form ("new SplitPlug(...)") is parsed heuristically and
    # can be misinterpreted if a sub called "new" or "SplitPlug" is in scope.
    my $self = (defined $hashArgOptLists)
        ? SplitPlug->new($pluginlist, $inputargs, $hashArgOptLists)
        : SplitPlug->new($pluginlist, $inputargs);

    return bless $self, $class;
}
112
113
# Exports the records of a CDS/ISIS master file as text and loads the
# database's field definition table.
#
#   $filename - path of the ISIS master (.mst) file
#   $encoding - encoding handed to the multiread reader for both the exported
#               text and the .fdt file
#   $language - not used by this method
#   $textref  - scalar reference that receives the exported text
#
# The text is produced by running the external "IsisGdl" program on
# $filename. The parsed field definition table is stored in
# $self->{'fdt_mapping'} for use by process().
#
# Dies if the file name has no .mst extension, if the associated .fdt or .xrf
# file is missing, or if the IsisGdl process cannot be started.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    my ($databasename) = ($filename =~ /(.*)\.mst$/i);
    if (!defined $databasename) {
        die "Error: ISIS master file $filename does not have a .mst extension.\n";
    }

    # Check the associated .fdt and .xrf files exist
    # These files must have a lowercase extension for the IsisGdl program to work
    # Bailing out because of this is kind of crappy but it is only an issue on Unix
    my $fdtfilename = $databasename . ".fdt";
    if (! -e $fdtfilename) {
        die "Error: Could not find ISIS FDT file $fdtfilename.\n";
    }
    my $xrffilename = $databasename . ".xrf";
    if (! -e $xrffilename) {
        die "Error: Could not find ISIS XRF file $xrffilename.\n";
    }

    # The text to split is exported from the database by the IsisGdl program.
    # The handle stays a package filehandle because the reader is given its
    # name as a string below.
    # NOTE(review): $filename is interpolated into a shell command line; a
    # name containing '"' or other shell metacharacters would break it.
    open(FILE, "IsisGdl \"$filename\" |")
        or die "Error: Could not run IsisGdl on $filename: $!\n";

    my $reader = multiread->new();
    $reader->set_handle('ISISPlug::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);

    close(FILE);

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my %fdtmapping = &parse_field_definition_table($fdtfilename, $encoding);
    $self->{'fdt_mapping'} = \%fdtmapping;

    # Remove the separator line at the start so the text is split and
    # processed properly. The line ending may be LF or CRLF, matching what
    # the default split expression accepts.
    $$textref =~ s/^----------\r?\n//;
}
150
151
# Converts one exported ISIS record into metadata and text on $doc_obj.
#
#   $textref - scalar reference to the record text; lines of the form
#              "tag=<tag> data=<data>" are turned into metadata, any other
#              line is ignored for metadata purposes
#   $file    - file name, used only in progress messages
#   $doc_obj - document object; must provide get_top_section(),
#              add_utf8_metadata() and add_utf8_text()
#   $gli     - if true, a "<Processing ...>" line is written to STDERR
#   $pluginfo, $base_dir, $metadata - not used by this method
#
# For every tag that has a name in $self->{'fdt_mapping'} the following
# metadata is added to the top section:
#   <Name>       one value per entry (subfields joined by the
#                'subfield_separator' option)
#   <Name>^<x>   one value per subfield with identifier <x> (a-z)
#   <Name>^sub   (non-repeatable fields only) each "<...>" delimited value
#   <Name>^all   all entries joined by the 'entry_separator' option
# '<' and '>' are replaced by "&lt;" and "&gt;" in metadata values and in the
# record text ($$textref is modified in place), which is added as the
# document text.
#
# Always returns 1.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    my $section = $doc_obj->get_top_section();
    my $fdt_mapping = $self->{'fdt_mapping'};
    my $subfield_separator = $self->{'subfield_separator'};
    my $entry_separator = $self->{'entry_separator'};

    # Report that we're processing the file
    print STDERR "<Processing n='$file' p='ISISPlug'>\n" if ($gli);
    print $outhandle "IsisPlug: processing $file\n" if ($self->{'verbosity'} > 1);

    # Process each line of the ISIS record, one at a time
    foreach my $line (split(/\n/, $$textref)) {
        # Skip lines that are not "tag=... data=..." lines; using $1/$2 after
        # a failed match would pick up stale or undefined values.
        next unless ($line =~ /^tag=(.*) data=(.+)$/);
        my ($tag, $tag_data) = ($1, $2);

        # Convert the tag number into a name, and remove any invalid characters
        my $field = $fdt_mapping->{$tag};
        next unless (defined $field);
        my $raw_metadata_name = $field->{'name'} || "";
        $raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
        next if ($raw_metadata_name eq "");

        # Metadata field names: title case, then remove spaces
        my $metadata_name = "";
        foreach my $word (split(/\s+/, $raw_metadata_name)) {
            next if ($word eq "");
            substr($word, 0, 1) =~ tr/a-z/A-Z/;
            $metadata_name .= $word;
        }
        # Nothing left once the invalid characters are gone
        next if ($metadata_name eq "");

        # Repeatable fields hold several entries separated by '%'; any other
        # field is a single entry in which "<...>" marks multiple values.
        my $repeatable = $field->{'repeatable'};
        my @raw_metadata_values = $repeatable ? split(/%/, $tag_data) : ($tag_data);

        my $all_metadata_value = "";
        foreach my $raw_metadata_value (@raw_metadata_values) {
            my $metadata_value = _add_subfield_metadata($doc_obj, $section, $metadata_name,
                                                        $raw_metadata_value, $subfield_separator,
                                                        !$repeatable);

            # Add the metadata value
            $doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);

            $all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
            $all_metadata_value .= $metadata_value;
        }

        # Add the "^all" metadata value
        $doc_obj->add_utf8_metadata($section, $metadata_name . "^all", $all_metadata_value);
    }

    # Add the full record as the document text, with '<' and '>' escaped
    $$textref =~ s/</&lt;/g;
    $$textref =~ s/>/&gt;/g;
    $doc_obj->add_utf8_text($section, $$textref);

    # Add FileFormat metadata
    $doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");

    # Record was processed successfully
    return 1;
}


# Helper for process(): splits one field entry into its subfields, adds the
# per-subfield metadata to $doc_obj, and returns the whole entry as a single
# escaped string with the subfield values joined by $subfield_separator.
#
# A subfield starts with '^' followed by a lowercase letter identifying it.
# If $extract_bracketed is true, every "<...>" delimited value inside a
# subfield that ends in '>' is also added as "<subfield name>^sub" metadata.
sub _add_subfield_metadata
{
    my ($doc_obj, $section, $metadata_name, $raw_metadata_value,
        $subfield_separator, $extract_bracketed) = @_;

    my $metadata_value = "";
    while ($raw_metadata_value ne "") {
        # If there is a subfield specifier, parse it off. The '^' is always
        # consumed, even when it is not followed by a lowercase letter, so
        # every pass through the loop shortens the remaining text.
        my $sub_metadata_name = $metadata_name;
        if ($raw_metadata_value =~ s/^\^//) {
            if ($raw_metadata_value =~ s/^([a-z])//) {
                $sub_metadata_name .= "^$1";
            }
        }

        # Parse the value off (everything up to the next '^')
        $raw_metadata_value =~ s/^([^\^]*)//;
        my $sub_metadata_value = $1;

        # Deal with the case when multiple values are specified using <...>
        if ($extract_bracketed && $sub_metadata_value =~ /<.*>$/) {
            my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
            my $tmp_sub_metadata_value = $sub_metadata_value;
            while ($tmp_sub_metadata_value =~ s/<(.*?)>//) {
                my $sub_sub_metadata_value = $1;
                $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
            }
        }

        # Escape any '<' and '>' characters so they appear correctly in the final collection
        $sub_metadata_value =~ s/</&lt;/g;
        $sub_metadata_value =~ s/>/&gt;/g;

        # Only subfields with an identifier get their own metadata entry
        if ($sub_metadata_name ne $metadata_name) {
            $doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
        }

        $metadata_value .= $subfield_separator unless ($metadata_value eq "");
        $metadata_value .= $sub_metadata_value;
    }

    return $metadata_value;
}
293
294
# Parses a CDS/ISIS Field Definition Table (.fdt) file.
#
#   $fdtfilename - path of the .fdt file
#   $encoding    - encoding handed to the multiread reader
#
# Everything up to and including the "***" line is skipped. Each later line
# is read as fixed-width columns: field name (30 characters), subfields (20
# characters), then space-separated specifications of which the first is the
# tag and the fourth the repeatable flag.
#
# Returns a hash (as a list) mapping each tag to a hash reference with the
# keys 'name', 'subfields' and 'repeatable'.
# Dies if the file cannot be opened.
sub parse_field_definition_table
{
    my $fdtfilename = shift(@_);
    my $encoding = shift(@_);

    my %fdtmapping = ();

    # The handle stays a package filehandle because the reader is given its
    # name as a string below.
    open(FDT_FILE, "<", $fdtfilename) or die "Error: Could not open file $fdtfilename.\n";

    # The reader fills in the scalar it is given a reference to, so pass a
    # real reference and read the text back from the same variable.
    my $fdtfiletext = "";
    my $reader = multiread->new();
    $reader->set_handle('ISISPlug::FDT_FILE');
    $reader->set_encoding($encoding);
    $reader->read_file(\$fdtfiletext);

    close(FDT_FILE);

    my $amongstdefinitions = 0;
    foreach my $fdtfileline (split(/\n/, $fdtfiletext)) {
        $fdtfileline =~ s/\s+$//;  # Remove any nasty spaces at the end of the lines

        if (!$amongstdefinitions) {
            # The field definitions start after the "***" line
            $amongstdefinitions = 1 if ($fdtfileline eq "***");
            next;
        }

        # A definition needs something in the specification columns; shorter
        # (for example blank) lines cannot provide a tag.
        next unless (length($fdtfileline) > 50);

        my $fieldname = substr($fdtfileline, 0, 30);
        my $fieldsubfields = substr($fdtfileline, 30, 20);
        my $fieldspecs = substr($fdtfileline, 50);

        # Remove extra spaces
        $fieldname =~ s/\s+$//;
        $fieldsubfields =~ s/\s+$//;
        $fieldspecs =~ s/\s+$//;

        # Map from tag number to metadata field title, subfields, and repeatability
        my @specs = split(/ /, $fieldspecs);
        my ($fieldtag, $fieldrepeatable) = @specs[0, 3];
        next unless (defined $fieldtag && $fieldtag ne "");

        $fdtmapping{$fieldtag} = { 'name' => $fieldname,
                                   'subfields' => $fieldsubfields,
                                   'repeatable' => $fieldrepeatable };
    }

    return %fdtmapping;
}
340
341
342	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: