Context Navigation

source: trunk/gsdl/perllib/plugins/ISISPlug.pm@ 12610

Last change on this file since 12610 was 12169, checked in by mdewsnip, 18 years ago
Tidied up that horrible long line in the new() function of every plugin.
Property svn:keywords set to `Author Date Id Revision`
File size: 12.4 KB

Line
1	###########################################################################
2	#
3	# ISISPlug.pm -- A plugin for CDS/ISIS databases
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 1999-2004 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package ISISPlug;
28
29
30	use multiread;
31	use SplitPlug;
32
33	use strict;
34	no strict 'refs'; # allow filehandles to be variables and viceversa
35
36	# ISISPlug is a sub-class of SplitPlug.
37	sub BEGIN {
38	@ISISPlug::ISA = ('SplitPlug');
39	}
40
41
42	my $arguments =
43	[ { 'name' => "process_exp",
44	'desc' => "{BasPlug.process_exp}",
45	'type' => "regexp",
46	'reqd' => "no",
47	'deft' => &get_default_process_exp() },
48	{ 'name' => "block_exp",
49	'desc' => "{BasPlug.block_exp}",
50	'type' => "regexp",
51	'reqd' => "no",
52	'deft' => &get_default_block_exp(),
53	'hiddengli' => "yes" },
54	{ 'name' => "split_exp",
55	'desc' => "{SplitPlug.split_exp}",
56	'type' => "regexp",
57	'reqd' => "no",
58	'deft' => &get_default_split_exp(),
59	'hiddengli' => "yes" },
60
61	# The interesting options
62	{ 'name' => "entry_separator",
63	'desc' => "{ISISPlug.entry_separator}",
64	'type' => "string",
65	'reqd' => "no",
66	'deft' => "<br>" },
67	{ 'name' => "subfield_separator",
68	'desc' => "{ISISPlug.subfield_separator}",
69	'type' => "string",
70	'reqd' => "no",
71	'deft' => ", " }
72	];
73
74	my $options = { 'name' => "ISISPlug",
75	'desc' => "{ISISPlug.desc}",
76	'abstract' => "no",
77	'inherits' => "yes",
78	'explodes' => "yes",
79	'args' => $arguments };
80
81
82	# This plugin processes files with the suffix ".mst"
83	sub get_default_process_exp {
84	return q^(?i)(\.mst)$^;
85	}
86
87
88	# This plugin blocks files with the suffix ".fdt" and ".xrf"
89	sub get_default_block_exp {
90	return q^(?i)(\.fdt\|\.xrf)$^;
91	}
92
93
94	# This plugin splits the input text at the "----------" lines
95	sub get_default_split_exp {
96	return q^\r?\n----------\r?\n^;
97	}
98
99
100	sub new
101	{
102	my ($class) = shift (@_);
103	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
104	push(@$pluginlist, $class);
105
106	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
107	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
108
109	my $self = new SplitPlug($pluginlist, $inputargs, $hashArgOptLists);
110
111	return bless $self, $class;
112	}
113
114
115	sub read_file
116	{
117	my $self = shift (@_);
118	my ($filename, $encoding, $language, $textref) = @_;
119	my $outhandle = $self->{'outhandle'};
120
121	my ($database_file_path_root) = ($filename =~ /(.*)\.mst$/i);
122	my $mst_file_path_relative = $filename;
123	$mst_file_path_relative =~ s/^.+import.(.*?)$/$1/;
124
125	# Check the associated .fdt and .xrf files exist
126	# These files must have a lowercase extension for the IsisGdl program to work
127	# Bailing out because of this is kind of crappy but it is only an issue on Unix
128	$self->{'fdt_file_path'} = $database_file_path_root . ".fdt";
129	if (!-e $self->{'fdt_file_path'}) {
130	print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS FDT file $self->{'fdt_file_path'}'>\n" if ($self->{'gli'});
131	print $outhandle "Error: Could not find ISIS FDT file " . $self->{'fdt_file_path'} . ".\n";
132	return;
133	}
134	$self->{'xrf_file_path'} = $database_file_path_root . ".xrf";
135	if (!-e $self->{'xrf_file_path'}) {
136	print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS XRF file $self->{'xrf_file_path'}'>\n" if ($self->{'gli'});
137	print $outhandle "Error: Could not find ISIS XRF file " . $self->{'xrf_file_path'} . ".\n";
138	return;
139	}
140
141	# The text to split is exported from the database by the IsisGdl program
142	open(FILE, "IsisGdl \"$filename\" \|");
143
144	my $reader = new multiread();
145	$reader->set_handle('ISISPlug::FILE');
146	$reader->set_encoding($encoding);
147	$reader->read_file($textref);
148
149	close(FILE);
150
151	# Parse the associated ISIS database Field Definition Table file (.fdt)
152	my %fdt_mapping = &parse_field_definition_table($self->{'fdt_file_path'}, $encoding);
153	$self->{'fdt_mapping'} = \%fdt_mapping;
154
155	# Remove the line at the start, and any blank lines, so the data is split and processed properly
156	$$textref =~ s/^----------\n//;
157	$$textref =~ s/\n\n/\n/g;
158	}
159
160
161	sub process
162	{
163	my $self = shift (@_);
164	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
165	my $outhandle = $self->{'outhandle'};
166
167	my $section = $doc_obj->get_top_section();
168	my $fdt_mapping = $self->{'fdt_mapping'};
169	my $subfield_separator = $self->{'subfield_separator'};
170	my $entry_separator = $self->{'entry_separator'};
171	my $isis_record_html_metadata_value = "<table cellpadding=\"4\" cellspacing=\"0\">";
172
173	# Report that we're processing the file
174	print STDERR "\n<Processing n='$file' p='ISISPlug'>\n" if ($gli);
175	print $outhandle "IsisPlug: processing $file\n" if ($self->{'verbosity'}) > 1;
176
177	# Process each line of the ISIS record, one at a time
178	foreach my $line (split(/\n/, $$textref)) {
179	$line =~ s/(\s*)$//; # Remove any nasty whitespace (very important for Windows)
180	$line =~ /^tag=(.*) data=(.+)$/;
181	my $tag = $1;
182	my $tag_data = $2;
183	# print STDERR "\nTag: $tag, Data: $tag_data\n";
184
185	# Convert the tag number into a name, and remove any invalid characters
186	my $raw_metadata_name = $fdt_mapping->{$tag}{'name'} \|\| "";
187	$raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
188	next if ($raw_metadata_name eq "");
189
190	# Metadata field names: title case, then remove spaces
191	my $metadata_name = "";
192	foreach my $word (split(/\s+/, $raw_metadata_name)) {
193	substr($word, 0, 1) =~ tr/a-z/A-Z/;
194	$metadata_name .= $word;
195	}
196
197	my $all_metadata_name = $metadata_name . "^all";
198	my $all_metadata_value = "";
199
200	# Handle repeatable fields
201	if ($fdt_mapping->{$tag}{'repeatable'}) {
202	# Multiple values are separated using the '%' character
203	foreach my $raw_metadata_value (split(/%/, $tag_data)) {
204	my $metadata_value = "";
205
206	# Handle subfields
207	while ($raw_metadata_value ne "") {
208	# If there is a subfield specifier, parse it off
209	my $sub_metadata_name = $metadata_name;
210	if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
211	$sub_metadata_name .= "^$1";
212	}
213
214	# Parse the value off and add it as metadata
215	$raw_metadata_value =~ s/^([^\^]*)//;
216	my $sub_metadata_value = $1;
217
218	# Escape any '<' and '>' characters so they appear correctly in the final collection
219	$sub_metadata_value =~ s/\</</g;
220	$sub_metadata_value =~ s/\>/>/g;
221
222	# print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
223	if ($sub_metadata_name ne $metadata_name) {
224	$doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
225	}
226
227	$metadata_value .= $subfield_separator unless ($metadata_value eq "");
228	$metadata_value .= $sub_metadata_value;
229	}
230
231	# Add the metadata value
232	# print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
233	$doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);
234
235	$all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
236	$all_metadata_value .= $metadata_value;
237	}
238	}
239
240	# Handle non-repeatable fields
241	else {
242	my $raw_metadata_value = $tag_data;
243	my $metadata_value = "";
244
245	# Handle subfields
246	while ($raw_metadata_value ne "") {
247	# If there is a subfield specifier, parse it off
248	my $sub_metadata_name = $metadata_name;
249	if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
250	$sub_metadata_name .= "^$1";
251	}
252
253	# Parse the value off and add it as metadata
254	$raw_metadata_value =~ s/^([^\^]*)//;
255	my $sub_metadata_value = $1;
256
257	# Deal with the case when multiple values are specified using <...>
258	if ($sub_metadata_value =~ /\<(.+)\>/) {
259	my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
260	my $tmp_sub_metadata_value = $sub_metadata_value;
261	while ($tmp_sub_metadata_value =~ s/\<(.+?)\>//) {
262	my $sub_sub_metadata_value = $1;
263	$doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
264	}
265	}
266	# Deal with the legacy case when multiple values are specified using /.../
267	elsif ($sub_metadata_value =~ /\/(.+)\//) {
268	my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
269	my $tmp_sub_metadata_value = $sub_metadata_value;
270	while ($tmp_sub_metadata_value =~ s/\/(.+?)\///) {
271	my $sub_sub_metadata_value = $1;
272	$doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
273	}
274	}
275
276	# Escape any '<' and '>' characters so they appear correctly in the final collection
277	$sub_metadata_value =~ s/\</</g;
278	$sub_metadata_value =~ s/\>/>/g;
279
280	# print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
281	if ($sub_metadata_name ne $metadata_name) {
282	$doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
283	}
284
285	$metadata_value .= $subfield_separator unless ($metadata_value eq "");
286	$metadata_value .= $sub_metadata_value;
287	}
288
289	# Add the metadata value
290	# print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
291	$doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);
292
293	$all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
294	$all_metadata_value .= $metadata_value;
295	}
296
297	# Add the "^all" metadata value
298	# print STDERR "All metadata name: $all_metadata_name, value: $all_metadata_value\n";
299	$doc_obj->add_utf8_metadata($section, $all_metadata_name, $all_metadata_value);
300
301	$isis_record_html_metadata_value .= "<tr><td valign=top><nobr><b>" . $fdt_mapping->{$tag}{'name'} . "</b></nobr></td><td valign=top>" . $all_metadata_value . "</td></tr>";
302	}
303
304	# Add a reasonably formatted HTML table view of the record as the document text
305	$isis_record_html_metadata_value .= "</table>";
306	$doc_obj->add_utf8_text($section, $isis_record_html_metadata_value);
307
308	# Add the full raw record as metadata
309	$$textref =~ s/</&lt\;/g;
310	$$textref =~ s/>/&gt\;/g;
311	$doc_obj->add_utf8_metadata($section, "ISISRawRecord", $$textref);
312
313	# Add FileFormat metadata
314	$doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");
315
316	# Record was processed successfully
317	return 1;
318	}
319
320
321	sub parse_field_definition_table
322	{
323	my $fdtfilename = shift(@_);
324	my $encoding = shift(@_);
325
326	my %fdtmapping = ();
327
328	open(FDT_FILE, "<$fdtfilename") \|\| die "Error: Could not open file $fdtfilename.\n";
329
330	my $fdtfiletext = "";
331	my $reader = new multiread();
332	$reader->set_handle('ISISPlug::FDT_FILE');
333	$reader->set_encoding($encoding);
334	$reader->read_file($fdtfiletext);
335
336	my $amongstdefinitions = 0;
337	foreach my $fdtfileline (split(/\n/, $$fdtfiletext)) {
338	$fdtfileline =~ s/(\s*)$//; # Remove any nasty spaces at the end of the lines
339
340	if ($amongstdefinitions) {
341	my $fieldname = substr($fdtfileline, 0, 30);
342	my $fieldsubfields = substr($fdtfileline, 30, 20);
343	my $fieldspecs = substr($fdtfileline, 50);
344
345	# Remove extra spaces
346	$fieldname =~ s/(\s*)$//;
347	$fieldsubfields =~ s/(\s*)$//;
348	$fieldspecs =~ s/(\s*)$//;
349
350	# Map from tag number to metadata field title, subfields, and repeatability
351	my $fieldtag = (split(/ /, $fieldspecs))[0];
352	my $fieldrepeatable = (split(/ /, $fieldspecs))[3];
353	$fdtmapping{$fieldtag} = { 'name' => $fieldname,
354	'subfields' => $fieldsubfields,
355	'repeatable' => $fieldrepeatable };
356	}
357	elsif ($fdtfileline eq "***") {
358	$amongstdefinitions = 1;
359	}
360	}
361
362	close(FDT_FILE);
363
364	return %fdtmapping;
365	}
366
367
368	sub clean_up_after_exploding
369	{
370	my $self = shift(@_);
371
372	# Delete the FDT and XRF files too
373	&util::rm($self->{'fdt_file_path'});
374	&util::rm($self->{'xrf_file_path'});
375	}
376
377
378	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: