Context Navigation

source: trunk/gsdl/perllib/plugins/ISISPlug.pm@ 12715

Last change on this file since 12715 was 12715, checked in by mdewsnip, 18 years ago
Now supports MST and XRF files with uppercase extensions on Linux, since IsisGdl has been updated.
Property svn:keywords set to `Author Date Id Revision`
File size: 12.9 KB

Line
1	###########################################################################
2	#
3	# ISISPlug.pm -- A plugin for CDS/ISIS databases
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 1999-2004 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package ISISPlug;
28
29
30	use multiread;
31	use SplitPlug;
32
33	use strict;
34	no strict 'refs'; # allow filehandles to be variables and viceversa
35
# Inheritance: ISISPlug derives from SplitPlug. The parent list is assigned
# inside a BEGIN block, i.e. while the file is still being compiled.
BEGIN {
    @ISISPlug::ISA = ('SplitPlug');
}
40
41
# Argument descriptions for this plugin. The three regular-expression
# arguments take their defaults from the get_default_*_exp subs defined
# further down in this file.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name'      => 'block_exp',
        'desc'      => '{BasPlug.block_exp}',
        'type'      => 'regexp',
        'reqd'      => 'no',
        'deft'      => get_default_block_exp(),
        'hiddengli' => 'yes',
    },
    {
        'name'      => 'split_exp',
        'desc'      => '{SplitPlug.split_exp}',
        'type'      => 'regexp',
        'reqd'      => 'no',
        'deft'      => get_default_split_exp(),
        'hiddengli' => 'yes',
    },

    # The interesting options: the strings placed between repeated entries
    # and between subfields when metadata values are assembled in process().
    {
        'name' => 'entry_separator',
        'desc' => '{ISISPlug.entry_separator}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => '<br>',
    },
    {
        'name' => 'subfield_separator',
        'desc' => '{ISISPlug.subfield_separator}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => ', ',
    },
];

# Plugin description; new() appends it to the caller's "OptList".
my $options = {
    'name'     => 'ISISPlug',
    'desc'     => '{ISISPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
80
81
# Default process expression: file names ending in ".mst", in any letter case.
sub get_default_process_exp {
    my $mst_suffix_exp = '(?i)(\.mst)$';
    return $mst_suffix_exp;
}
86
87
# Default block expression: file names ending in ".fdt" or ".xrf", in any
# letter case. These are the companion files that read_file() looks for
# next to the ".mst" file.
#
# The alternation must be a bare "|": an escaped pipe would only match a
# literal "|" character, so neither suffix would ever be blocked.
sub get_default_block_exp {
    return q^(?i)(\.fdt|\.xrf)$^;
}
92
93
# Default split expression: a line consisting of exactly ten dashes, with
# either LF or CRLF line endings on both sides.
sub get_default_split_exp {
    my $record_separator_exp = '\r?\n----------\r?\n';
    return $record_separator_exp;
}
98
99
# Constructor.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $pluginlist      - array reference; $class is appended to it
#   $inputargs       - passed through unchanged to the SplitPlug constructor
#   $hashArgOptLists - hash reference; this plugin's argument definitions are
#                      appended to its "ArgList" entry and its option
#                      description to its "OptList" entry
#
# Returns the object built by SplitPlug's constructor, re-blessed into $class.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;

    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments} if defined $arguments;
    push @{ $hashArgOptLists->{"OptList"} }, $options if defined $options;

    my $self = SplitPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless($self, $class);
}
113
114
# Export the records of a CDS/ISIS master file as text and load the matching
# field definition table.
#
# Parameters:
#   $filename - path of the ".mst" file
#   $encoding - encoding name handed to multiread::set_encoding
#   $language - not used by this method
#   $textref  - reference to the scalar that receives the exported text
#
# Side effects: sets $self->{'fdt_file_path'}, $self->{'xrf_file_path'} and
# $self->{'fdt_mapping'}.
#
# If the file name does not end in ".mst", a companion file is missing, or
# IsisGdl cannot be started, an error is printed to the plugin's outhandle
# (and to STDERR in GLI format when $self->{'gli'} is set) and the method
# returns early without reading anything.
sub read_file
{
    my $self = shift(@_);
    my ($filename, $encoding, $language, $textref) = @_;
    my $outhandle = $self->{'outhandle'};

    my $mst_file_path_relative = $filename;
    $mst_file_path_relative =~ s/^.+import.(.*?)$/$1/;

    # process_exp is configurable, so the file name is not guaranteed to end
    # in ".mst"; without this check the companion paths would be built from
    # an undefined root.
    my ($database_file_path_root) = ($filename =~ /(.*)\.mst$/i);
    if (!defined $database_file_path_root) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Not an ISIS MST file'>\n" if ($self->{'gli'});
        print $outhandle "Error: $filename is not an ISIS MST file.\n";
        return;
    }

    # Check the associated .fdt and .xrf files exist. The uppercase extension
    # is tried first, then the lowercase one.
    foreach my $extension ('fdt', 'xrf') {
        my $label = uc($extension);
        my $path = $database_file_path_root . "." . $label;
        if (!-e $path) {
            $path = $database_file_path_root . "." . $extension;
        }
        $self->{$extension . '_file_path'} = $path;

        if (!-e $path) {
            print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS $label file $path'>\n" if ($self->{'gli'});
            print $outhandle "Error: Could not find ISIS $label file " . $path . ".\n";
            return;
        }
    }

    # The text to split is exported from the database by the IsisGdl program.
    # NOTE(review): $filename is interpolated into a shell command line; a
    # name containing '"', '$' or '`' will be interpreted by the shell. The
    # string form is kept because the list form of a piped open is not
    # available on every platform/Perl combination -- confirm before changing.
    if (!open(FILE, "IsisGdl \"$filename\" |")) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not run IsisGdl'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not run IsisGdl on $filename: $!\n";
        return;
    }

    # multiread is given the handle by its fully qualified name.
    my $reader = multiread->new();
    $reader->set_handle('ISISPlug::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);

    close(FILE);

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my %fdt_mapping = parse_field_definition_table($self->{'fdt_file_path'}, $encoding);
    $self->{'fdt_mapping'} = \%fdt_mapping;

    # Remove the separator line at the very start, and collapse every run of
    # blank lines (LF or CRLF) to a single line ending, so the data is split
    # and processed properly.
    $$textref =~ s/^----------\r?\n//;
    $$textref =~ s/(\r?\n)(?:\r?\n)+/$1/g;
}
163
164
# Turn one exported ISIS record into metadata and text on a document object.
#
# Parameters:
#   $textref  - reference to the record text; each field is expected on its
#               own line in the form "tag=<tag> data=<value>"
#   $pluginfo - not used by this method
#   $base_dir - not used by this method
#   $file     - file name, used only in progress messages
#   $metadata - not used by this method
#   $doc_obj  - document object receiving the metadata and text
#   $gli      - when true, a progress line in GLI format is written to STDERR
#
# For every field whose tag has a name in $self->{'fdt_mapping'} this adds:
#   <Name>        one value per occurrence (subfields joined with the
#                 subfield separator)
#   <Name>^<x>    one value per subfield with code <x>
#   <Name>^*      the first subfield of each occurrence
#   <Name>^all    all occurrences joined with the entry separator
# It also adds an HTML table of the record as the document text, plus
# "ISISRawRecord" and "FileFormat" metadata.
#
# Returns 1.
sub process
{
    my $self = shift(@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    my $section = $doc_obj->get_top_section();
    my $fdt_mapping = $self->{'fdt_mapping'};
    my $subfield_separator = $self->{'subfield_separator'};
    my $entry_separator = $self->{'entry_separator'};
    my $isis_record_html_metadata_value = "<table cellpadding=\"4\" cellspacing=\"0\">";

    # Report that we're processing the file
    print STDERR "\n<Processing n='$file' p='ISISPlug'>\n" if ($gli);
    print $outhandle "IsisPlug: processing $file\n" if ($self->{'verbosity'} > 1);

    # Process each line of the ISIS record, one at a time
    foreach my $line (split(/\n/, $$textref)) {
        $line =~ s/\s+$//; # Remove any nasty whitespace (very important for Windows)

        # Skip anything that is not a field line. Without this check the
        # capture variables would not describe the current line.
        next unless ($line =~ /^tag=(.*) data=(.+)$/);
        my ($tag, $tag_data) = ($1, $2);

        # Fetch the definition once; this also avoids creating empty
        # entries in the mapping for tags the FDT does not define.
        my $field_definition = $fdt_mapping->{$tag};
        next unless (defined $field_definition);

        # Convert the tag number into a name, and remove any invalid characters
        my $raw_metadata_name = $field_definition->{'name'} || "";
        $raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
        next if ($raw_metadata_name eq "");

        # Metadata field names: title case, then remove spaces
        my $metadata_name = "";
        foreach my $word (split(/\s+/, $raw_metadata_name)) {
            substr($word, 0, 1) =~ tr/a-z/A-Z/;
            $metadata_name .= $word;
        }

        # Repeatable fields hold several occurrences separated by '%';
        # non-repeatable fields are a single occurrence. Values wrapped in
        # <...> or /.../ are only extracted for non-repeatable fields.
        my $repeatable = $field_definition->{'repeatable'};
        my @raw_metadata_values = $repeatable ? split(/%/, $tag_data) : ($tag_data);

        my $all_metadata_value = "";
        foreach my $raw_metadata_value (@raw_metadata_values) {
            my $metadata_value = _add_isis_subfield_metadata($doc_obj, $section, $metadata_name, $raw_metadata_value, $subfield_separator, !$repeatable);

            # Add the metadata value
            $doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);

            $all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
            $all_metadata_value .= $metadata_value;
        }

        # Add the "^all" metadata value
        $doc_obj->add_utf8_metadata($section, $metadata_name . "^all", $all_metadata_value);

        $isis_record_html_metadata_value .= "<tr><td valign=top><nobr><b>" . $field_definition->{'name'} . "</b></nobr></td><td valign=top>" . $all_metadata_value . "</td></tr>";
    }

    # Add a reasonably formatted HTML table view of the record as the document text
    $isis_record_html_metadata_value .= "</table>";
    $doc_obj->add_utf8_text($section, $isis_record_html_metadata_value);

    # Add the full raw record as metadata
    my $isis_raw_record_metadata_value = escape_metadata_value($$textref);
    $doc_obj->add_utf8_metadata($section, "ISISRawRecord", $isis_raw_record_metadata_value);

    # Add FileFormat metadata
    $doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");

    # Record was processed successfully
    return 1;
}


# Private helper for process(): split one field occurrence into subfields,
# add the per-subfield metadata to $doc_obj, and return the occurrence's
# combined value (escaped subfield values joined with $subfield_separator).
#
# Parameters:
#   $doc_obj, $section   - where the metadata is added
#   $metadata_name       - metadata name of the field
#   $raw_metadata_value  - raw occurrence text; subfields start with "^"
#                          followed by an optional lowercase letter code
#   $subfield_separator  - string placed between subfield values
#   $extract_sub_values  - when true, each <...> group (or, failing that,
#                          each /.../ group) inside a subfield value is also
#                          added as "<subfield name>^sub" metadata
sub _add_isis_subfield_metadata
{
    my ($doc_obj, $section, $metadata_name, $raw_metadata_value, $subfield_separator, $extract_sub_values) = @_;
    my $metadata_value = "";

    # Every pass removes at least one character, so the loop terminates.
    while ($raw_metadata_value ne "") {
        # If there is a subfield specifier, parse it off
        my $sub_metadata_name = $metadata_name;
        if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
            $sub_metadata_name .= "^$1";
        }

        # Parse the value off (everything up to the next "^")
        $raw_metadata_value =~ s/^([^\^]*)//;
        my $sub_metadata_value = $1;

        if ($extract_sub_values) {
            my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
            my $tmp_sub_metadata_value = $sub_metadata_value;

            # Deal with the case when multiple values are specified using <...>
            if ($sub_metadata_value =~ /\<(.+)\>/) {
                while ($tmp_sub_metadata_value =~ s/\<(.+?)\>//) {
                    $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $1);
                }
            }
            # Deal with the legacy case when multiple values are specified using /.../
            elsif ($sub_metadata_value =~ /\/(.+)\//) {
                while ($tmp_sub_metadata_value =~ s/\/(.+?)\///) {
                    $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $1);
                }
            }
        }

        # Escape the metadata value so it appears correctly in the final collection
        $sub_metadata_value = escape_metadata_value($sub_metadata_value);

        if ($sub_metadata_name ne $metadata_name) {
            $doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
        }

        # If nothing has been collected yet, this value is used for the
        # CDS/ISIS ^* field
        if ($metadata_value eq "") {
            $doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
        }

        $metadata_value .= $subfield_separator unless ($metadata_value eq "");
        $metadata_value .= $sub_metadata_value;
    }

    return $metadata_value;
}
327
328
# Parse a CDS/ISIS Field Definition Table (.fdt) file.
#
# Parameters:
#   $fdtfilename - path of the .fdt file
#   $encoding    - encoding name handed to multiread::set_encoding
#
# Returns a hash (as a list) keyed by field tag. Each value is a hash
# reference with the keys 'name', 'subfields' and 'repeatable'.
#
# Lines before the "***" marker are ignored. After it, each line is read as
# fixed-width columns: name in columns 0-29, subfields in columns 30-49, and
# a space-separated specification from column 50 on, whose first item is the
# tag and whose fourth item is the repeatable flag.
#
# Dies if the file cannot be opened.
sub parse_field_definition_table
{
    my $fdtfilename = shift(@_);
    my $encoding = shift(@_);

    my %fdtmapping = ();

    # A bareword handle is kept because multiread is given the handle by name.
    open(FDT_FILE, '<', $fdtfilename) || die "Error: Could not open file $fdtfilename.\n";

    # read_file is handed a real reference, exactly as the read_file method of
    # this plugin does with $textref. Passing the plain string and
    # dereferencing it afterwards only worked through a symbolic reference to
    # a single global variable.
    my $fdtfiletext = "";
    my $reader = multiread->new();
    $reader->set_handle('ISISPlug::FDT_FILE');
    $reader->set_encoding($encoding);
    $reader->read_file(\$fdtfiletext);

    close(FDT_FILE);

    my $amongstdefinitions = 0;
    foreach my $fdtfileline (split(/\n/, $fdtfiletext)) {
        $fdtfileline =~ s/\s+$//; # Remove any nasty spaces at the end of the lines

        if (!$amongstdefinitions) {
            $amongstdefinitions = 1 if ($fdtfileline eq "***");
            next;
        }

        # A definition needs at least one character in the specification
        # column; blank or truncated lines carry no tag and are skipped.
        next if (length($fdtfileline) <= 50);

        my $fieldname = substr($fdtfileline, 0, 30);
        my $fieldsubfields = substr($fdtfileline, 30, 20);
        my $fieldspecs = substr($fdtfileline, 50);

        # Remove extra spaces
        $fieldname =~ s/\s+$//;
        $fieldsubfields =~ s/\s+$//;
        $fieldspecs =~ s/\s+$//;

        # Map from tag number to metadata field title, subfields, and repeatability
        my ($fieldtag, $fieldrepeatable) = (split(/ /, $fieldspecs))[0, 3];
        next unless (defined $fieldtag && $fieldtag ne "");

        $fdtmapping{$fieldtag} = { 'name' => $fieldname,
                                   'subfields' => $fieldsubfields,
                                   'repeatable' => $fieldrepeatable };
    }

    return %fdtmapping;
}
374
375
# Escape a metadata value so it appears correctly in the final collection.
#
# Parameter: the raw value.
# Returns a copy in which "<" becomes "&lt;", ">" becomes "&gt;" and every
# backslash is doubled.
#
# The angle brackets must be replaced by their entities; replacing each
# character with itself leaves markup in the value untouched.
sub escape_metadata_value
{
    my $value = shift(@_);
    $value =~ s/</&lt;/g;
    $value =~ s/>/&gt;/g;
    $value =~ s/\\/\\\\/g;
    return $value;
}
384
385
# Remove the companion files of the database: the paths stored in
# $self->{'fdt_file_path'} and $self->{'xrf_file_path'} are each passed to
# util::rm, FDT first.
sub clean_up_after_exploding
{
    my ($self) = @_;

    # Delete the FDT and XRF files too
    util::rm($self->{'fdt_file_path'});
    return util::rm($self->{'xrf_file_path'});
}
394
395
396	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: