Context Navigation

source: trunk/gsdl/perllib/plugins/ISISPlug.pm@ 12969

Last change on this file since 12969 was 12833, checked in by kjdon, 18 years ago
we don't want textcat to be run for isis files. we assume ascii encoding unless the user has specified an encoding
Property svn:keywords set to `Author Date Id Revision`
File size: 13.0 KB

Line
1	###########################################################################
2	#
3	# ISISPlug.pm -- A plugin for CDS/ISIS databases
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 1999-2004 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package ISISPlug;
28
29
30	use multiread;
31	use SplitPlug;
32
33	use strict;
34	no strict 'refs'; # allow filehandles to be variables and viceversa
35
# Inheritance is set up at compile time: ISISPlug derives from SplitPlug.
BEGIN {
    @ISISPlug::ISA = qw(SplitPlug);
}
40
41
# Argument descriptors for this plugin.  Each entry is a hash with the keys
# name / desc / type / reqd / deft (plus an optional hiddengli flag).  The
# "desc" values are "{Class.key}" placeholders (presumably looked up in a
# string resource elsewhere -- confirm against the argument-parsing code).
my $arguments = [
    # Expressions whose defaults come from the get_default_*_exp subs below
    {
        'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name'      => 'block_exp',
        'desc'      => '{BasPlug.block_exp}',
        'type'      => 'regexp',
        'reqd'      => 'no',
        'deft'      => get_default_block_exp(),
        'hiddengli' => 'yes',
    },
    {
        'name'      => 'split_exp',
        'desc'      => '{SplitPlug.split_exp}',
        'type'      => 'regexp',
        'reqd'      => 'no',
        'deft'      => get_default_split_exp(),
        'hiddengli' => 'yes',
    },

    # ISIS-specific options: the strings used to join repeated entries and
    # subfields when metadata values are assembled in process()
    {
        'name' => 'entry_separator',
        'desc' => '{ISISPlug.entry_separator}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => '<br>',
    },
    {
        'name' => 'subfield_separator',
        'desc' => '{ISISPlug.subfield_separator}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => ', ',
    },
];
73
# Plugin descriptor; new() appends it to the caller's "OptList".
# It ties the plugin name to the argument list declared above.
my $options = {
    'name'     => 'ISISPlug',
    'desc'     => '{ISISPlug.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
80
81
# Default process expression: matches filenames ending in ".mst"
# (case-insensitively).
sub get_default_process_exp
{
    my $mst_suffix_pattern = '(?i)(\.mst)$';
    return $mst_suffix_pattern;
}
86
87
# Default block expression: matches filenames ending in ".fdt" or ".xrf"
# (case-insensitively).  These are the companion files of an ISIS ".mst"
# database; read_file() locates them itself.
#
# The two suffixes must be joined with an unescaped "|" (alternation).  An
# escaped "\|" would only match a literal pipe character, so neither suffix
# would ever be blocked.
sub get_default_block_exp {
    return q^(?i)(\.fdt|\.xrf)$^;
}
92
93
# Default split expression: a line consisting of ten dashes, with either
# Unix or Windows line endings on both sides.
sub get_default_split_exp
{
    my $record_separator_pattern = '\r?\n----------\r?\n';
    return $record_separator_pattern;
}
98
99
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to the SplitPlug constructor
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to
#                      its "ArgList" entry and its option descriptor to its
#                      "OptList" entry
#
# Returns the object built by SplitPlug, re-blessed into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    if (defined $arguments) { push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}); }
    if (defined $options)   { push(@{$hashArgOptLists->{"OptList"}}, $options); }

    # Explicit Class->method call: the indirect-object form "new SplitPlug(...)"
    # makes the parser guess what is meant, which is fragile inside a package
    # that defines its own new().
    my $self = SplitPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    # isis plug doesn't care about encoding - it assumes ascii unless the user
    # has specified an encoding.  The defined() check avoids comparing undef
    # if the parent never set the field.
    if (defined $self->{'input_encoding'} && $self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'} = "ascii";
    }

    return bless $self, $class;
}
118
119
# Reads an ISIS master file (.mst) into $$textref.
#
# Arguments:
#   $filename - path of the .mst file
#   $encoding - encoding handed to the multiread reader (for both the
#               exported text and the .fdt file)
#   $language - unused here
#   $textref  - scalar ref that receives the exported text
#
# Side effects: stores 'fdt_file_path', 'xrf_file_path' and 'fdt_mapping' on
# $self.  On a missing companion file, or if the export command cannot be
# started, an error is reported and the sub returns early with $$textref
# left untouched.  Dies (via parse_field_definition_table) if the .fdt file
# cannot be opened.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;
    my $outhandle = $self->{'outhandle'};

    my ($database_file_path_root) = ($filename =~ /(.*)\.mst$/i);
    my $mst_file_path_relative = $filename;
    $mst_file_path_relative =~ s/^.+import.(.*?)$/$1/;

    # Check the associated .fdt and .xrf files exist.  The upper-case suffix
    # is tried first, then the lower-case one.
    foreach my $companion_type ("FDT", "XRF") {
        my $path_key = lc($companion_type) . "_file_path";

        $self->{$path_key} = $database_file_path_root . "." . $companion_type;
        if (!-e $self->{$path_key}) {
            $self->{$path_key} = $database_file_path_root . "." . lc($companion_type);
        }
        if (!-e $self->{$path_key}) {
            print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS $companion_type file $self->{$path_key}'>\n" if ($self->{'gli'});
            print $outhandle "Error: Could not find ISIS $companion_type file " . $self->{$path_key} . ".\n";
            return;
        }
    }

    # The text to split is exported from the database by the IsisGdl program.
    # NOTE(review): the filename is interpolated into a shell command line; a
    # name containing a double quote or shell metacharacters would break it.
    # The bareword handle is kept because the reader is given it by name.
    if (!open(FILE, "IsisGdl \"$filename\" |")) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not run IsisGdl on $filename'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not run IsisGdl on $filename: $!\n";
        return;
    }

    my $reader = multiread->new();
    $reader->set_handle('ISISPlug::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);

    close(FILE);

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my %fdt_mapping = &parse_field_definition_table($self->{'fdt_file_path'}, $encoding);
    $self->{'fdt_mapping'} = \%fdt_mapping;

    # Remove the separator line at the start, and every run of blank lines,
    # so the data is split and processed properly.  Both Unix and Windows
    # line endings are accepted, matching the default split expression.  A run
    # of any length collapses to its first line ending.
    $$textref =~ s/^----------\r?\n//;
    $$textref =~ s/(\r?\n)(?:\r?\n)+/$1/g;
}
168
169
# Turns one exported ISIS record into metadata and text on $doc_obj.
#
# Arguments:
#   $textref  - scalar ref holding the record, one "tag=N data=..." line per
#               field occurrence
#   $pluginfo, $base_dir, $metadata - unused here
#   $file     - name used only in progress messages
#   $doc_obj  - document object; must provide get_top_section(),
#               add_utf8_metadata() and add_utf8_text()
#   $gli      - when true, a <Processing> marker is written to STDERR
#
# For every field line whose tag has a non-empty name in
# $self->{'fdt_mapping'} the following is added to the top section:
#   Name^x    - one value per "^x" subfield
#   Name^x^sub- values found inside <...> or /.../ (non-repeatable fields only)
#   Name^*    - the first subfield value
#   Name      - the subfields joined with the subfield separator
#   Name^all  - all occurrences joined with the entry separator
# An HTML table of the fields becomes the document text, and the raw record
# is stored as "ISISRawRecord".
#
# Always returns 1.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    my $section = $doc_obj->get_top_section();
    my $fdt_mapping = $self->{'fdt_mapping'};
    my $subfield_separator = $self->{'subfield_separator'};
    my $entry_separator = $self->{'entry_separator'};
    my $isis_record_html_metadata_value = "<table cellpadding=\"4\" cellspacing=\"0\">";

    # Report that we're processing the file
    print STDERR "\n<Processing n='$file' p='ISISPlug'>\n" if ($gli);
    print $outhandle "IsisPlug: processing $file\n" if ($self->{'verbosity'} > 1);

    # Process each line of the ISIS record, one at a time
    foreach my $line (split(/\n/, $$textref)) {
        $line =~ s/\s+$//;  # Remove any nasty whitespace (very important for Windows)

        # Skip anything that is not a field line.  Without this check $1/$2
        # would still hold captures from an earlier, unrelated match.
        next unless ($line =~ /^tag=(.*) data=(.+)$/);
        my ($tag, $tag_data) = ($1, $2);

        # Convert the tag number into a name, and remove any invalid characters
        my $raw_metadata_name = $fdt_mapping->{$tag}{'name'} || "";
        $raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
        next if ($raw_metadata_name eq "");

        # Metadata field names: title case, then remove spaces
        my $metadata_name = "";
        foreach my $word (split(/\s+/, $raw_metadata_name)) {
            substr($word, 0, 1) =~ tr/a-z/A-Z/;
            $metadata_name .= $word;
        }

        # Repeatable fields hold several occurrences separated by '%';
        # non-repeatable fields hold exactly one, and only those are scanned
        # for <...> and /.../ multiple values.
        my $is_repeatable = $fdt_mapping->{$tag}{'repeatable'};
        my @raw_metadata_values = $is_repeatable ? split(/%/, $tag_data) : ($tag_data);

        my $all_metadata_value = "";
        foreach my $raw_metadata_value (@raw_metadata_values) {
            my $metadata_value = &_isis_add_subfield_metadata($doc_obj, $section, $metadata_name, $raw_metadata_value, $subfield_separator, !$is_repeatable);

            # Add the metadata value
            $doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);

            $all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
            $all_metadata_value .= $metadata_value;
        }

        # Add the "^all" metadata value
        $doc_obj->add_utf8_metadata($section, $metadata_name . "^all", $all_metadata_value);

        $isis_record_html_metadata_value .= "<tr><td valign=top><nobr><b>" . $fdt_mapping->{$tag}{'name'} . "</b></nobr></td><td valign=top>" . $all_metadata_value . "</td></tr>";
    }

    # Add a reasonably formatted HTML table view of the record as the document text
    $isis_record_html_metadata_value .= "</table>";
    $doc_obj->add_utf8_text($section, $isis_record_html_metadata_value);

    # Add the full raw record as metadata
    my $isis_raw_record_metadata_value = &escape_metadata_value($$textref);
    $doc_obj->add_utf8_metadata($section, "ISISRawRecord", $isis_raw_record_metadata_value);

    # Add FileFormat metadata
    $doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");

    # Record was processed successfully
    return 1;
}


# Private helper for process(): splits one field occurrence into its "^x"
# subfields, adds the per-subfield metadata to $doc_obj, and returns the
# escaped subfield values joined with $subfield_separator.
#
# When $expand_multiple_values is true, each subfield is also scanned for
# <...> groups (or, failing that, legacy /.../ groups) and every group is
# added, unescaped, as "<subfield name>^sub" metadata.
sub _isis_add_subfield_metadata
{
    my ($doc_obj, $section, $metadata_name, $raw_metadata_value, $subfield_separator, $expand_multiple_values) = @_;

    my $metadata_value = "";
    while ($raw_metadata_value ne "") {
        # If there is a subfield specifier ("^" plus a lower-case letter), parse it off
        my $sub_metadata_name = $metadata_name;
        if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
            $sub_metadata_name .= "^$1";
        }

        # Parse the value off: everything up to the next "^".  This pattern
        # can match the empty string, so it always succeeds and $1 is fresh.
        $raw_metadata_value =~ s/^([^\^]*)//;
        my $sub_metadata_value = $1;

        if ($expand_multiple_values) {
            # <...> takes precedence over the legacy /.../ notation
            my $multiple_value_pattern;
            if ($sub_metadata_value =~ /<.+>/) {
                $multiple_value_pattern = qr/<(.+?)>/;
            }
            elsif ($sub_metadata_value =~ m{/.+/}) {
                $multiple_value_pattern = qr{/(.+?)/};
            }

            if (defined $multiple_value_pattern) {
                my $remaining_value = $sub_metadata_value;
                while ($remaining_value =~ s/$multiple_value_pattern//) {
                    $doc_obj->add_utf8_metadata($section, $sub_metadata_name . "^sub", $1);
                }
            }
        }

        # Escape the metadata value so it appears correctly in the final collection
        $sub_metadata_value = &escape_metadata_value($sub_metadata_value);

        if ($sub_metadata_name ne $metadata_name) {
            $doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
        }

        # While nothing has been accumulated yet, the value is used for the
        # CDS/ISIS ^* field (normally this is the first subfield)
        if ($metadata_value eq "") {
            $doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
        }

        $metadata_value .= $subfield_separator unless ($metadata_value eq "");
        $metadata_value .= $sub_metadata_value;
    }

    return $metadata_value;
}
332
333
# Parses an ISIS Field Definition Table (.fdt) file.
#
# Arguments:
#   $fdtfilename - path of the .fdt file
#   $encoding    - encoding handed to the multiread reader
#
# Returns a hash (as a flat list) keyed by field tag; each value is a hash
# ref with the keys 'name', 'subfields' and 'repeatable'.
#
# Everything up to and including the "***" line is ignored.  After it, each
# line is read as fixed-width columns: 30 characters of name, 20 characters
# of subfield letters, then a space-separated spec whose first item is the
# tag and whose fourth item is the repeatable flag.
#
# Dies if the file cannot be opened.
sub parse_field_definition_table
{
    my $fdtfilename = shift(@_);
    my $encoding = shift(@_);

    my %fdtmapping = ();

    # Three-argument open, so the filename can never be read as a mode.  The
    # bareword handle is kept because the reader is given it by name.
    open(FDT_FILE, "<", $fdtfilename) || die "Error: Could not open file $fdtfilename.\n";

    # The reader is given a reference to the buffer, the same way read_file()
    # calls it.  Passing the plain string and dereferencing it afterwards only
    # works through a symbolic reference to a shared global variable.
    my $fdtfiletext = "";
    my $reader = multiread->new();
    $reader->set_handle('ISISPlug::FDT_FILE');
    $reader->set_encoding($encoding);
    $reader->read_file(\$fdtfiletext);

    close(FDT_FILE);

    my $amongstdefinitions = 0;
    foreach my $fdtfileline (split(/\n/, $fdtfiletext)) {
        $fdtfileline =~ s/\s+$//;  # Remove any nasty spaces at the end of the lines

        # Skip the header, up to and including the "***" marker line
        if (!$amongstdefinitions) {
            $amongstdefinitions = 1 if ($fdtfileline eq "***");
            next;
        }

        # The spec starts at column 50, so a line no longer than that cannot
        # carry a tag (and substr() would run off the end of the string)
        next if (length($fdtfileline) <= 50);

        my $fieldname = substr($fdtfileline, 0, 30);
        my $fieldsubfields = substr($fdtfileline, 30, 20);
        my $fieldspecs = substr($fdtfileline, 50);

        # Remove extra spaces
        $fieldname =~ s/\s+$//;
        $fieldsubfields =~ s/\s+$//;
        $fieldspecs =~ s/\s+$//;

        # Map from tag number to metadata field title, subfields, and repeatability
        my ($fieldtag, $fieldrepeatable) = (split(/ /, $fieldspecs))[0, 3];
        next unless (defined $fieldtag && $fieldtag ne "");

        $fdtmapping{$fieldtag} = { 'name' => $fieldname,
                                   'subfields' => $fieldsubfields,
                                   'repeatable' => $fieldrepeatable };
    }

    return %fdtmapping;
}
379
380
# Returns a copy of the given value that is safe to embed in the HTML and
# metadata built by process(): "<" becomes "&lt;", ">" becomes "&gt;", and
# every backslash is doubled.  The argument itself is not modified.
#
# "&" is deliberately left alone, so entities already present in the data
# pass through unchanged.
sub escape_metadata_value
{
    my $value = shift(@_);
    $value =~ s/</&lt;/g;
    $value =~ s/>/&gt;/g;
    $value =~ s/\\/\\\\/g;
    return $value;
}
389
390
# Removes the companion files recorded by read_file(): the paths stored under
# 'fdt_file_path' and 'xrf_file_path' are each handed to util::rm, FDT first.
sub clean_up_after_exploding
{
    my ($self) = @_;

    # Delete the FDT and XRF files too
    &util::rm($self->{'fdt_file_path'});
    &util::rm($self->{'xrf_file_path'});
}
399
400
401	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: