Context Navigation

source: gsdl/trunk/perllib/plugins/ISISPlugin.pm@ 15880

Last change on this file since 15880 was 15872, checked in by kjdon, 16 years ago
plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Alos, I haven't really tested anything yet...
Property svn:keywords set to `Author Date Id Revision`
File size: 13.3 KB

Line
1	###########################################################################
2	#
3	# ISISPlugin.pm -- A plugin for CDS/ISIS databases
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 1999-2004 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package ISISPlugin;
28
29
30	use multiread;
31	use SplitTextFile;
32
33	use strict;
34	no strict 'refs'; # allow filehandles to be variables and viceversa
35
36	# ISISPlugin is a sub-class of SplitTextFile.
37	sub BEGIN {
38	@ISISPlugin::ISA = ('SplitTextFile');
39	}
40
41
42	my $arguments =
43	[ { 'name' => "process_exp",
44	'desc' => "{BasePlugin.process_exp}",
45	'type' => "regexp",
46	'reqd' => "no",
47	'deft' => &get_default_process_exp() },
48	{ 'name' => "block_exp",
49	'desc' => "{BasePlugin.block_exp}",
50	'type' => "regexp",
51	'reqd' => "no",
52	'deft' => &get_default_block_exp(),
53	'hiddengli' => "yes" },
54	{ 'name' => "split_exp",
55	'desc' => "{SplitTextFile.split_exp}",
56	'type' => "regexp",
57	'reqd' => "no",
58	'deft' => &get_default_split_exp(),
59	'hiddengli' => "yes" },
60
61	# The interesting options
62	{ 'name' => "entry_separator",
63	'desc' => "{ISISPlugin.entry_separator}",
64	'type' => "string",
65	'reqd' => "no",
66	'deft' => "<br>" },
67	{ 'name' => "subfield_separator",
68	'desc' => "{ISISPlugin.subfield_separator}",
69	'type' => "string",
70	'reqd' => "no",
71	'deft' => ", " }
72	];
73
74	my $options = { 'name' => "ISISPlugin",
75	'desc' => "{ISISPlugin.desc}",
76	'abstract' => "no",
77	'inherits' => "yes",
78	'explodes' => "yes",
79	'args' => $arguments };
80
81
82	# This plugin processes files with the suffix ".mst"
83	sub get_default_process_exp {
84	return q^(?i)(\.mst)$^;
85	}
86
87
88	# This plugin blocks files with the suffix ".fdt" and ".xrf"
89	sub get_default_block_exp {
90	return q^(?i)(\.fdt\|\.xrf)$^;
91	}
92
93
94	# This plugin splits the input text at the "----------" lines
95	sub get_default_split_exp {
96	return q^\r?\n----------\r?\n^;
97	}
98
99
100	sub new
101	{
102	my ($class) = shift (@_);
103	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
104	push(@$pluginlist, $class);
105
106	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
107	push(@{$hashArgOptLists->{"OptList"}},$options);
108
109	my $self = new SplitTextFile($pluginlist, $inputargs, $hashArgOptLists);
110
111	if ($self->{'info_only'}) {
112	# don't worry about any options etc
113	return bless $self, $class;
114	}
115
116	# isis plug doesn't care about encoding - it assumes ascii unless the user
117	# has specified an encoding
118	if ($self->{'input_encoding'} eq "auto") {
119	$self->{'input_encoding'} = "ascii";
120	}
121	return bless $self, $class;
122	}
123
124
125	sub read_file
126	{
127	my $self = shift (@_);
128	my ($filename, $encoding, $language, $textref) = @_;
129	my $outhandle = $self->{'outhandle'};
130
131	my ($database_file_path_root) = ($filename =~ /(.*)\.mst$/i);
132	my $mst_file_path_relative = $filename;
133	$mst_file_path_relative =~ s/^.+import.(.*?)$/$1/;
134
135	# Check the associated .fdt and .xrf files exist
136	$self->{'fdt_file_path'} = $database_file_path_root . ".FDT";
137	if (!-e $self->{'fdt_file_path'}) {
138	$self->{'fdt_file_path'} = $database_file_path_root . ".fdt";
139	}
140	if (!-e $self->{'fdt_file_path'}) {
141	print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS FDT file $self->{'fdt_file_path'}'>\n" if ($self->{'gli'});
142	print $outhandle "Error: Could not find ISIS FDT file " . $self->{'fdt_file_path'} . ".\n";
143	return;
144	}
145	$self->{'xrf_file_path'} = $database_file_path_root . ".XRF";
146	if (!-e $self->{'xrf_file_path'}) {
147	$self->{'xrf_file_path'} = $database_file_path_root . ".xrf";
148	}
149	if (!-e $self->{'xrf_file_path'}) {
150	print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS XRF file $self->{'xrf_file_path'}'>\n" if ($self->{'gli'});
151	print $outhandle "Error: Could not find ISIS XRF file " . $self->{'xrf_file_path'} . ".\n";
152	return;
153	}
154
155	# The text to split is exported from the database by the IsisGdl program
156	open(FILE, "IsisGdl \"$filename\" \|");
157
158	my $reader = new multiread();
159	$reader->set_handle('ISISPlugin::FILE');
160	$reader->set_encoding($encoding);
161	$reader->read_file($textref);
162
163	close(FILE);
164
165	# Parse the associated ISIS database Field Definition Table file (.fdt)
166	my %fdt_mapping = &parse_field_definition_table($self->{'fdt_file_path'}, $encoding);
167	$self->{'fdt_mapping'} = \%fdt_mapping;
168
169	# Remove the line at the start, and any blank lines, so the data is split and processed properly
170	$$textref =~ s/^----------\n//;
171	$$textref =~ s/\n\n/\n/g;
172	}
173
174
175	sub process
176	{
177	my $self = shift (@_);
178	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
179	my $outhandle = $self->{'outhandle'};
180
181	my $section = $doc_obj->get_top_section();
182	my $fdt_mapping = $self->{'fdt_mapping'};
183	my $subfield_separator = $self->{'subfield_separator'};
184	my $entry_separator = $self->{'entry_separator'};
185	my $isis_record_html_metadata_value = "<table cellpadding=\"4\" cellspacing=\"0\">";
186
187	# Report that we're processing the file
188	print STDERR "\n<Processing n='$file' p='ISISPlugin'>\n" if ($gli);
189	print $outhandle "IsisPlug: processing $file\n" if ($self->{'verbosity'}) > 1;
190
191	# Process each line of the ISIS record, one at a time
192	foreach my $line (split(/\n/, $$textref)) {
193	$line =~ s/(\s*)$//; # Remove any nasty whitespace (very important for Windows)
194	$line =~ /^tag=(.*) data=(.+)$/;
195	my $tag = $1;
196	my $tag_data = $2;
197	# print STDERR "\nTag: $tag, Data: $tag_data\n";
198
199	# Convert the tag number into a name, and remove any invalid characters
200	my $raw_metadata_name = $fdt_mapping->{$tag}{'name'} \|\| "";
201	$raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
202	next if ($raw_metadata_name eq "");
203
204	# Metadata field names: title case, then remove spaces
205	my $metadata_name = "";
206	foreach my $word (split(/\s+/, $raw_metadata_name)) {
207	substr($word, 0, 1) =~ tr/a-z/A-Z/;
208	$metadata_name .= $word;
209	}
210
211	my $all_metadata_name = $metadata_name . "^all";
212	my $all_metadata_value = "";
213
214	# Handle repeatable fields
215	if ($fdt_mapping->{$tag}{'repeatable'}) {
216	# Multiple values are separated using the '%' character
217	foreach my $raw_metadata_value (split(/%/, $tag_data)) {
218	my $metadata_value = "";
219
220	# Handle subfields
221	while ($raw_metadata_value ne "") {
222	# If there is a subfield specifier, parse it off
223	my $sub_metadata_name = $metadata_name;
224	if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
225	$sub_metadata_name .= "^$1";
226	}
227
228	# Parse the value off and add it as metadata
229	$raw_metadata_value =~ s/^([^\^]*)//;
230	my $sub_metadata_value = &escape_metadata_value($1);
231
232	# print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
233	if ($sub_metadata_name ne $metadata_name) {
234	$doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
235	}
236
237	# If this tag has subfields and this is the first, use the value for the CDS/ISIS ^* field
238	if ($fdt_mapping->{$tag}{'subfields'} ne "" && $metadata_value eq "") {
239	$doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
240	}
241
242	$metadata_value .= $subfield_separator unless ($metadata_value eq "");
243	$metadata_value .= $sub_metadata_value;
244	}
245
246	# Add the metadata value
247	# print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
248	$doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);
249
250	$all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
251	$all_metadata_value .= $metadata_value;
252	}
253	}
254
255	# Handle non-repeatable fields
256	else {
257	my $raw_metadata_value = $tag_data;
258	my $metadata_value = "";
259
260	# Handle subfields
261	while ($raw_metadata_value ne "") {
262	# If there is a subfield specifier, parse it off
263	my $sub_metadata_name = $metadata_name;
264	if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
265	$sub_metadata_name .= "^$1";
266	}
267
268	# Parse the value off and add it as metadata
269	$raw_metadata_value =~ s/^([^\^]*)//;
270	my $sub_metadata_value = $1;
271
272	# Deal with the case when multiple values are specified using <...>
273	if ($sub_metadata_value =~ /\<(.+)\>/) {
274	my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
275	my $tmp_sub_metadata_value = $sub_metadata_value;
276	while ($tmp_sub_metadata_value =~ s/\<(.+?)\>//) {
277	my $sub_sub_metadata_value = $1;
278	$doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
279	}
280	}
281	# Deal with the legacy case when multiple values are specified using /.../
282	elsif ($sub_metadata_value =~ /\/(.+)\//) {
283	my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
284	my $tmp_sub_metadata_value = $sub_metadata_value;
285	while ($tmp_sub_metadata_value =~ s/\/(.+?)\///) {
286	my $sub_sub_metadata_value = $1;
287	$doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
288	}
289	}
290
291	# Escape the metadata value so it appears correctly in the final collection
292	$sub_metadata_value = &escape_metadata_value($sub_metadata_value);
293
294	# print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
295	if ($sub_metadata_name ne $metadata_name) {
296	$doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
297	}
298
299	# If this tag has subfields and this is the first, use the value for the CDS/ISIS ^* field
300	if ($fdt_mapping->{$tag}{'subfields'} ne "" && $metadata_value eq "") {
301	$doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
302	}
303
304	$metadata_value .= $subfield_separator unless ($metadata_value eq "");
305	$metadata_value .= $sub_metadata_value;
306	}
307
308	# Add the metadata value
309	# print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
310	$doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);
311
312	$all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
313	$all_metadata_value .= $metadata_value;
314	}
315
316	# Add the "^all" metadata value
317	# print STDERR "All metadata name: $all_metadata_name, value: $all_metadata_value\n";
318	$doc_obj->add_utf8_metadata($section, $all_metadata_name, $all_metadata_value);
319
320	$isis_record_html_metadata_value .= "<tr><td valign=top><nobr><b>" . $fdt_mapping->{$tag}{'name'} . "</b></nobr></td><td valign=top>" . $all_metadata_value . "</td></tr>";
321	}
322
323	# Add a reasonably formatted HTML table view of the record as the document text
324	$isis_record_html_metadata_value .= "</table>";
325	$doc_obj->add_utf8_text($section, $isis_record_html_metadata_value);
326
327	# Add the full raw record as metadata
328	my $isis_raw_record_metadata_value = &escape_metadata_value($$textref);
329	$doc_obj->add_utf8_metadata($section, "ISISRawRecord", $isis_raw_record_metadata_value);
330
331	# Add FileFormat metadata
332	$doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");
333
334	# Record was processed successfully
335	return 1;
336	}
337
338
339	sub parse_field_definition_table
340	{
341	my $fdtfilename = shift(@_);
342	my $encoding = shift(@_);
343
344	my %fdtmapping = ();
345
346	open(FDT_FILE, "<$fdtfilename") \|\| die "Error: Could not open file $fdtfilename.\n";
347
348	my $fdtfiletext = "";
349	my $reader = new multiread();
350	$reader->set_handle('ISISPlugin::FDT_FILE');
351	$reader->set_encoding($encoding);
352	$reader->read_file($fdtfiletext);
353
354	my $amongstdefinitions = 0;
355	foreach my $fdtfileline (split(/\n/, $$fdtfiletext)) {
356	$fdtfileline =~ s/(\s*)$//; # Remove any nasty spaces at the end of the lines
357
358	if ($amongstdefinitions) {
359	my $fieldname = &unicode::substr($fdtfileline, 0, 30);
360	my $fieldsubfields = &unicode::substr($fdtfileline, 30, 20);
361	my $fieldspecs = &unicode::substr($fdtfileline, 50, 50);
362
363	# Remove extra spaces
364	$fieldname =~ s/(\s*)$//;
365	$fieldsubfields =~ s/(\s*)$//;
366	$fieldspecs =~ s/(\s*)$//;
367
368	# Map from tag number to metadata field title, subfields, and repeatability
369	my $fieldtag = (split(/ /, $fieldspecs))[0];
370	my $fieldrepeatable = (split(/ /, $fieldspecs))[3];
371	$fdtmapping{$fieldtag} = { 'name' => $fieldname,
372	'subfields' => $fieldsubfields,
373	'repeatable' => $fieldrepeatable };
374	}
375	elsif ($fdtfileline eq "***") {
376	$amongstdefinitions = 1;
377	}
378	}
379
380	close(FDT_FILE);
381
382	return %fdtmapping;
383	}
384
385
386	sub escape_metadata_value
387	{
388	my $value = shift(@_);
389	$value =~ s/\</</g;
390	$value =~ s/\>/>/g;
391	$value =~ s/\\/\\\\/g;
392	return $value;
393	}
394
395
396	sub clean_up_after_exploding
397	{
398	my $self = shift(@_);
399
400	# Delete the FDT and XRF files too
401	&util::rm($self->{'fdt_file_path'});
402	&util::rm($self->{'xrf_file_path'});
403	}
404
405
406	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: