Context Navigation

← Previous Revision
Latest Revision
Next Revision →
Blame
Revision Log

source: gsdl/trunk/perllib/plugins/ISISPlugin.pm@ 15865

Last change on this file since 15865 was 15865, checked in by kjdon, 16 years ago
renaming plugins in preparation for my plugin overhaul
Property svn:keywords set to `Author Date Id Revision`
File size: 13.3 KB

Line
1	###########################################################################
2	#
3	# ISISPlug.pm -- A plugin for CDS/ISIS databases
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 1999-2004 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package ISISPlug;
28
29
30	use multiread;
31	use SplitPlug;
32
33	use strict;
34	no strict 'refs'; # allow filehandles to be variables and viceversa
35
# Inheritance: ISISPlug derives from SplitPlug. The parent list is assigned
# inside a BEGIN block, i.e. while the file is still being compiled.
BEGIN {
    @ISISPlug::ISA = qw(SplitPlug);
}
40
41
# Argument table for this plugin. Each entry describes one option: its name,
# the key of its description string, its type, whether it is required, and
# its default value. The first three entries re-declare existing options
# with ISIS-specific defaults; the last two are specific to this plugin.
my $arguments = [
    {
        name => "process_exp",
        desc => "{BasPlug.process_exp}",
        type => "regexp",
        reqd => "no",
        deft => get_default_process_exp(),
    },
    {
        name      => "block_exp",
        desc      => "{BasPlug.block_exp}",
        type      => "regexp",
        reqd      => "no",
        deft      => get_default_block_exp(),
        hiddengli => "yes",
    },
    {
        name      => "split_exp",
        desc      => "{SplitPlug.split_exp}",
        type      => "regexp",
        reqd      => "no",
        deft      => get_default_split_exp(),
        hiddengli => "yes",
    },

    # The interesting options
    {
        name => "entry_separator",
        desc => "{ISISPlug.entry_separator}",
        type => "string",
        reqd => "no",
        deft => "<br>",
    },
    {
        name => "subfield_separator",
        desc => "{ISISPlug.subfield_separator}",
        type => "string",
        reqd => "no",
        deft => ", ",
    },
];

# Plugin description record; 'args' points at the argument table above.
my $options = {
    name     => "ISISPlug",
    desc     => "{ISISPlug.desc}",
    abstract => "no",
    inherits => "yes",
    explodes => "yes",
    args     => $arguments,
};
80
81
# Default process_exp: a case-insensitive pattern matching names that end
# in ".mst".
sub get_default_process_exp
{
    my $mst_suffix_pattern = '(?i)(\.mst)$';
    return $mst_suffix_pattern;
}
86
87
# Default block_exp: a case-insensitive pattern matching names that end in
# ".fdt" or ".xrf".
#
# The alternation bar must be unescaped: an escaped bar is a literal "|"
# character in a regular expression, which would make the pattern match
# only the literal text ".fdt|.xrf" and leave both file types unblocked.
sub get_default_block_exp
{
    return q^(?i)(\.fdt|\.xrf)$^;
}
92
93
# Default split_exp: a line consisting of exactly ten dashes, with either
# LF or CRLF line endings on both sides.
sub get_default_split_exp
{
    my $record_separator_pattern = '\r?\n----------\r?\n';
    return $record_separator_pattern;
}
98
99
# Constructor.
#
# Parameters:
#   $class           - name of the class being instantiated
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to the SplitPlug constructor
#   $hashArgOptLists - hash ref; this plugin's argument table is appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object built by the SplitPlug constructor, re-blessed into
# $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments}) if (defined $arguments);
    push(@{$hashArgOptLists->{"OptList"}}, $options) if (defined $options);

    # Explicit class-method call: the indirect-object form ("new SplitPlug")
    # is ambiguous inside a package that defines its own new().
    my $self = SplitPlug->new($pluginlist, $inputargs, $hashArgOptLists);

    # When only plugin information is wanted, the options are left untouched.
    unless ($self->{'info_only'}) {
	# isis plug doesn't care about encoding - it assumes ascii unless the
	# user has specified an encoding
	if (defined $self->{'input_encoding'} && $self->{'input_encoding'} eq "auto") {
	    $self->{'input_encoding'} = "ascii";
	}
    }

    return bless $self, $class;
}
123
124
# Export the records of a CDS/ISIS master file as text and load the matching
# field definition table.
#
# Parameters:
#   $filename - path of the .mst file
#   $encoding - encoding handed to the multiread reader and to the FDT parser
#   $language - unused here
#   $textref  - scalar ref that receives the exported text
#
# Side effects on $self:
#   fdt_file_path, xrf_file_path - paths of the companion files (the
#                                  upper-case extension is tried first)
#   fdt_mapping                  - hash ref produced by
#                                  parse_field_definition_table()
#
# If a companion file is missing, or the IsisGdl export cannot be started,
# an error is written to the output handle (and, when the 'gli' flag is set,
# a <ProcessingError> element to STDERR) and the sub returns early without
# touching $$textref.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;
    my $outhandle = $self->{'outhandle'};

    my ($database_file_path_root) = ($filename =~ /(.*)\.mst$/i);
    my $mst_file_path_relative = $filename;
    $mst_file_path_relative =~ s/^.+import.(.*?)$/$1/;

    # Check the associated .fdt and .xrf files exist
    foreach my $extension ('fdt', 'xrf') {
	my $path_key = $extension . '_file_path';
	my $label = uc($extension);

	$self->{$path_key} = $database_file_path_root . "." . $label;
	if (!-e $self->{$path_key}) {
	    $self->{$path_key} = $database_file_path_root . "." . $extension;
	}
	if (!-e $self->{$path_key}) {
	    my $missing_path = $self->{$path_key};
	    print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS $label file $missing_path'>\n" if ($self->{'gli'});
	    print $outhandle "Error: Could not find ISIS $label file " . $missing_path . ".\n";
	    return;
	}
    }

    # The text to split is exported from the database by the IsisGdl program.
    # The handle stays a package filehandle because the reader is given its
    # name as a string.
    if (!open(FILE, "IsisGdl \"$filename\" |")) {
	print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not run IsisGdl on $filename'>\n" if ($self->{'gli'});
	print $outhandle "Error: Could not run IsisGdl on $filename: $!\n";
	return;
    }

    my $reader = multiread->new();
    $reader->set_handle('ISISPlug::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);

    # Closing a piped handle reports the exit status of the command, which
    # is the only place a missing or failing IsisGdl becomes visible.
    if (!close(FILE)) {
	print $outhandle "Warning: IsisGdl did not exit cleanly while exporting $filename (status $?).\n";
    }

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my %fdt_mapping = parse_field_definition_table($self->{'fdt_file_path'}, $encoding);
    $self->{'fdt_mapping'} = \%fdt_mapping;

    # Remove the separator line at the start (LF or CRLF), and collapse every
    # run of newlines to a single one, so the data is split and processed
    # properly
    $$textref =~ s/^----------\r?\n//;
    $$textref =~ s/\n{2,}/\n/g;
}
173
174
# Convert one exported CDS/ISIS record into metadata and text on $doc_obj.
#
# Parameters:
#   $textref  - scalar ref holding the record, one "tag=N data=..." line per
#               field
#   $pluginfo, $base_dir, $metadata - unused here
#   $file     - file name, used only in progress messages
#   $doc_obj  - document object; must provide get_top_section(),
#               add_utf8_metadata() and add_utf8_text()
#   $gli      - when true, a <Processing> element is written to STDERR
#
# For every line whose tag has a non-empty name in $self->{'fdt_mapping'},
# the following metadata is added to the top section (Name is the field name
# with punctuation removed and each word capitalised):
#   Name        - one value per occurrence, subfields joined with the
#                 'subfield_separator' option
#   Name^x      - value of subfield x
#   Name^*      - first value of a field that declares subfields
#   Name^x^sub  - individual <...> or /.../ delimited items (non-repeatable
#                 fields only)
#   Name^all    - all occurrences joined with the 'entry_separator' option
# An HTML table of the fields becomes the document text, and ISISRawRecord
# and FileFormat metadata are added.
#
# Lines that do not have the "tag=N data=..." shape, and tags missing from
# the mapping, are skipped.
#
# Always returns 1.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    my $section = $doc_obj->get_top_section();
    my $fdt_mapping = $self->{'fdt_mapping'};
    my $subfield_separator = $self->{'subfield_separator'};
    my $entry_separator = $self->{'entry_separator'};
    my $isis_record_html_metadata_value = "<table cellpadding=\"4\" cellspacing=\"0\">";

    # Report that we're processing the file
    print STDERR "\n<Processing n='$file' p='ISISPlug'>\n" if ($gli);
    print $outhandle "IsisPlug: processing $file\n" if ($self->{'verbosity'} > 1);

    # Process each line of the ISIS record, one at a time
    foreach my $line (split(/\n/, $$textref)) {
	$line =~ s/(\s*)$//;  # Remove any nasty whitespace (very important for Windows)

	# Only trust the capture variables when the match succeeded; after a
	# failed match they still hold values from an earlier match.
	next unless ($line =~ /^tag=(.*) data=(.+)$/);
	my ($tag, $tag_data) = ($1, $2);

	# Skip unknown tags without creating an entry in the shared mapping
	next unless (exists $fdt_mapping->{$tag});
	my $field = $fdt_mapping->{$tag};

	# Convert the tag number into a name, and remove any invalid characters
	my $raw_metadata_name = $field->{'name'} || "";
	$raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
	next if ($raw_metadata_name eq "");

	# Metadata field names: title case, then remove spaces
	my $metadata_name = "";
	foreach my $word (split(/\s+/, $raw_metadata_name)) {
	    substr($word, 0, 1) =~ tr/a-z/A-Z/;
	    $metadata_name .= $word;
	}

	my $has_subfields = (defined $field->{'subfields'} && $field->{'subfields'} ne "");

	# Repeatable fields hold several occurrences separated by '%'; only
	# non-repeatable fields get the <...> and /.../ multi-value expansion.
	my $is_repeatable = $field->{'repeatable'} ? 1 : 0;
	my @raw_metadata_values = $is_repeatable ? split(/%/, $tag_data) : ($tag_data);

	my $all_metadata_name = $metadata_name . "^all";
	my $all_metadata_value = "";
	foreach my $raw_metadata_value (@raw_metadata_values) {
	    my $metadata_value = _isis_add_field_value($doc_obj, $section, $metadata_name,
						       $raw_metadata_value, $has_subfields,
						       $subfield_separator, !$is_repeatable);

	    # Add the metadata value
	    $doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);

	    $all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
	    $all_metadata_value .= $metadata_value;
	}

	# Add the "^all" metadata value
	$doc_obj->add_utf8_metadata($section, $all_metadata_name, $all_metadata_value);

	$isis_record_html_metadata_value .= "<tr><td valign=top><nobr><b>" . $field->{'name'} . "</b></nobr></td><td valign=top>" . $all_metadata_value . "</td></tr>";
    }

    # Add a reasonably formatted HTML table view of the record as the document text
    $isis_record_html_metadata_value .= "</table>";
    $doc_obj->add_utf8_text($section, $isis_record_html_metadata_value);

    # Add the full raw record as metadata
    my $isis_raw_record_metadata_value = escape_metadata_value($$textref);
    $doc_obj->add_utf8_metadata($section, "ISISRawRecord", $isis_raw_record_metadata_value);

    # Add FileFormat metadata
    $doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");

    # Record was processed successfully
    return 1;
}


# Helper for process(): parse one occurrence of a field into its subfields,
# add the per-subfield metadata, and return the combined value.
#
# Parameters:
#   $doc_obj, $section       - where the metadata is added
#   $metadata_name           - metadata name of the field
#   $raw_metadata_value      - text of this occurrence; subfields are
#                              introduced by '^' followed by a letter a-z
#   $has_subfields           - true when the field declares subfields, in
#                              which case the first value is also stored
#                              under "$metadata_name^*"
#   $subfield_separator      - string placed between subfield values
#   $expand_multiple_values  - true to store <...> (or, failing that,
#                              /.../) delimited items under "...^sub"
#
# Returns the escaped subfield values joined with $subfield_separator.
sub _isis_add_field_value
{
    my ($doc_obj, $section, $metadata_name, $raw_metadata_value,
	$has_subfields, $subfield_separator, $expand_multiple_values) = @_;

    my $metadata_value = "";
    while ($raw_metadata_value ne "") {
	# If there is a subfield specifier, parse it off
	my $sub_metadata_name = $metadata_name;
	if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
	    $sub_metadata_name .= "^$1";
	}

	# Parse the value off (everything up to the next '^')
	my $sub_metadata_value = "";
	if ($raw_metadata_value =~ s/^([^\^]*)//) {
	    $sub_metadata_value = $1;
	}

	if ($expand_multiple_values) {
	    my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
	    my $tmp_sub_metadata_value = $sub_metadata_value;

	    # Deal with the case when multiple values are specified using <...>
	    if ($sub_metadata_value =~ /\<(.+)\>/) {
		while ($tmp_sub_metadata_value =~ s/\<(.+?)\>//) {
		    $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $1);
		}
	    }
	    # Deal with the legacy case when multiple values are specified using /.../
	    elsif ($sub_metadata_value =~ /\/(.+)\//) {
		while ($tmp_sub_metadata_value =~ s/\/(.+?)\///) {
		    $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $1);
		}
	    }
	}

	# Escape the metadata value so it appears correctly in the final collection
	$sub_metadata_value = escape_metadata_value($sub_metadata_value);

	if ($sub_metadata_name ne $metadata_name) {
	    $doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
	}

	# If this tag has subfields and this is the first, use the value for the CDS/ISIS ^* field
	if ($has_subfields && $metadata_value eq "") {
	    $doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
	}

	$metadata_value .= $subfield_separator unless ($metadata_value eq "");
	$metadata_value .= $sub_metadata_value;
    }

    return $metadata_value;
}
337
338
# Parse a CDS/ISIS Field Definition Table (.fdt) file.
#
# Parameters:
#   $fdtfilename - path of the .fdt file
#   $encoding    - encoding handed to the multiread reader
#
# Everything up to and including the "***" line is ignored. Each later line
# is read as fixed-width columns: characters 0-29 hold the field name, 30-49
# the subfield letters, and from 50 onwards a space-separated list whose
# first item is the tag number and whose fourth item is the repeatable flag.
# Lines without a tag number are skipped.
#
# Returns a hash (as a list) mapping tag number to
#   { 'name' => ..., 'subfields' => ..., 'repeatable' => ... }.
#
# Dies if the file cannot be opened.
sub parse_field_definition_table
{
    my $fdtfilename = shift(@_);
    my $encoding = shift(@_);

    my %fdtmapping = ();

    # The handle stays a package filehandle because the reader is given its
    # name as a string.
    open(FDT_FILE, '<', $fdtfilename) or die "Error: Could not open file $fdtfilename.\n";

    # The reader is given a reference to fill, as in read_file(); passing the
    # plain string only worked through a symbolic reference.
    my $fdtfiletext = "";
    my $reader = multiread->new();
    $reader->set_handle('ISISPlug::FDT_FILE');
    $reader->set_encoding($encoding);
    $reader->read_file(\$fdtfiletext);

    close(FDT_FILE);

    my $amongstdefinitions = 0;
    foreach my $fdtfileline (split(/\n/, $fdtfiletext)) {
	$fdtfileline =~ s/(\s*)$//;  # Remove any nasty spaces at the end of the lines

	if (!$amongstdefinitions) {
	    # The definitions start after the "***" line
	    $amongstdefinitions = 1 if ($fdtfileline eq "***");
	    next;
	}

	my $fieldname = unicode::substr($fdtfileline, 0, 30);
	my $fieldsubfields = unicode::substr($fdtfileline, 30, 20);
	my $fieldspecs = unicode::substr($fdtfileline, 50, 50);

	# Remove extra spaces (a column beyond the end of a short line is
	# treated as empty)
	foreach my $column ($fieldname, $fieldsubfields, $fieldspecs) {
	    $column = "" unless (defined $column);
	    $column =~ s/(\s*)$//;
	}

	# Map from tag number to metadata field title, subfields, and repeatability
	my ($fieldtag, $fieldrepeatable) = (split(/ /, $fieldspecs))[0, 3];
	next unless (defined $fieldtag && $fieldtag ne "");

	$fdtmapping{$fieldtag} = { 'name' => $fieldname,
				   'subfields' => $fieldsubfields,
				   'repeatable' => $fieldrepeatable };
    }

    return %fdtmapping;
}
384
385
# Escape a metadata value for storage.
#
# "<" becomes "&lt;", ">" becomes "&gt;", and every backslash is doubled.
# Replacing the angle brackets with themselves would leave markup in the
# value untouched, so the entity forms are required here.
#
# Parameter: the raw value.
# Returns the escaped copy; the argument itself is not modified.
sub escape_metadata_value
{
    my $value = shift(@_);
    $value =~ s/\</&lt;/g;
    $value =~ s/\>/&gt;/g;
    $value =~ s/\\/\\\\/g;
    return $value;
}
394
395
# Remove the companion files of the database: util::rm is called first on
# the path stored under 'fdt_file_path', then on the one stored under
# 'xrf_file_path'.
sub clean_up_after_exploding
{
    my ($self) = @_;

    # Delete the FDT and XRF files too
    my ($fdt_path, $xrf_path) = @{$self}{'fdt_file_path', 'xrf_file_path'};
    &util::rm($fdt_path);
    &util::rm($xrf_path);
}
404
405
406	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: