Context Navigation

source: main/trunk/greenstone2/perllib/plugins/ISISPlugin.pm@ 28489

Last change on this file since 28489 was 27502, checked in by kjdon, 11 years ago
trying to fix double encoding issue for isis files. not sure that I have it yet
Property svn:keywords set to `Author Date Id Revision`
File size: 14.9 KB

Line
1	###########################################################################
2	#
3	# ISISPlugin.pm -- A plugin for CDS/ISIS databases
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 1999-2004 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package ISISPlugin;
28
29	use Encode;
30
31	use multiread;
32	use SplitTextFile;
33	use MetadataRead;
34
35	use strict;
36	no strict 'refs'; # allow filehandles to be variables and viceversa
37
# ISISPlugin inherits from both MetadataRead and SplitTextFile.
# Where both parents provide a method of the same name, the parent listed
# first in @ISA takes precedence.
BEGIN {
    @ISISPlugin::ISA = ('MetadataRead', 'SplitTextFile');
}
43
44
# Argument descriptors for this plugin. The 'desc' values are lookup keys
# of the form {Plugin.argname}; the 'deft' values of the three regexp
# arguments come from the get_default_*_exp subs in this file.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name'      => 'block_exp',
        'desc'      => '{BasePlugin.block_exp}',
        'type'      => 'regexp',
        'reqd'      => 'no',
        'deft'      => get_default_block_exp(),
        'hiddengli' => 'yes',
    },
    {
        'name'      => 'split_exp',
        'desc'      => '{SplitTextFile.split_exp}',
        'type'      => 'regexp',
        'reqd'      => 'no',
        'deft'      => get_default_split_exp(),
        'hiddengli' => 'yes',
    },

    # The interesting options
    {
        'name' => 'entry_separator',
        'desc' => '{ISISPlugin.entry_separator}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => '<br>',
    },
    {
        'name' => 'subfield_separator',
        'desc' => '{ISISPlugin.subfield_separator}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => ', ',
    },
];
76
# Plugin description record; 'args' points at the argument descriptors
# held in $arguments.
my $options = {
    'name'     => 'ISISPlugin',
    'desc'     => '{ISISPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
83
84
# Default process expression: a case-insensitive pattern matching file
# names that end in ".mst".
sub get_default_process_exp {
    my $mst_suffix_pattern = '(?i)(\.mst)$';
    return $mst_suffix_pattern;
}
89
90
# Default block expression: a case-insensitive pattern matching file names
# that end in ".fdt" or ".xrf".
#
# The alternation bar must be unescaped: an escaped bar matches a literal
# '|' character, which would turn the pattern into one that only matches the
# text ".fdt|.xrf" and therefore blocks neither kind of file.
sub get_default_block_exp {
    return q^(?i)(\.fdt|\.xrf)$^;
}
96
97
# Default split expression: the record separator is a line made of exactly
# ten dashes, with either LF or CRLF line endings around it.
sub get_default_split_exp {
    my $record_separator_pattern = '\r?\n----------\r?\n';
    return $record_separator_pattern;
}
102
103
# Constructor.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to SplitTextFile->new
#   $hashArgOptLists - hash ref; this plugin's argument and option
#                      descriptors are appended to its "ArgList" and
#                      "OptList" entries
#
# Returns the object built by SplitTextFile->new, blessed into $class.
sub new
{
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@{$pluginlist}, $class);
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Direct method call rather than the indirect-object form
    # "new SplitTextFile(...)", whose parse depends on what is in scope.
    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # When only plugin information was requested, skip option handling.
    if (!$self->{'info_only'}) {
        # The ISIS plugin doesn't care about encoding - it assumes ascii
        # unless the user has specified an encoding.
        my $input_encoding = $self->{'input_encoding'};
        if (defined($input_encoding) && $input_encoding eq "auto") {
            $self->{'input_encoding'} = "ascii";
        }
    }

    return bless $self, $class;
}
127
# Blocks the .fdt and .xrf files that belong to an .mst file.
#
# Notes carried over from earlier revisions:
#  - Blocking by exact name was a pain on Windows: xxx.FDT got blocked, but
#    when the file on disk was actually xxx.fdt the build complained that no
#    plugin could process it, so the plugin went back to using block_exp.
#  - Blocking on Windows is now case insensitive, so this approach works.
#    It is still awkward for GLI, which will not know which plugin handles
#    the .fdt and .xrf files; adding them to the process expression causes
#    further problems.
sub store_block_files_tmp {
    my ($self, $filename_full_path, $block_hash) = @_;

    print STDERR "in store block files\n";

    # Fills in $self->{'fdt_file_path'} and $self->{'xrf_file_path'}.
    $self->check_auxiliary_files($filename_full_path);

    my $fdt_path = $self->{'fdt_file_path'};
    if (-e $fdt_path) {
        print STDERR "$fdt_path\n";
        util::block_filename($block_hash, $fdt_path);
    }

    my $xrf_path = $self->{'xrf_file_path'};
    if (-e $xrf_path) {
        print STDERR "$xrf_path\n";
        util::block_filename($block_hash, $xrf_path);
    }
}
154
# Works out the paths of the .fdt and .xrf files that accompany an .mst
# file and stores them in $self->{'fdt_file_path'} and
# $self->{'xrf_file_path'}.
#
# For each file the upper-case extension is preferred when a file of that
# name exists; otherwise the lower-case extension is recorded, whether or
# not that file exists.
#
# Parameters:
#   $filename - path of the .mst file (the extension is matched
#               case-insensitively)
sub check_auxiliary_files {
    my ($self, $filename) = @_;

    my ($path_root) = ($filename =~ /(.*)\.mst$/i);

    foreach my $suffix ('fdt', 'xrf') {
        my $upper_case_path = $path_root . "." . uc($suffix);
        my $lower_case_path = $path_root . "." . $suffix;
        $self->{$suffix . '_file_path'} =
            (-e $upper_case_path) ? $upper_case_path : $lower_case_path;
    }
}
170
171
# Exports the records of an ISIS database as text and loads the matching
# field definition table.
#
# Parameters:
#   $filename - path of the .mst file
#   $encoding - encoding handed to the multiread readers
#   $language - not used here
#   $textref  - scalar ref that receives the exported text
#
# On success $$textref holds the decoded export with the leading separator
# line and doubled newlines removed, and $self->{'fdt_mapping'} holds the
# parsed field definition table. When the .fdt or .xrf file is missing, or
# the export command cannot be started, an error is printed and the sub
# returns early.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;
    my $outhandle = $self->{'outhandle'};

    # Path used in GLI error messages: everything after "import" plus one
    # separator character.
    my $mst_file_path_relative = $filename;
    $mst_file_path_relative =~ s/^.+import.(.*?)$/$1/;

    # Check the associated .fdt and .xrf files exist
    $self->check_auxiliary_files($filename);

    if (!-e $self->{'fdt_file_path'}) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS FDT file $self->{'fdt_file_path'}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS FDT file " . $self->{'fdt_file_path'} . ".\n";
        return;
    }
    if (!-e $self->{'xrf_file_path'}) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS XRF file $self->{'xrf_file_path'}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS XRF file " . $self->{'xrf_file_path'} . ".\n";
        return;
    }

    # The text to split is exported from the database by the IsisGdl program.
    # The handle stays a package bareword because multiread is given the
    # handle by name below.
    # NOTE(review): $filename is interpolated into a shell command line;
    # names containing quotes or shell metacharacters are not protected.
    if (!open(FILE, "IsisGdl \"$filename\" |")) {
        my $open_error = $!;
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not run IsisGdl'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not run IsisGdl on " . $filename . " (" . $open_error . ").\n";
        return;
    }

    my $reader = multiread->new();
    $reader->set_handle('ISISPlugin::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);

    # At this point $$textref is a binary byte string
    # => turn it into a Unicode aware string, so full
    # Unicode aware pattern matching can be used.
    # For instance: 's/\x{0101}//g' or '[[:upper:]]'
    $$textref = decode("utf8", $$textref);
    close(FILE);

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my %fdt_mapping = parse_field_definition_table($self->{'fdt_file_path'}, $encoding);
    $self->{'fdt_mapping'} = \%fdt_mapping;

    # Remove the line at the start, and any blank lines, so the data is split and processed properly
    $$textref =~ s/^----------\n//;
    $$textref =~ s/\n\n/\n/g;
}
221
222
# Turns one exported ISIS record into metadata and text on a document.
#
# Parameters:
#   $textref  - scalar ref to the record text, one "tag=N data=..." line
#               per field occurrence
#   $doc_obj  - document object that receives the metadata and text
#   $pluginfo, $base_dir, $file, $metadata, $gli - not used here
#
# For every line whose tag has a name in $self->{'fdt_mapping'} this adds:
#   <Name>       - the field value, subfields joined with subfield_separator
#   <Name>^x     - the value of subfield x, for each subfield present
#   <Name>^*     - the first subfield value, when the tag defines subfields
#   <Name>^x^sub - (non-repeatable fields only) each <...> or /.../ term
#   <Name>^all   - all values of the line joined with entry_separator
# It also adds an HTML table of the record as the document text, plus
# ISISRawRecord and FileFormat metadata.
#
# Returns 1.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    # store the auxiliary files so we know which ones were used
    # (mst file becomes the source file)
    $doc_obj->associate_source_file($self->{'fdt_file_path'});
    $doc_obj->associate_source_file($self->{'xrf_file_path'});

    my $section = $doc_obj->get_top_section();
    my $fdt_mapping = $self->{'fdt_mapping'};
    my $subfield_separator = $self->{'subfield_separator'};
    my $entry_separator = $self->{'entry_separator'};
    my $isis_record_html_metadata_value = "<table cellpadding=\"4\" cellspacing=\"0\">";

    # Process each line of the ISIS record, one at a time
    foreach my $line (split(/\n/, $$textref)) {
        $line =~ s/(\s*)$//;  # Remove any nasty whitespace (very important for Windows)

        # Skip lines that are not "tag=... data=..." lines. Without this
        # guard $1 and $2 would still hold captures from an earlier match.
        next unless ($line =~ /^tag=(.*) data=(.+)$/);
        my ($tag, $tag_data) = ($1, $2);
        # print STDERR "\nTag: $tag, Data: $tag_data\n";

        # Convert the tag number into a name, and remove any invalid characters
        my $raw_metadata_name = $fdt_mapping->{$tag}{'name'} || "";
        $raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
        next if ($raw_metadata_name eq "");

        # Metadata field names: title case, then remove spaces
        my $metadata_name = "";
        foreach my $word (split(/\s+/, $raw_metadata_name)) {
            substr($word, 0, 1) =~ tr/a-z/A-Z/;
            $metadata_name .= $word;
        }

        my $all_metadata_name = $metadata_name . "^all";
        my $all_metadata_value = "";

        # Handle repeatable fields
        if ($fdt_mapping->{$tag}{'repeatable'}) {
            # Multiple values are separated using the '%' character
            foreach my $raw_metadata_value (split(/%/, $tag_data)) {
                my $metadata_value = "";

                # Handle subfields
                while ($raw_metadata_value ne "") {
                    # If there is a subfield specifier, parse it off
                    my $sub_metadata_name = $metadata_name;
                    if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
                        $sub_metadata_name .= "^$1";
                    }

                    # Parse the value off and add it as metadata
                    # (this substitution always matches, possibly on the empty string)
                    $raw_metadata_value =~ s/^([^\^]*)//;
                    my $sub_metadata_value = &escape_metadata_value($1);

                    # print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
                    if ($sub_metadata_name ne $metadata_name) {
                        $doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
                    }

                    # If this tag has subfields and this is the first, use the value for the CDS/ISIS ^* field
                    if ($fdt_mapping->{$tag}{'subfields'} ne "" && $metadata_value eq "") {
                        $doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
                    }

                    $metadata_value .= $subfield_separator unless ($metadata_value eq "");
                    $metadata_value .= $sub_metadata_value;
                }

                # Add the metadata value
                # print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
                $doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);

                $all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
                $all_metadata_value .= $metadata_value;
            }
        }

        # Handle non-repeatable fields
        else {
            my $raw_metadata_value = $tag_data;
            my $metadata_value = "";

            # Handle subfields
            while ($raw_metadata_value ne "") {
                # If there is a subfield specifier, parse it off
                my $sub_metadata_name = $metadata_name;
                if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
                    $sub_metadata_name .= "^$1";
                }

                # Parse the value off and add it as metadata
                # (this substitution always matches, possibly on the empty string)
                $raw_metadata_value =~ s/^([^\^]*)//;
                my $sub_metadata_value = $1;

                # Deal with the case when multiple values are specified using <...>
                if ($sub_metadata_value =~ /\<(.+)\>/) {
                    my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
                    my $tmp_sub_metadata_value = $sub_metadata_value;
                    while ($tmp_sub_metadata_value =~ s/\<(.+?)\>//) {
                        my $sub_sub_metadata_value = $1;
                        $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
                    }
                }
                # Deal with the legacy case when multiple values are specified using /.../
                elsif ($sub_metadata_value =~ /\/(.+)\//) {
                    my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
                    my $tmp_sub_metadata_value = $sub_metadata_value;
                    while ($tmp_sub_metadata_value =~ s/\/(.+?)\///) {
                        my $sub_sub_metadata_value = $1;
                        $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
                    }
                }

                # Escape the metadata value so it appears correctly in the final collection
                $sub_metadata_value = &escape_metadata_value($sub_metadata_value);

                # print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
                if ($sub_metadata_name ne $metadata_name) {
                    $doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
                }

                # If this tag has subfields and this is the first, use the value for the CDS/ISIS ^* field
                if ($fdt_mapping->{$tag}{'subfields'} ne "" && $metadata_value eq "") {
                    $doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
                }

                $metadata_value .= $subfield_separator unless ($metadata_value eq "");
                $metadata_value .= $sub_metadata_value;
            }

            # Add the metadata value
            # print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
            $doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);

            $all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
            $all_metadata_value .= $metadata_value;
        }

        # Add the "^all" metadata value
        # print STDERR "All metadata name: $all_metadata_name, value: $all_metadata_value\n";
        $doc_obj->add_utf8_metadata($section, $all_metadata_name, $all_metadata_value);

        $isis_record_html_metadata_value .= "<tr><td valign=top><nobr><b>" . $fdt_mapping->{$tag}{'name'} . "</b></nobr></td><td valign=top>" . $all_metadata_value . "</td></tr>";
    }

    # Add a reasonably formatted HTML table view of the record as the document text
    $isis_record_html_metadata_value .= "</table>";
    $doc_obj->add_utf8_text($section, $isis_record_html_metadata_value);

    # Add the full raw record as metadata
    my $isis_raw_record_metadata_value = &escape_metadata_value($$textref);
    $doc_obj->add_utf8_metadata($section, "ISISRawRecord", $isis_raw_record_metadata_value);

    # Add FileFormat metadata
    $doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");

    # Record was processed successfully
    return 1;
}
386
387
# Parses an ISIS Field Definition Table (.fdt) file.
#
# Parameters:
#   $fdtfilename - path of the .fdt file
#   $encoding    - encoding handed to the multiread reader
#
# Returns a hash (as a list) keyed by field tag. Each value is a hash ref
# with the keys 'name', 'subfields' and 'repeatable'.
#
# Dies when the file cannot be opened.
#
# Only lines after the "***" marker line are treated as definitions. In
# each of those, characters 0-29 hold the field name, characters 30-49 the
# subfield letters, and the text from character 50 the space-separated
# specifications, of which item 0 is the tag and item 3 the repeatable flag.
sub parse_field_definition_table
{
    my ($fdtfilename, $encoding) = @_;

    my %fdtmapping = ();

    # Three-argument open so the file name can never be read as a mode.
    # The handle stays a package bareword because multiread is given the
    # handle by name below.
    open(FDT_FILE, '<', $fdtfilename) or die "Error: Could not open file $fdtfilename.\n";

    # Pass a real reference, as read_file in this plugin does, rather than
    # relying on a symbolic dereference of an empty string.
    my $fdtfiletext = "";
    my $reader = multiread->new();
    $reader->set_handle('ISISPlugin::FDT_FILE');
    $reader->set_encoding($encoding);
    $reader->read_file(\$fdtfiletext);

    close(FDT_FILE);

    my $amongstdefinitions = 0;
    foreach my $fdtfileline (split(/\n/, $fdtfiletext)) {
        $fdtfileline =~ s/(\s*)$//;  # Remove any nasty spaces at the end of the lines

        if ($amongstdefinitions) {
            my $fieldname = unicode::substr($fdtfileline, 0, 30);
            my $fieldsubfields = unicode::substr($fdtfileline, 30, 20);
            my $fieldspecs = unicode::substr($fdtfileline, 50, 50);

            # Remove extra spaces
            $fieldname =~ s/(\s*)$//;
            $fieldsubfields =~ s/(\s*)$//;
            $fieldspecs =~ s/(\s*)$//;

            # Map from tag number to metadata field title, subfields, and repeatability
            my @specs = split(/ /, $fieldspecs);
            my ($fieldtag, $fieldrepeatable) = @specs[0, 3];

            # A line too short to carry a tag cannot be looked up by tag,
            # so do not store it.
            next unless (defined($fieldtag) && $fieldtag ne "");

            $fdtmapping{$fieldtag} = { 'name' => $fieldname,
                                       'subfields' => $fieldsubfields,
                                       'repeatable' => $fieldrepeatable };
        }
        elsif ($fdtfileline eq "***") {
            $amongstdefinitions = 1;
        }
    }

    return %fdtmapping;
}
433
434
# Escapes a metadata value so it can be stored and displayed safely.
#
# '<' becomes '&lt;', '>' becomes '&gt;', and every backslash is doubled.
#
# Parameters:
#   $value - the raw value
#
# Returns the escaped copy; the argument itself is not modified.
sub escape_metadata_value
{
    my $value = shift(@_);
    $value =~ s/\</&lt;/g;
    $value =~ s/\>/&gt;/g;
    $value =~ s/\\/\\\\/g;
    return $value;
}
443
444
# Removes the auxiliary files recorded in $self->{'fdt_file_path'} and
# $self->{'xrf_file_path'} by passing each to util::rm.
sub clean_up_after_exploding
{
    my ($self) = @_;

    # Delete the FDT and XRF files too
    my $fdt_path = $self->{'fdt_file_path'};
    util::rm($fdt_path);

    my $xrf_path = $self->{'xrf_file_path'};
    return util::rm($xrf_path);
}
453
454
455	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: