Context Navigation

source: main/trunk/greenstone2/perllib/plugins/ISISPlugin.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 7 years ago
renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes
Property svn:keywords set to `Author Date Id Revision`
File size: 15.0 KB

Line
1	###########################################################################
2	#
3	# ISISPlugin.pm -- A plugin for CDS/ISIS databases
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 1999-2004 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package ISISPlugin;
28
29	use Encode;
30
31	use multiread;
32	use SplitTextFile;
33	use MetadataRead;
34	use FileUtils;
35
36	use strict;
37	no strict 'refs'; # allow filehandles to be variables and viceversa
38
# ISISPlugin inherits from both MetadataRead and SplitTextFile.
# Methods are looked up in the order the parents appear in @ISA, so when
# both parents define the same method the MetadataRead version wins.
BEGIN {
    @ISISPlugin::ISA = qw(MetadataRead SplitTextFile);
}
44
45
# Argument definitions for this plugin. Each entry describes one option:
# its name, a description key, its type, whether it is required, its
# default value and (optionally) a 'hiddengli' flag.
my $arguments = [
    # Expressions whose defaults are supplied by this plugin
    {
        'name' => 'process_exp',
        'desc' => '{BaseImporter.process_exp}',
        'type' => 'regexp',
        'reqd' => 'no',
        'deft' => get_default_process_exp(),
    },
    {
        'name'      => 'block_exp',
        'desc'      => '{BaseImporter.block_exp}',
        'type'      => 'regexp',
        'reqd'      => 'no',
        'deft'      => get_default_block_exp(),
        'hiddengli' => 'yes',
    },
    {
        'name'      => 'split_exp',
        'desc'      => '{SplitTextFile.split_exp}',
        'type'      => 'regexp',
        'reqd'      => 'no',
        'deft'      => get_default_split_exp(),
        'hiddengli' => 'yes',
    },

    # The interesting options: the strings used to join repeated entries
    # and subfields when building combined metadata values
    {
        'name' => 'entry_separator',
        'desc' => '{ISISPlugin.entry_separator}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => '<br>',
    },
    {
        'name' => 'subfield_separator',
        'desc' => '{ISISPlugin.subfield_separator}',
        'type' => 'string',
        'reqd' => 'no',
        'deft' => ', ',
    },
];
77
# Plugin description record; new() pushes it onto the "OptList" of the
# shared argument/option hash. 'args' points at the argument table.
my $options = {
    'name'     => 'ISISPlugin',
    'desc'     => '{ISISPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
84
85
# Default "process_exp": this plugin processes files whose name ends in
# ".mst" (the suffix is matched case-insensitively).
sub get_default_process_exp
{
    my $mst_suffix_pattern = '(?i)(\.mst)$';
    return $mst_suffix_pattern;
}
90
91
# Default "block_exp": this plugin blocks the auxiliary files that
# accompany an .mst database, i.e. files ending in ".fdt" or ".xrf"
# (matched case-insensitively).
# The two suffixes are joined by a real alternation; an escaped pipe here
# would only match a literal "|" character and block nothing.
sub get_default_block_exp
{
    return q^(?i)(\.fdt|\.xrf)$^;
}
97
98
# Default "split_exp": the input text is split into records at lines made
# up of exactly ten dashes, with LF or CRLF line endings on either side.
sub get_default_split_exp
{
    my $line_ending = '\r?\n';
    my $separator   = '-' x 10;
    return $line_ending . $separator . $line_ending;
}
103
104
# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the SplitTextFile constructor
#   $hashArgOptLists - hash ref; this plugin's argument table is appended to
#                      its "ArgList" and its option record to its "OptList"
# Returns the object built by SplitTextFile, re-blessed into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call: the indirect-object form "new Class(...)"
    # is parsed heuristically and is ambiguous in a package that defines its
    # own new().
    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # When only plugin information is wanted, don't worry about any options.
    unless ($self->{'info_only'}) {
        # isis plug doesn't care about encoding - it assumes ascii unless
        # the user has specified an encoding
        if ($self->{'input_encoding'} eq "auto") {
            $self->{'input_encoding'} = "ascii";
        }
    }

    return bless $self, $class;
}
128
# Blocks the .fdt and .xrf files that belong to the given .mst file.
#
# History of this approach: blocking by file name was a pain on Windows
# (xxx.FDT got blocked, but if the file on disk was xxx.fdt the build
# complained that no plugin could process it), so the plugin was put back
# to using block_exp. Blocking by name works now that blocking is
# case-insensitive on Windows, but it is still a pain for GLI, which will
# not know which plugin processes the .fdt and .xrf files; adding them to
# the process expression causes more problems.
# NOTE(review): the "_tmp" suffix presumably keeps this from being picked
# up as the real store_block_files hook - confirm before renaming.
sub store_block_files_tmp
{
    my ($self, $filename_full_path, $block_hash) = @_;

    print STDERR "in store block files\n";

    # Work out where the auxiliary files should be
    $self->check_auxiliary_files($filename_full_path);

    my $fdt_path = $self->{'fdt_file_path'};
    if (-e $fdt_path) {
        print STDERR "$fdt_path\n";
        $self->block_raw_filename($block_hash, $fdt_path);
    }

    my $xrf_path = $self->{'xrf_file_path'};
    if (-e $xrf_path) {
        print STDERR "$xrf_path\n";
        $self->block_raw_filename($block_hash, $xrf_path);
    }
}
155
# Works out the paths of the .fdt and .xrf files that accompany the given
# .mst file and stores them in $self->{'fdt_file_path'} and
# $self->{'xrf_file_path'}.
# For each file the upper-case extension (.FDT / .XRF) is tried first; if
# no such file exists the lower-case extension is recorded instead, whether
# or not that file exists.
sub check_auxiliary_files
{
    my ($self, $filename) = @_;

    # Everything before the (case-insensitive) ".mst" suffix
    my ($path_root) = ($filename =~ /(.*)\.mst$/i);

    foreach my $extension ('fdt', 'xrf') {
        my $upper_case_path = $path_root . "." . uc($extension);
        my $lower_case_path = $path_root . "." . $extension;
        $self->{$extension . '_file_path'} =
            (-e $upper_case_path) ? $upper_case_path : $lower_case_path;
    }
}
171
172
# Reads an ISIS database by running the external IsisGdl program on the
# .mst file and storing its (decoded) output in $$textref.
#   $filename - path of the .mst file
#   $encoding - encoding handed to the multiread reader
#   $language - not used here
#   $textref  - scalar ref that receives the exported text
# Also parses the matching .fdt file and stores the result in
# $self->{'fdt_mapping'}.
# Returns early (leaving $$textref untouched) after reporting an error when
# the .fdt or .xrf file is missing or IsisGdl cannot be started.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;
    my $outhandle = $self->{'outhandle'};

    # Path of the .mst file relative to the import directory, used in the
    # error messages written for GLI
    my $mst_file_path_relative = $filename;
    $mst_file_path_relative =~ s/^.+import.(.*?)$/$1/;

    # Check the associated .fdt and .xrf files exist
    $self->check_auxiliary_files($filename);

    if (!-e $self->{'fdt_file_path'}) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS FDT file $self->{'fdt_file_path'}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS FDT file " . $self->{'fdt_file_path'} . ".\n";
        return;
    }
    if (!-e $self->{'xrf_file_path'}) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS XRF file $self->{'xrf_file_path'}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS XRF file " . $self->{'xrf_file_path'} . ".\n";
        return;
    }

    # The text to split is exported from the database by the IsisGdl program.
    # The bareword handle is kept because multiread is given the handle by
    # name below.
    # NOTE(review): the file name is interpolated into a shell command; a
    # name containing a double quote or shell metacharacters would break it.
    if (!open(FILE, "IsisGdl \"$filename\" |")) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not run IsisGdl'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not run IsisGdl on $filename: $!\n";
        return;
    }

    my $reader = multiread->new();
    $reader->set_handle('ISISPlugin::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);

    # At this point $$textref is a binary byte string
    # => turn it into a Unicode aware string, so full
    # Unicode aware pattern matching can be used.
    # For instance: 's/\x{0101}//g' or '[[:upper:]]'
    $$textref = decode("utf8", $$textref);
    close(FILE);

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my %fdt_mapping = parse_field_definition_table($self->{'fdt_file_path'}, $encoding);
    $self->{'fdt_mapping'} = \%fdt_mapping;

    # Remove the separator line at the start, and any blank lines, so the
    # data is split and processed properly
    $$textref =~ s/^----------\r?\n//;
    $$textref =~ s/(?:\r|\n)\n/\n/g;
}
222
223
# Helper for process(): parses one raw field occurrence into its subfields,
# adds the per-subfield metadata to $doc_obj and returns the combined value
# (the escaped subfield values joined with $subfield_separator).
#   $metadata_name      - metadata name derived from the field name
#   $raw_metadata_value - raw occurrence text, e.g. "^aFoo^bBar"
#   $subfields          - the field's subfield specification from the FDT
#   $extract_sub_values - when true, values given as <...> (or, failing
#                         that, the legacy /.../ form) are also added
#                         individually as "<name>^sub" metadata
sub _add_subfield_metadata
{
    my ($doc_obj, $section, $metadata_name, $raw_metadata_value,
        $subfields, $subfield_separator, $extract_sub_values) = @_;

    my $metadata_value = "";
    while ($raw_metadata_value ne "") {
        # If there is a subfield specifier ("^" plus a lower-case letter),
        # parse it off
        my $sub_metadata_name = $metadata_name;
        if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
            $sub_metadata_name .= "^$1";
        }

        # Parse the value off (everything up to the next "^")
        $raw_metadata_value =~ s/^([^\^]*)//;
        my $sub_metadata_value = $1;

        if ($extract_sub_values) {
            my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
            my $tmp_sub_metadata_value = $sub_metadata_value;

            # Deal with the case when multiple values are specified using <...>
            if ($sub_metadata_value =~ /\<(.+)\>/) {
                while ($tmp_sub_metadata_value =~ s/\<(.+?)\>//) {
                    $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $1);
                }
            }
            # Deal with the legacy case when multiple values are specified using /.../
            elsif ($sub_metadata_value =~ /\/(.+)\//) {
                while ($tmp_sub_metadata_value =~ s/\/(.+?)\///) {
                    $doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $1);
                }
            }
        }

        # Escape the metadata value so it appears correctly in the final collection
        $sub_metadata_value = escape_metadata_value($sub_metadata_value);

        # Only values that carried a subfield specifier get their own entry
        if ($sub_metadata_name ne $metadata_name) {
            $doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
        }

        # If this tag has subfields and this is the first, use the value for the CDS/ISIS ^* field
        if ($subfields ne "" && $metadata_value eq "") {
            $doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
        }

        $metadata_value .= $subfield_separator unless ($metadata_value eq "");
        $metadata_value .= $sub_metadata_value;
    }

    return $metadata_value;
}


# Turns one exported ISIS record into metadata and text on $doc_obj.
#   $textref - scalar ref to the record text, one "tag=N data=..." per line
#   $doc_obj - document object receiving the metadata and text
#   ($pluginfo, $base_dir, $file, $metadata and $gli are accepted but not
#   used here)
# For every line whose tag is named in the field definition table this adds
# the field value(s), per-subfield values, a "<name>^all" value joining all
# occurrences with the entry separator, and a row in an HTML table that
# becomes the document text. The raw record and a FileFormat value are
# added as metadata too. Returns 1.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;

    # store the auxiliary files so we know which ones were used
    # (mst file becomes the source file)
    $doc_obj->associate_source_file($self->{'fdt_file_path'});
    $doc_obj->associate_source_file($self->{'xrf_file_path'});

    my $section = $doc_obj->get_top_section();
    my $fdt_mapping = $self->{'fdt_mapping'};
    my $subfield_separator = $self->{'subfield_separator'};
    my $entry_separator = $self->{'entry_separator'};
    my $isis_record_html_metadata_value = "<table cellpadding=\"4\" cellspacing=\"0\">";

    # Process each line of the ISIS record, one at a time
    foreach my $line (split(/\n/, $$textref)) {
        $line =~ s/(\s*)$//;  # Remove any nasty whitespace (very important for Windows)

        # Skip lines that are not "tag=... data=..." instead of carrying on
        # with captures left over from the substitution above
        next unless ($line =~ /^tag=(.*) data=(.+)$/);
        my $tag = $1;
        my $tag_data = $2;

        # Tags missing from the field definition table are ignored (looked
        # up this way so no empty entry is created in the table)
        my $field_definition = $fdt_mapping->{$tag};
        next unless (defined $field_definition);

        # Convert the tag number into a name, and remove any invalid characters
        my $raw_metadata_name = $field_definition->{'name'} || "";
        $raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
        next if ($raw_metadata_name eq "");

        # Metadata field names: title case, then remove spaces
        my $metadata_name = "";
        foreach my $word (split(/\s+/, $raw_metadata_name)) {
            substr($word, 0, 1) =~ tr/a-z/A-Z/;
            $metadata_name .= $word;
        }

        # Repeatable fields hold several occurrences separated by '%';
        # non-repeatable fields are a single occurrence, and only those
        # have their <...> and /.../ values extracted individually
        my $is_repeatable = $field_definition->{'repeatable'};
        my @raw_metadata_values = $is_repeatable ? split(/%/, $tag_data) : ($tag_data);
        my $extract_sub_values = $is_repeatable ? 0 : 1;

        my $all_metadata_name = $metadata_name . "^all";
        my $all_metadata_value = "";
        foreach my $raw_metadata_value (@raw_metadata_values) {
            my $metadata_value = _add_subfield_metadata(
                $doc_obj, $section, $metadata_name, $raw_metadata_value,
                $field_definition->{'subfields'}, $subfield_separator,
                $extract_sub_values);

            # Add the metadata value
            $doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);

            $all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
            $all_metadata_value .= $metadata_value;
        }

        # Add the "^all" metadata value
        $doc_obj->add_utf8_metadata($section, $all_metadata_name, $all_metadata_value);

        $isis_record_html_metadata_value .= "<tr><td valign=top><nobr><b>" . $field_definition->{'name'} . "</b></nobr></td><td valign=top>" . $all_metadata_value . "</td></tr>";
    }

    # Add a reasonably formatted HTML table view of the record as the document text
    $isis_record_html_metadata_value .= "</table>";
    $doc_obj->add_utf8_text($section, $isis_record_html_metadata_value);

    # Add the full raw record as metadata
    my $isis_raw_record_metadata_value = escape_metadata_value($$textref);
    $doc_obj->add_utf8_metadata($section, "ISISRawRecord", $isis_raw_record_metadata_value);

    # Add FileFormat metadata
    $doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");

    # Record was processed successfully
    return 1;
}
387
388
# Parses an ISIS Field Definition Table (.fdt) file.
#   $fdtfilename - path of the .fdt file
#   $encoding    - encoding handed to the multiread reader
# Returns a hash (as a list) mapping each field tag to a hash ref with the
# keys 'name', 'subfields' and 'repeatable'. Field definitions are the
# lines that follow the "***" marker line; each is cut into fixed-width
# columns: name (30 characters), subfields (20) and specs (50), where the
# first space-separated spec is the tag and the fourth the repeatable flag.
# Dies if the file cannot be opened.
sub parse_field_definition_table
{
    my $fdtfilename = shift(@_);
    my $encoding = shift(@_);

    my %fdtmapping = ();

    # Three-argument open so the file name can never be read as a mode.
    # The bareword handle is kept because multiread is given the handle by
    # name below.
    open(FDT_FILE, '<', $fdtfilename) or die "Error: Could not open file $fdtfilename.\n";

    # The reader fills the scalar it is given a reference to (read_file()
    # in this plugin passes a scalar ref in the same way).
    my $fdtfiletext = "";
    my $reader = multiread->new();
    $reader->set_handle('ISISPlugin::FDT_FILE');
    $reader->set_encoding($encoding);
    $reader->read_file(\$fdtfiletext);

    close(FDT_FILE);

    my $amongstdefinitions = 0;
    foreach my $fdtfileline (split(/\n/, $fdtfiletext)) {
        $fdtfileline =~ s/(\s*)$//;  # Remove any nasty spaces at the end of the lines

        # Everything up to and including the "***" line is header
        if (!$amongstdefinitions) {
            $amongstdefinitions = 1 if ($fdtfileline eq "***");
            next;
        }

        my $fieldname = &unicode::substr($fdtfileline, 0, 30);
        my $fieldsubfields = &unicode::substr($fdtfileline, 30, 20);
        my $fieldspecs = &unicode::substr($fdtfileline, 50, 50);

        # Remove extra spaces
        $fieldname =~ s/(\s*)$//;
        $fieldsubfields =~ s/(\s*)$//;
        $fieldspecs =~ s/(\s*)$//;

        # Map from tag number to metadata field title, subfields, and repeatability
        my @fieldspecvalues = split(/ /, $fieldspecs);
        my $fieldtag = $fieldspecvalues[0];

        # Lines too short to carry a tag (e.g. blank lines) define nothing
        next unless (defined $fieldtag && $fieldtag ne "");

        $fdtmapping{$fieldtag} = { 'name' => $fieldname,
                                   'subfields' => $fieldsubfields,
                                   'repeatable' => $fieldspecvalues[3] };
    }

    return %fdtmapping;
}
434
435
# Escapes a metadata value so it appears correctly in the final collection:
# "<" and ">" become the HTML entities &lt; and &gt;, and every backslash
# is doubled. Takes the raw value and returns the escaped copy.
sub escape_metadata_value
{
    my $value = shift(@_);
    $value =~ s/\</&lt;/g;
    $value =~ s/\>/&gt;/g;
    $value =~ s/\\/\\\\/g;
    return $value;
}
444
445
# Removes the auxiliary files recorded in $self->{'fdt_file_path'} and
# $self->{'xrf_file_path'}, so the FDT and XRF files are deleted too.
sub clean_up_after_exploding
{
    my ($self) = @_;

    my $fdt_path = $self->{'fdt_file_path'};
    my $xrf_path = $self->{'xrf_file_path'};

    FileUtils::removeFiles($fdt_path);
    FileUtils::removeFiles($xrf_path);
}
454
455
456	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: