Context Navigation

source: trunk/gsdl/perllib/plugins/ISISPlug.pm@ 9998

Last change on this file since 9998 was 9998, checked in by mdewsnip, 19 years ago
Changed this to work on Windows now that multiread sets binmode, and fixed the bug where the Keywords metadata was not being added correctly.
Property svn:keywords set to `Author Date Id Revision`
File size: 8.4 KB

Line
1	###########################################################################
2	#
3	# ISISPlug.pm -- A plugin for CDS/ISIS databases
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 1999-2004 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package ISISPlug;
28
29
30	use multiread;
31	use SplitPlug;
32
33
34	# ISISPlug is a sub-class of SplitPlug.
35	sub BEGIN {
36	@ISA = ('SplitPlug');
37	}
38
39
40	my $arguments =
41	[ { 'name' => "process_exp",
42	'desc' => "{BasPlug.process_exp}",
43	'type' => "regexp",
44	'reqd' => "no",
45	'deft' => &get_default_process_exp() },
46	{ 'name' => "block_exp",
47	'desc' => "{BasPlug.block_exp}",
48	'type' => "regexp",
49	'reqd' => "no",
50	'deft' => &get_default_block_exp() },
51	{ 'name' => "split_exp",
52	'desc' => "{SplitPlug.split_exp}",
53	'type' => "regexp",
54	'reqd' => "no",
55	'deft' => &get_default_split_exp() },
56
57	# The interesting options
58	{ 'name' => "entry_separator",
59	'desc' => "{ISISPlug.entry_separator}",
60	'type' => "string",
61	'reqd' => "no",
62	'deft' => "<br>" },
63	{ 'name' => "subfield_separator",
64	'desc' => "{ISISPlug.subfield_separator}",
65	'type' => "string",
66	'reqd' => "no",
67	'deft' => ", " }
68	];
69
70	my $options = { 'name' => "ISISPlug",
71	'desc' => "{ISISPlug.desc}",
72	'abstract' => "no",
73	'inherits' => "yes",
74	'explodes' => "yes",
75	'args' => $arguments };
76
77
78	# This plugin processes files with the suffix ".mst"
79	sub get_default_process_exp {
80	return q^(?i)(\.mst)$^;
81	}
82
83
84	# This plugin blocks files with the suffix ".fdt" and ".xrf"
85	sub get_default_block_exp {
86	return q^(?i)(\.fdt\|\.xrf)$^;
87	}
88
89
90	# This plugin splits the input text at the "----------" lines
91	sub get_default_split_exp {
92	return q^\r?\n----------\r?\n^;
93	}
94
95
96	sub new
97	{
98	my $class = shift(@_);
99
100	my $self = new SplitPlug($class, @_);
101	if (!parsargv::parse(\@_,
102	q^subfield_separator/.*/, ^, \$self->{'subfield_separator'},
103	q^entry_separator/.*/<br>^, \$self->{'entry_separator'},
104	"allow_extra_options")) {
105	print STDERR "\nIncorrect options passed to ISISPlug, check your collect.cfg configuration file\n";
106	die "\n";
107	}
108
109	# To allow for proper inheritance of arguments
110	my $option_list = $self->{'option_list'};
111	push(@{$option_list}, $options);
112	$self->{'plugin_type'} = "ISISPlug";
113
114	return bless $self, $class;
115	}
116
117
118	sub read_file
119	{
120	my $self = shift (@_);
121	my ($filename, $encoding, $language, $textref) = @_;
122
123	my ($databasename) = ($filename =~ /(.*)\.mst$/i);
124
125	# Check the associated .fdt and .xrf files exist
126	# These files must have a lowercase extension for the IsisGdl program to work
127	# Bailing out because of this is kind of crappy but it is only an issue on Unix
128	my $fdtfilename = $databasename . ".fdt";
129	if (! -e $fdtfilename) {
130	die "Error: Could not find ISIS FDT file $fdtfilename.\n";
131	}
132	my $xrffilename = $databasename . ".xrf";
133	if (! -e $xrffilename) {
134	die "Error: Could not find ISIS XRF file $xrffilename.\n";
135	}
136
137	# The text to split is exported from the database by the IsisGdl program
138	open(FILE, "IsisGdl \"$filename\" \|");
139
140	my $reader = new multiread();
141	$reader->set_handle('ISISPlug::FILE');
142	$reader->set_encoding($encoding);
143	$reader->read_file($textref);
144
145	close(FILE);
146
147	# Parse the associated ISIS database Field Definition Table file (.fdt)
148	my %fdtmapping = &parse_field_definition_table($fdtfilename);
149
150	# Map the tag numbers to tag names, using the FDT mapping
151	$$textref =~ s/\r?\ntag=(\d+) /\ntag=$fdtmapping{$1}{'title'} /g;
152
153	# Remove the line at the start so it is split and processed properly
154	$$textref =~ s/^----------\n//;
155	}
156
157
158	sub process
159	{
160	my $self = shift (@_);
161	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
162	my $outhandle = $self->{'outhandle'};
163
164	my $subfield_separator = $self->{'subfield_separator'};
165	my $entry_separator = $self->{'entry_separator'};
166
167	# Report that we're processing the file
168	print STDERR "<Processing n='$file' p='ISISPlug'>\n" if ($gli);
169	print $outhandle "IsisPlug: processing $file\n"
170	if ($self->{'verbosity'}) > 1;
171
172	# Process each line of the ISIS record, one at a time
173	foreach $line (split(/\n/, $$textref)) {
174	$line =~ /^tag=(.*) data=(.+)$/;
175	my $rawtagname = $1;
176	my $rawtagdata = $2;
177	# print STDERR "Raw tag: $rawtagname, Raw data: $rawtagdata\n";
178	next if ($rawtagname eq "");
179
180	# Metadata field names: title case, then remove spaces
181	my $tagname = "";
182	foreach $word (split(/\s+/, $rawtagname)) {
183	substr($word, 0, 1) =~ tr/a-z/A-Z/;
184	$tagname .= $word;
185	}
186
187	# Make sure there is nothing bad in the tag names
188	$tagname =~ s/&//g;
189
190	# Handle each piece of metadata ('%' separated)
191	my $completetagvalue = "";
192	foreach $rawtagvalue (split(/%/, $rawtagdata)) {
193	$completetagvalue .= $entry_separator unless ($completetagvalue eq "");
194
195	# Metadata field values: take care with subfields
196	my $completeentryvalue = "";
197	while ($rawtagvalue ne "") {
198	# If there is a subfield specifier, parse it off
199	my $subfieldname = "";
200	if ($rawtagvalue =~ s/^\^// && $rawtagvalue =~ s/([a-z])//) {
201	$subfieldname = "^$1";
202	}
203
204	# Parse the metadata value off
205	$rawtagvalue =~ s/^([^\^]*)//;
206	my $metadatafieldname = $tagname . $subfieldname;
207	my $metadatafieldvalue = $1;
208
209	# Handle Keywords specially
210	if ($metadatafieldname eq "Keywords") {
211	my $keywordmetadatavalue = $metadatafieldvalue;
212	my $keywordlist = "";
213	while ($keywordmetadatavalue =~ s/\<(.+?)\>//) {
214	my $keyword = $1;
215	$doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $keyword);
216	$keywordlist .= ", " unless ($keywordlist eq "");
217	$keywordlist .= $keyword;
218	}
219
220	$metadatafieldvalue = $keywordlist;
221	}
222
223	# Escape any '<' and '>' characters so they appear correctly in the final collection
224	$metadatafieldvalue =~ s/\</</g;
225	$metadatafieldvalue =~ s/\>/>/g;
226
227	# We have already added Keywords metadata above
228	unless ($metadatafieldname eq "Keywords") {
229	$doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $metadatafieldvalue);
230	}
231
232	$completeentryvalue .= $subfield_separator unless ($completeentryvalue eq "");
233	$completeentryvalue .= $metadatafieldvalue;
234	}
235
236	$completetagvalue .= $completeentryvalue;
237	}
238
239	$doc_obj->add_utf8_metadata($cursection, $tagname . "^all", $completetagvalue);
240	}
241
242	# Add the full record as the document text
243	$$textref =~ s/\</</g;
244	$$textref =~ s/\>/>/g;
245	$doc_obj->add_utf8_text($cursection, $$textref);
246
247	# Add FileFormat metadata
248	$doc_obj->add_utf8_metadata($cursection, "FileFormat", "CDS/ISIS");
249
250	# Record was processed successfully (and there was no document obtained)
251	return 1;
252	}
253
254
255	sub parse_field_definition_table
256	{
257	my $fdtfilename = shift(@_);
258
259	my %fdtmapping = ();
260
261	open(FDT_FILE, "<$fdtfilename") \|\| die "Error: Could not open file $fdtfilename.\n";
262
263	my $amongstdefinitions = 0;
264	foreach $fdtfileline (<FDT_FILE>) {
265	$fdtfileline =~ s/(\s*)$//; # Remove any nasty spaces at the end of the lines
266
267	if ($amongstdefinitions) {
268	my $fieldtitle = substr($fdtfileline, 0, 30);
269	my $fieldsubfields = substr($fdtfileline, 30, 20);
270	my $fieldspecs = substr($fdtfileline, 50);
271
272	# Remove extra spaces
273	$fieldtitle =~ s/(\s*)$//;
274	$fieldsubfields =~ s/(\s*)$//;
275
276	# Map from tag number to metadata field title and subfields
277	my $fieldtag = (split(/ /, $fieldspecs))[0];
278	$fdtmapping{$fieldtag} = { 'title' => $fieldtitle,
279	'subfields' => $fieldsubfields };
280	}
281	elsif ($fdtfileline eq "***") {
282	$amongstdefinitions = 1;
283	}
284	}
285
286	close(FDT_FILE);
287
288	return %fdtmapping;
289	}
290
291
292	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: