Context Navigation

source: trunk/gsdl/perllib/plugins/ISISPlug.pm@ 10218

Last change on this file since 10218 was 10218, checked in by kjdon, 19 years ago
Jeffrey's new parsing modifications, committed approx 6 July, 15.16
Property svn:keywords set to `Author Date Id Revision`
File size: 8.2 KB

Line
1	###########################################################################
2	#
3	# ISISPlug.pm -- A plugin for CDS/ISIS databases
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 1999-2004 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package ISISPlug;
28
29
30	use multiread;
31	use SplitPlug;
32
33
34	# ISISPlug is a sub-class of SplitPlug.
35	sub BEGIN {
36	@ISA = ('SplitPlug');
37	}
38
39
40	my $arguments =
41	[ { 'name' => "process_exp",
42	'desc' => "{BasPlug.process_exp}",
43	'type' => "regexp",
44	'reqd' => "no",
45	'deft' => &get_default_process_exp() },
46	{ 'name' => "block_exp",
47	'desc' => "{BasPlug.block_exp}",
48	'type' => "regexp",
49	'reqd' => "no",
50	'deft' => &get_default_block_exp() },
51	{ 'name' => "split_exp",
52	'desc' => "{SplitPlug.split_exp}",
53	'type' => "regexp",
54	'reqd' => "no",
55	'deft' => &get_default_split_exp() },
56
57	# The interesting options
58	{ 'name' => "entry_separator",
59	'desc' => "{ISISPlug.entry_separator}",
60	'type' => "string",
61	'reqd' => "no",
62	'deft' => "<br>" },
63	{ 'name' => "subfield_separator",
64	'desc' => "{ISISPlug.subfield_separator}",
65	'type' => "string",
66	'reqd' => "no",
67	'deft' => ", " }
68	];
69
70	my $options = { 'name' => "ISISPlug",
71	'desc' => "{ISISPlug.desc}",
72	'abstract' => "no",
73	'inherits' => "yes",
74	'explodes' => "yes",
75	'args' => $arguments };
76
77
78	# This plugin processes files with the suffix ".mst"
79	sub get_default_process_exp {
80	return q^(?i)(\.mst)$^;
81	}
82
83
84	# This plugin blocks files with the suffix ".fdt" and ".xrf"
85	sub get_default_block_exp {
86	return q^(?i)(\.fdt\|\.xrf)$^;
87	}
88
89
90	# This plugin splits the input text at the "----------" lines
91	sub get_default_split_exp {
92	return q^\r?\n----------\r?\n^;
93	}
94
95
96	sub new
97	{
98	my ($class) = shift (@_);
99	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
100	push(@$pluginlist, $class);
101
102	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
103	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
104
105	my $self = (defined $hashArgOptLists)? new SplitPlug($pluginlist,$inputargs,$hashArgOptLists): new SplitPlug($pluginlist,$inputargs);
106
107	return bless $self, $class;
108	}
109
110
111	sub read_file
112	{
113	my $self = shift (@_);
114	my ($filename, $encoding, $language, $textref) = @_;
115
116	my ($databasename) = ($filename =~ /(.*)\.mst$/i);
117
118	# Check the associated .fdt and .xrf files exist
119	# These files must have a lowercase extension for the IsisGdl program to work
120	# Bailing out because of this is kind of crappy but it is only an issue on Unix
121	my $fdtfilename = $databasename . ".fdt";
122	if (! -e $fdtfilename) {
123	die "Error: Could not find ISIS FDT file $fdtfilename.\n";
124	}
125	my $xrffilename = $databasename . ".xrf";
126	if (! -e $xrffilename) {
127	die "Error: Could not find ISIS XRF file $xrffilename.\n";
128	}
129
130	# The text to split is exported from the database by the IsisGdl program
131	open(FILE, "IsisGdl \"$filename\" \|");
132
133	my $reader = new multiread();
134	$reader->set_handle('ISISPlug::FILE');
135	$reader->set_encoding($encoding);
136	$reader->read_file($textref);
137
138	close(FILE);
139
140	# Parse the associated ISIS database Field Definition Table file (.fdt)
141	my %fdtmapping = &parse_field_definition_table($fdtfilename);
142
143	# Map the tag numbers to tag names, using the FDT mapping
144	$$textref =~ s/\r?\ntag=(\d+) /\ntag=$fdtmapping{$1}{'title'} /g;
145
146	# Remove the line at the start so it is split and processed properly
147	$$textref =~ s/^----------\n//;
148	}
149
150
151	sub process
152	{
153	my $self = shift (@_);
154	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
155	my $outhandle = $self->{'outhandle'};
156
157	my $subfield_separator = $self->{'subfield_separator'};
158	my $entry_separator = $self->{'entry_separator'};
159
160	# Report that we're processing the file
161	print STDERR "<Processing n='$file' p='ISISPlug'>\n" if ($gli);
162	print $outhandle "IsisPlug: processing $file\n"
163	if ($self->{'verbosity'}) > 1;
164
165	# Process each line of the ISIS record, one at a time
166	foreach $line (split(/\n/, $$textref)) {
167	$line =~ /^tag=(.*) data=(.+)$/;
168	my $rawtagname = $1;
169	my $rawtagdata = $2;
170	# print STDERR "Raw tag: $rawtagname, Raw data: $rawtagdata\n";
171	next if ($rawtagname eq "");
172
173	# Metadata field names: title case, then remove spaces
174	my $tagname = "";
175	foreach $word (split(/\s+/, $rawtagname)) {
176	substr($word, 0, 1) =~ tr/a-z/A-Z/;
177	$tagname .= $word;
178	}
179
180	# Make sure there is nothing bad in the tag names
181	$tagname =~ s/&//g;
182
183	# Handle each piece of metadata ('%' separated)
184	my $completetagvalue = "";
185	foreach $rawtagvalue (split(/%/, $rawtagdata)) {
186	$completetagvalue .= $entry_separator unless ($completetagvalue eq "");
187
188	# Metadata field values: take care with subfields
189	my $completeentryvalue = "";
190	while ($rawtagvalue ne "") {
191	# If there is a subfield specifier, parse it off
192	my $subfieldname = "";
193	if ($rawtagvalue =~ s/^\^// && $rawtagvalue =~ s/([a-z])//) {
194	$subfieldname = "^$1";
195	}
196
197	# Parse the metadata value off
198	$rawtagvalue =~ s/^([^\^]*)//;
199	my $metadatafieldname = $tagname . $subfieldname;
200	my $metadatafieldvalue = $1;
201
202	# Handle Keywords specially
203	if ($metadatafieldname eq "Keywords") {
204	my $keywordmetadatavalue = $metadatafieldvalue;
205	my $keywordlist = "";
206	while ($keywordmetadatavalue =~ s/\<(.+?)\>//) {
207	my $keyword = $1;
208	$doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $keyword);
209	$keywordlist .= ", " unless ($keywordlist eq "");
210	$keywordlist .= $keyword;
211	}
212
213	$metadatafieldvalue = $keywordlist;
214	}
215
216	# Escape any '<' and '>' characters so they appear correctly in the final collection
217	$metadatafieldvalue =~ s/\</</g;
218	$metadatafieldvalue =~ s/\>/>/g;
219
220	# We have already added Keywords metadata above
221	unless ($metadatafieldname eq "Keywords") {
222	$doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $metadatafieldvalue);
223	}
224
225	$completeentryvalue .= $subfield_separator unless ($completeentryvalue eq "");
226	$completeentryvalue .= $metadatafieldvalue;
227	}
228
229	$completetagvalue .= $completeentryvalue;
230	}
231
232	$doc_obj->add_utf8_metadata($cursection, $tagname . "^all", $completetagvalue);
233	}
234
235	# Add the full record as the document text
236	$$textref =~ s/\</</g;
237	$$textref =~ s/\>/>/g;
238	$doc_obj->add_utf8_text($cursection, $$textref);
239
240	# Add FileFormat metadata
241	$doc_obj->add_utf8_metadata($cursection, "FileFormat", "CDS/ISIS");
242
243	# Record was processed successfully (and there was no document obtained)
244	return 1;
245	}
246
247
248	sub parse_field_definition_table
249	{
250	my $fdtfilename = shift(@_);
251
252	my %fdtmapping = ();
253
254	open(FDT_FILE, "<$fdtfilename") \|\| die "Error: Could not open file $fdtfilename.\n";
255
256	my $amongstdefinitions = 0;
257	foreach $fdtfileline (<FDT_FILE>) {
258	$fdtfileline =~ s/(\s*)$//; # Remove any nasty spaces at the end of the lines
259
260	if ($amongstdefinitions) {
261	my $fieldtitle = substr($fdtfileline, 0, 30);
262	my $fieldsubfields = substr($fdtfileline, 30, 20);
263	my $fieldspecs = substr($fdtfileline, 50);
264
265	# Remove extra spaces
266	$fieldtitle =~ s/(\s*)$//;
267	$fieldsubfields =~ s/(\s*)$//;
268
269	# Map from tag number to metadata field title and subfields
270	my $fieldtag = (split(/ /, $fieldspecs))[0];
271	$fdtmapping{$fieldtag} = { 'title' => $fieldtitle,
272	'subfields' => $fieldsubfields };
273	}
274	elsif ($fdtfileline eq "***") {
275	$amongstdefinitions = 1;
276	}
277	}
278
279	close(FDT_FILE);
280
281	return %fdtmapping;
282	}
283
284
285	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: