Context Navigation

source: trunk/gsdl/perllib/plugins/ISISPlug.pm@ 11090

Last change on this file since 11090 was 10254, checked in by kjdon, 19 years ago
added 'use strict' to all plugins, and made modifications (mostly adding 'my') to make them compile
Property svn:keywords set to `Author Date Id Revision`
File size: 8.4 KB

Line
1	###########################################################################
2	#
3	# ISISPlug.pm -- A plugin for CDS/ISIS databases
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 1999-2004 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package ISISPlug;
28
29
30	use multiread;
31	use SplitPlug;
32
33	use strict;
34	no strict 'refs'; # allow filehandles to be variables and viceversa
35
# Set up inheritance at compile time: ISISPlug derives from SplitPlug.
BEGIN {
    @ISISPlug::ISA = qw(SplitPlug);
}
40
41
# Argument descriptors for this plugin.  Each entry is a hash with the keys
# name, desc, type, reqd and deft (the default value).
my $arguments = [
    # Expression arguments; their defaults come from the get_default_*_exp
    # subs defined in this file.
    {
        name => 'process_exp',
        desc => '{BasPlug.process_exp}',
        type => 'regexp',
        reqd => 'no',
        deft => get_default_process_exp(),
    },
    {
        name => 'block_exp',
        desc => '{BasPlug.block_exp}',
        type => 'regexp',
        reqd => 'no',
        deft => get_default_block_exp(),
    },
    {
        name => 'split_exp',
        desc => '{SplitPlug.split_exp}',
        type => 'regexp',
        reqd => 'no',
        deft => get_default_split_exp(),
    },

    # The interesting options: the strings used to join repeated entries and
    # subfields when the combined "^all" metadata value is built.
    {
        name => 'entry_separator',
        desc => '{ISISPlug.entry_separator}',
        type => 'string',
        reqd => 'no',
        deft => '<br>',
    },
    {
        name => 'subfield_separator',
        desc => '{ISISPlug.subfield_separator}',
        type => 'string',
        reqd => 'no',
        deft => ', ',
    },
];
71
# Plugin-level option record; 'args' points at the argument list above.
my $options = {
    name     => 'ISISPlug',
    desc     => '{ISISPlug.desc}',
    abstract => 'no',
    inherits => 'yes',
    explodes => 'yes',
    args     => $arguments,
};
78
79
# Default process expression: file names ending in ".mst", in any letter case.
# Returns the expression as a plain (uncompiled) string.
sub get_default_process_exp {
    my $expression = q{(?i)(\.mst)$};
    return $expression;
}
84
85
# Default block expression: file names ending in ".fdt" or ".xrf", in any
# letter case.  Returns the expression as a plain (uncompiled) string.
#
# The alternation bar must be unescaped: "\|" matches a literal pipe
# character, which would stop the expression from matching either suffix.
sub get_default_block_exp {
    return q^(?i)(\.fdt|\.xrf)$^;
}
90
91
# Default split expression: a line consisting of ten dashes, with either
# LF or CRLF line endings around it.  Returns an uncompiled string.
sub get_default_split_exp {
    my $expression = q{\r?\n----------\r?\n};
    return $expression;
}
96
97
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to SplitPlug's constructor
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object built by SplitPlug's constructor, re-blessed into $class.
sub new {
    my $class = shift;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push @{$pluginlist}, $class;

    if (defined $arguments) {
        push @{ $hashArgOptLists->{'ArgList'} }, @{$arguments};
    }
    if (defined $options) {
        push @{ $hashArgOptLists->{'OptList'} }, $options;
    }

    my $self;
    if (defined $hashArgOptLists) {
        $self = SplitPlug->new($pluginlist, $inputargs, $hashArgOptLists);
    }
    else {
        $self = SplitPlug->new($pluginlist, $inputargs);
    }

    return bless $self, $class;
}
111
112
# Export the records of a CDS/ISIS master file as text, ready for splitting.
#
# Arguments:
#   $filename - path of the .mst file
#   $encoding - encoding name handed to the multiread reader
#   $language - accepted but not used here
#   $textref  - scalar ref that receives the exported text
#
# The text is produced by running the external IsisGdl program on the
# database.  Numeric tags in the output are then replaced by the field titles
# found in the matching .fdt file, and the leading "----------" line is removed.
#
# Dies if the file name does not end in ".mst", if the companion .fdt or .xrf
# file is missing, or if the IsisGdl pipe cannot be opened.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;

    # Without a .mst suffix the companion file names cannot be derived
    my ($databasename) = ($filename =~ /(.*)\.mst$/i);
    if (!defined $databasename) {
        die "Error: ISIS master file $filename does not have a .mst extension.\n";
    }

    # Check the associated .fdt and .xrf files exist
    # These files must have a lowercase extension for the IsisGdl program to work
    # Bailing out because of this is kind of crappy but it is only an issue on Unix
    my $fdtfilename = $databasename . ".fdt";
    if (! -e $fdtfilename) {
        die "Error: Could not find ISIS FDT file $fdtfilename.\n";
    }
    my $xrffilename = $databasename . ".xrf";
    if (! -e $xrffilename) {
        die "Error: Could not find ISIS XRF file $xrffilename.\n";
    }

    # The text to split is exported from the database by the IsisGdl program.
    # The bareword handle is kept because the reader below is given the
    # handle by its package-qualified name.
    # NOTE(review): $filename is interpolated into a shell command line; a
    # name containing a double quote, backtick or dollar sign would be
    # interpreted by the shell -- confirm that callers only pass trusted paths.
    open(FILE, "IsisGdl \"$filename\" |")
        or die "Error: Could not run IsisGdl on $filename: $!\n";

    my $reader = new multiread();
    $reader->set_handle('ISISPlug::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);

    close(FILE);

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my %fdtmapping = &parse_field_definition_table($fdtfilename);

    # Map the tag numbers to tag names, using the FDT mapping.  This must run
    # before the leading separator line is removed, because the pattern needs
    # a newline in front of every "tag=" (including the very first one).
    $$textref =~ s/\r?\ntag=(\d+) /\ntag=$fdtmapping{$1}{'title'} /g;

    # Remove the line at the start so it is split and processed properly
    $$textref =~ s/^----------\n//;
}
151
152
# Turn one exported ISIS record into metadata and text on a document object.
#
# Arguments:
#   $textref  - scalar ref holding the record, one "tag=NAME data=VALUE" per line
#   $pluginfo, $base_dir, $metadata - accepted but not used here
#   $file     - file name, used only in progress messages
#   $doc_obj  - document object; must provide get_top_section,
#               add_utf8_metadata and add_utf8_text
#   $gli      - when true, a <Processing ...> line is written to STDERR
#
# For every record line:
#   * the tag name is converted to a metadata name by upper-casing the first
#     letter of each word, joining the words, and removing '&';
#   * the data is split on '%' into entries, and each entry into subfields
#     introduced by "^x" (x a lowercase letter); each subfield is stored as
#     metadata named "<Tag>^x" (or "<Tag>" when there is no subfield letter);
#   * "Keywords" values are split into their <...> delimited keywords and each
#     keyword is stored separately;
#   * a combined "<Tag>^all" value is stored, with subfields joined by the
#     subfield_separator and entries joined by the entry_separator.
#
# Finally the whole record (with '<' and '>' escaped) becomes the document
# text and FileFormat metadata "CDS/ISIS" is added.  Always returns 1.
sub process
{
    my $self = shift (@_);
    my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    my $cursection = $doc_obj->get_top_section();
    my $subfield_separator = $self->{'subfield_separator'};
    my $entry_separator = $self->{'entry_separator'};

    # Report that we're processing the file
    print STDERR "<Processing n='$file' p='ISISPlug'>\n" if ($gli);
    print $outhandle "IsisPlug: processing $file\n"
        if ($self->{'verbosity'}) > 1;

    # Process each line of the ISIS record, one at a time
    foreach my $line (split(/\n/, $$textref)) {
        # Only use the captures when the match succeeded; after a failed
        # match they do not describe the current line.
        next unless ($line =~ /^tag=(.*) data=(.+)$/);
        my ($rawtagname, $rawtagdata) = ($1, $2);
        next if ($rawtagname eq "");

        # Metadata field names: title case, then remove spaces
        my $tagname = "";
        foreach my $word (split(/\s+/, $rawtagname)) {
            substr($word, 0, 1) =~ tr/a-z/A-Z/;
            $tagname .= $word;
        }

        # Make sure there is nothing bad in the tag names
        $tagname =~ s/&//g;

        # Handle each piece of metadata ('%' separated)
        my $completetagvalue = "";
        foreach my $rawtagvalue (split(/%/, $rawtagdata)) {
            $completetagvalue .= $entry_separator unless ($completetagvalue eq "");

            # Metadata field values: take care with subfields
            my $completeentryvalue = "";
            while ($rawtagvalue ne "") {
                # If there is a subfield specifier, parse it off.  The '^' is
                # always consumed (so the loop makes progress); the letter is
                # taken only when it directly follows the '^', never from
                # further inside the value.
                my $subfieldname = "";
                if ($rawtagvalue =~ s/^\^//) {
                    if ($rawtagvalue =~ s/^([a-z])//) {
                        $subfieldname = "^$1";
                    }
                }

                # Parse the metadata value off (everything up to the next '^')
                my $metadatafieldvalue = "";
                if ($rawtagvalue =~ s/^([^\^]*)//) {
                    $metadatafieldvalue = $1;
                }
                my $metadatafieldname = $tagname . $subfieldname;

                # Handle Keywords specially: each <...> delimited keyword is
                # stored as its own metadata value
                if ($metadatafieldname eq "Keywords") {
                    my $keywordmetadatavalue = $metadatafieldvalue;
                    my $keywordlist = "";
                    while ($keywordmetadatavalue =~ s/\<(.+?)\>//) {
                        my $keyword = $1;
                        $doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $keyword);
                        $keywordlist .= ", " unless ($keywordlist eq "");
                        $keywordlist .= $keyword;
                    }

                    $metadatafieldvalue = $keywordlist;
                }

                # Escape any '<' and '>' characters so they appear correctly in the final collection
                $metadatafieldvalue =~ s/\</&lt;/g;
                $metadatafieldvalue =~ s/\>/&gt;/g;

                # We have already added Keywords metadata above
                unless ($metadatafieldname eq "Keywords") {
                    $doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $metadatafieldvalue);
                }

                $completeentryvalue .= $subfield_separator unless ($completeentryvalue eq "");
                $completeentryvalue .= $metadatafieldvalue;
            }

            $completetagvalue .= $completeentryvalue;
        }

        $doc_obj->add_utf8_metadata($cursection, $tagname . "^all", $completetagvalue);
    }

    # Add the full record as the document text, with '<' and '>' escaped
    $$textref =~ s/\</&lt;/g;
    $$textref =~ s/\>/&gt;/g;
    $doc_obj->add_utf8_text($cursection, $$textref);

    # Add FileFormat metadata
    $doc_obj->add_utf8_metadata($cursection, "FileFormat", "CDS/ISIS");

    # Record was processed successfully
    return 1;
}
249
250
# Parse a CDS/ISIS Field Definition Table (.fdt) file.
#
# Argument:
#   $fdtfilename - path of the .fdt file
#
# Everything up to and including the "***" line is ignored.  Each following
# line is read as fixed-width columns: a field title (30 characters), the
# subfield letters (20 characters), then space-separated specifications whose
# first item is the tag number.
#
# Returns a hash (as a list) mapping tag number to
#   { 'title' => <field title>, 'subfields' => <subfield letters> }.
# Dies if the file cannot be opened.
sub parse_field_definition_table
{
    my $fdtfilename = shift(@_);

    my %fdtmapping = ();

    open(my $fdt_handle, '<', $fdtfilename)
        or die "Error: Could not open file $fdtfilename.\n";

    my $amongstdefinitions = 0;
    while (defined(my $fdtfileline = <$fdt_handle>)) {
        $fdtfileline =~ s/\s+$//; # Remove any nasty spaces at the end of the lines

        if ($amongstdefinitions) {
            # A definition needs at least one character in the specification
            # column; shorter lines (blank lines, stray text) carry no tag.
            next if (length($fdtfileline) <= 50);

            my $fieldtitle = substr($fdtfileline, 0, 30);
            my $fieldsubfields = substr($fdtfileline, 30, 20);
            my $fieldspecs = substr($fdtfileline, 50);

            # Remove extra spaces
            $fieldtitle =~ s/\s+$//;
            $fieldsubfields =~ s/\s+$//;

            # Map from tag number to metadata field title and subfields.
            # Splitting on ' ' skips any leading whitespace before the tag.
            my ($fieldtag) = split(' ', $fieldspecs);
            next unless (defined $fieldtag && $fieldtag ne "");
            $fdtmapping{$fieldtag} = { 'title' => $fieldtitle,
                                       'subfields' => $fieldsubfields };
        }
        elsif ($fdtfileline eq "***") {
            $amongstdefinitions = 1;
        }
    }

    close($fdt_handle);

    return %fdtmapping;
}
286
287
288	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: