Context Navigation

source: trunk/gsdl/perllib/plugins/ISISPlug.pm@ 7703

Last change on this file since 7703 was 7693, checked in by mdewsnip, 20 years ago
Improvements to the new code so that it works on Windows as well as Unix.
Property svn:keywords set to `Author Date Id Revision`
File size: 13.8 KB

Line
1	###########################################################################
2	#
3	# ISISPlug.pm -- A plugin for CDS/ISIS databases
4	#
5	# A component of the Greenstone digital library software
6	# from the New Zealand Digital Library Project at the
7	# University of Waikato, New Zealand.
8	#
9	# Copyright 1999-2004 New Zealand Digital Library Project
10	#
11	# This program is free software; you can redistribute it and/or modify
12	# it under the terms of the GNU General Public License as published by
13	# the Free Software Foundation; either version 2 of the License, or
14	# (at your option) any later version.
15	#
16	# This program is distributed in the hope that it will be useful,
17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
19	# GNU General Public License for more details.
20	#
21	# You should have received a copy of the GNU General Public License
22	# along with this program; if not, write to the Free Software
23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
24	#
25	###########################################################################
26
27	package ISISPlug;
28
29
30	use multiread;
31	use SplitPlug;
32
33
34	# ISISPlug is a sub-class of SplitPlug.
35	sub BEGIN {
36	@ISA = ('SplitPlug');
37	}
38
39
40	my $arguments =
41	[ { 'name' => "process_exp",
42	'desc' => "{BasPlug.process_exp}",
43	'type' => "regexp",
44	'reqd' => "no",
45	'deft' => &get_default_process_exp() },
46	{ 'name' => "block_exp",
47	'desc' => "{BasPlug.block_exp}",
48	'type' => "regexp",
49	'reqd' => "no",
50	'deft' => &get_default_block_exp() },
51	{ 'name' => "split_exp",
52	'desc' => "{SplitPlug.split_exp}",
53	'type' => "regexp",
54	'reqd' => "no",
55	'deft' => &get_default_split_exp() },
56
57	# The interesting options
58	{ 'name' => "entry_separator",
59	'desc' => "{ISISPlug.entry_separator}",
60	'type' => "string",
61	'reqd' => "no",
62	'deft' => "<br>" },
63	{ 'name' => "subfield_separator",
64	'desc' => "{ISISPlug.subfield_separator}",
65	'type' => "string",
66	'reqd' => "no",
67	'deft' => ", " },
68	{ 'name' => "document_field",
69	'desc' => "{ISISPlug.document_field}",
70	'type' => "string",
71	'reqd' => "no",
72	'deft' => "" },
73	{ 'name' => "document_prefix",
74	'desc' => "{ISISPlug.document_prefix}",
75	'type' => "string",
76	'reqd' => "no",
77	'deft' => "" },
78	{ 'name' => "document_suffix",
79	'desc' => "{ISISPlug.document_suffix}",
80	'type' => "string",
81	'reqd' => "no",
82	'deft' => "" }
83	];
84
85	my $options = { 'name' => "ISISPlug",
86	'desc' => "{ISISPlug.desc}",
87	'abstract' => "no",
88	'inherits' => "yes",
89	'args' => $arguments };
90
91
92	# This plugin processes files with the suffix ".mst"
93	sub get_default_process_exp {
94	return q^(?i)(\.mst)$^;
95	}
96
97
98	# This plugin blocks files with the suffix ".fdt" and ".xrf"
99	sub get_default_block_exp {
100	return q^(?i)(\.fdt\|\.xrf)$^;
101	}
102
103
104	# This plugin splits the input text at the "----------" lines
105	sub get_default_split_exp {
106	return q^\n----------\n^;
107	}
108
109
110	sub new {
111	my $class = shift(@_);
112
113	my $self = new SplitPlug($class, @_);
114	if (!parsargv::parse(\@_,
115	q^subfield_separator/.*/, ^, \$self->{'subfield_separator'},
116	q^entry_separator/.*/<br>^, \$self->{'entry_separator'},
117	q^document_field/.*/^, \$self->{'document_field'},
118	q^document_prefix/.*/^, \$self->{'document_prefix'},
119	q^document_suffix/.*/^, \$self->{'document_suffix'},
120	"allow_extra_options")) {
121	print STDERR "\nIncorrect options passed to ISISPlug, check your collect.cfg configuration file\n";
122	die "\n";
123	}
124
125	# To allow for proper inheritance of arguments
126	my $option_list = $self->{'option_list'};
127	push(@{$option_list}, $options);
128	$self->{'plugin_type'} = "ISISPlug";
129
130	return bless $self, $class;
131	}
132
133
134	sub read
135	{
136	my $self = shift(@_);
137	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs) = @_;
138
139	my $result = &SplitPlug::read($self, @_);
140	if ($file =~ /$self->{'process_exp'}/ && $self->{'document_field'}) {
141	&end_metadata_xml_file($self->{'documents_metadata_xml_file'});
142	}
143
144	return $result;
145	}
146
147
148	sub read_file {
149	my $self = shift (@_);
150	my ($filename, $encoding, $language, $textref) = @_;
151
152	my ($databasename) = ($filename =~ /([^\.]+)\.mst$/i);
153
154	# Check the associated .fdt and .xrf files exist
155	# These files must have a lowercase extension for the IsisGdl program to work
156	# Bailing out because of this is kind of crappy but it is only an issue on Unix
157	my $fdtfilename = $databasename . ".fdt";
158	if (! -e $fdtfilename) {
159	die "Error: Could not find ISIS FDT file $fdtfilename.\n";
160	}
161	my $xrffilename = $databasename . ".xrf";
162	if (! -e $xrffilename) {
163	die "Error: Could not find ISIS XRF file $xrffilename.\n";
164	}
165
166	# The text to split is exported from the database by the IsisGdl program
167	open(FILE, "IsisGdl \"$filename\" \|");
168
169	my $reader = new multiread();
170	$reader->set_handle ('ISISPlug::FILE');
171	$reader->set_encoding ($encoding);
172	$reader->read_file ($textref);
173
174	close(FILE);
175
176	# Parse the associated ISIS database Field Definition Table file (.fdt)
177	my %fdtmapping = &parse_field_definition_table($fdtfilename);
178
179	# Map the tag numbers to tag names, using the FDT mapping
180	$$textref =~ s/\ntag=(\d+) /\ntag=$fdtmapping{$1}{'title'} /g;
181
182	# Remove the line at the start so it is split and processed properly
183	$$textref =~ s/^----------\n//;
184
185	# Obtain the documents specified in the CDS/ISIS database, if requested
186	if ($self->{'document_field'}) {
187	# Create a directory to store the document files
188	$self->{'documents_directory'} = $databasename . ".all";
189	if (-e $self->{'documents_directory'}) {
190	&util::rm_r($self->{'documents_directory'});
191	}
192	&util::mk_dir($self->{'documents_directory'});
193
194	# ...and a metadata.xml file for the document metadata (extracted from the database)
195	$self->{'documents_metadata_xml_file'} = &util::filename_cat($self->{'documents_directory'}, "metadata.xml");
196	if (-e $self->{'documents_metadata_xml_file'}) {
197	&util::rm($self->{'documents_metadata_xml_file'});
198	}
199	&begin_metadata_xml_file($self->{'documents_metadata_xml_file'});
200	}
201	}
202
203
204	sub process
205	{
206	my $self = shift (@_);
207	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
208	my $outhandle = $self->{'outhandle'};
209
210	my $subfield_separator = $self->{'subfield_separator'};
211	my $entry_separator = $self->{'entry_separator'};
212
213	# Report that we're processing the file
214	print STDERR "<Processing n='$file' p='ISISPlug'>\n" if ($gli);
215	print $outhandle "IsisPlug: processing $file\n"
216	if ($self->{'verbosity'}) > 1;
217
218	# Process each line of the ISIS record, one at a time
219	foreach $line (split(/\n/, $$textref)) {
220	$line =~ /^tag=(.+) data=(.+)$/;
221	my $rawtagname = $1;
222	my $rawtagdata = $2;
223	# print "Raw tag: $rawtagname, Raw data: $rawtagdata\n";
224
225	# Metadata field names: title case, then remove spaces
226	my $tagname = "";
227	foreach $word (split(/\s+/, $rawtagname)) {
228	substr($word, 0, 1) =~ tr/a-z/A-Z/;
229	$tagname .= $word;
230	}
231
232	# Make sure there is nothing bad in the tag names
233	$tagname =~ s/&//g;
234
235	# Handle each piece of metadata ('%' separated)
236	my $completetagvalue = "";
237	foreach $rawtagvalue (split(/%/, $rawtagdata)) {
238	$completetagvalue .= $entry_separator unless ($completetagvalue eq "");
239
240	# Metadata field values: take care with subfields
241	my $completeentryvalue = "";
242	while ($rawtagvalue ne "") {
243	# If there is a subfield specifier, parse it off
244	my $subfieldname = "";
245	if ($rawtagvalue =~ s/^\^([a-z])//) {
246	$subfieldname = "." . $1;
247	}
248
249	# Parse the metadata value off
250	$rawtagvalue =~ s/^([^\^]*)//;
251	my $metadatafieldname = $tagname . $subfieldname;
252	my $metadatafieldvalue = $1;
253	# print "Metadata: $metadatafieldname -> $metadatafieldvalue\n";
254
255	# Handle Keywords specially
256	if ($metadatafieldname eq "Keywords") {
257	my $keywordmetadatavalue = $metadatafieldvalue;
258	my $keywordlist = "";
259	while ($keywordmetadatavalue =~ s/\<([^\>]+)\>//) {
260	my $keyword = $1;
261	$doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $keyword);
262	$keywordlist .= ", " unless ($keywordlist eq "");
263	$keywordlist .= $keyword;
264	}
265
266	$metadatafieldvalue = $keywordlist;
267	}
268
269	else {
270	$doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $metadatafieldvalue);
271	}
272
273	$completeentryvalue .= $subfield_separator unless ($completeentryvalue eq "");
274	$completeentryvalue .= $metadatafieldvalue;
275	}
276
277	$completetagvalue .= $completeentryvalue;
278	}
279
280	# print "Metadata: $tagname.all -> $completetagvalue\n";
281	$doc_obj->add_utf8_metadata($cursection, $tagname . ".all", $completetagvalue);
282	}
283	# print "\n";
284
285	# Add the full record as the document text
286	$$textref =~ s/\</</g;
287	$$textref =~ s/\>/>/g;
288	$doc_obj->add_utf8_text ($cursection, $$textref);
289
290	# Obtain the documents specified in the CDS/ISIS database, if requested
291	if ($self->{'document_field'}) {
292	my $document_field = $self->{'document_field'};
293	my $document_prefix = $self->{'document_prefix'} \|\| "";
294	my $document_suffix = $self->{'document_suffix'} \|\| "";
295
296	my $documents_directory = $self->{'documents_directory'};
297	my $document_obtained = 0;
298
299	# Look at all the metadata assigned to this record
300	my $record_metadata = $doc_obj->get_all_metadata($cursection);
301	foreach my $pair (@$record_metadata) {
302	my ($field, $value) = (@$pair);
303
304	# Does this metadata element specify a document to obtain?
305	if ($field eq $document_field) {
306	my $document_file_full = $document_prefix . $value . $document_suffix;
307
308	my $document_file = &obtain_document($self, $document_file_full, $documents_directory);
309	if ($document_file) {
310	$document_obtained = 1;
311	&write_metadata_xml_file($self->{'documents_metadata_xml_file'},
312	$document_file, $record_metadata);
313	}
314	}
315	}
316
317	# If there was a document obtained for this record we don't want the record as well
318	if ($document_obtained) {
319	return 0;
320	}
321	}
322
323	# Record was processed successfully (and there was no document obtained)
324	return 1;
325	}
326
327
328	sub parse_field_definition_table
329	{
330	my $fdtfilename = shift(@_);
331
332	my %fdtmapping = ();
333
334	open(FDT_FILE, "<$fdtfilename") \|\| die "Error: Could not open file $fdtfilename.\n";
335
336	my $amongstdefinitions = 0;
337	foreach $fdtfileline (<FDT_FILE>) {
338	$fdtfileline =~ s/(\s*)$//; # Remove any nasty spaces at the end of the lines
339
340	if ($amongstdefinitions) {
341	my $fieldtitle = substr($fdtfileline, 0, 30);
342	my $fieldsubfields = substr($fdtfileline, 30, 20);
343	my $fieldspecs = substr($fdtfileline, 50);
344
345	# Remove extra spaces
346	$fieldtitle =~ s/(\s*)$//;
347	$fieldsubfields =~ s/(\s*)$//;
348
349	# Map from tag number to metadata field title and subfields
350	my $fieldtag = (split(/ /, $fieldspecs))[0];
351	$fdtmapping{$fieldtag} = { 'title' => $fieldtitle,
352	'subfields' => $fieldsubfields };
353	}
354	elsif ($fdtfileline eq "***") {
355	$amongstdefinitions = 1;
356	}
357	}
358
359	close(FDT_FILE);
360
361	return %fdtmapping;
362	}
363
364
365	sub obtain_document
366	{
367	my $self = shift(@_);
368	my $document_file_full = shift(@_);
369	my $documents_directory = shift(@_);
370
371	my $outhandle = $self->{'outhandle'};
372	print $outhandle "Obtaining document file $document_file_full...\n"
373	if ($self->{'verbosity'} > 1);
374
375	my $document_file_name;
376	my $local_document_file;
377
378	# Document specified is on the web
379	if ($document_file_full =~ /^http:/ \|\| $document_file_full =~ /^ftp:/) {
380	$document_file_full =~ /([^\/]+)$/;
381	$document_file_name = $1;
382	$local_document_file = &util::filename_cat($documents_directory, $document_file_name);
383
384	my $wget_options = "--quiet";
385	$wget_options = "--verbose" if ($self->{'verbosity'} > 2);
386	$wget_options .= " --timestamping"; # Only re-download files if they're newer
387	`wget $wget_options $document_file_full --output-document $local_document_file`;
388	}
389	# Document specified is on the disk
390	else {
391	my $dir_sep = &util::get_os_dirsep();
392	$document_file_full =~ /(.+$dir_sep)?(.*)$/;
393	$document_file_name = $2;
394	$local_document_file = &util::filename_cat($documents_directory, $document_file_name);
395
396	&util::cp($document_file_full, $documents_directory);
397	}
398
399	# Check the document was obtained successfully
400	if (!-e $local_document_file) {
401	print STDERR "WARNING: Could not obtain document file $document_file_full\n";
402	return undef;
403	}
404
405	return $document_file_name;
406	}
407
408
409	sub begin_metadata_xml_file
410	{
411	my $metadata_xml_file = shift(@_);
412
413	open(METADATA_XML_FILE, ">$metadata_xml_file");
414	print METADATA_XML_FILE
415	"<?xml version=\"1.0\" encoding=\"UTF-8\" standalone=\"no\"?>\n" .
416	"<!DOCTYPE DirectoryMetadata SYSTEM \"http://greenstone.org/dtd/DirectoryMetadata/1.0/DirectoryMetadata.dtd\">\n" .
417	"<DirectoryMetadata>\n";
418	close(METADATA_XML_FILE);
419	}
420
421
422	sub write_metadata_xml_file
423	{
424	my $metadata_xml_file = shift(@_);
425	my $file_name = shift(@_);
426	my $record_metadata = shift(@_);
427
428	# Make $file_name XML-safe
429	$file_name =~ s/</</g;
430	$file_name =~ s/>/>/g;
431
432	open(METADATA_XML_FILE, ">>$metadata_xml_file");
433
434	print METADATA_XML_FILE
435	"\n" .
436	" <FileSet>\n" .
437	" <FileName>$file_name</FileName>\n" .
438	" <Description>\n";
439
440	foreach my $pair (@$record_metadata) {
441	my ($field, $value) = (@$pair);
442
443	# We're only interested in metadata from the database
444	next if ($field eq "gsdlsourcefilename");
445	next if ($field eq "gsdldoctype");
446	next if ($field eq "Language");
447	next if ($field eq "Encoding");
448	next if ($field eq "Identifier");
449	next if ($field eq "Source");
450	next if ($field eq "SourceSegment");
451	next if ($field eq "Plugin");
452
453	# Make $value XML-safe
454	$value =~ s/</</g;
455	$value =~ s/>/>/g;
456
457	print METADATA_XML_FILE " <Metadata name=\"$field\">$value</Metadata>\n";
458	}
459
460	print METADATA_XML_FILE
461	" </Description>\n" .
462	" </FileSet>\n";
463
464	close(METADATA_XML_FILE);
465	}
466
467
468	sub end_metadata_xml_file
469	{
470	my $metadata_xml_file = shift(@_);
471
472	open(METADATA_XML_FILE, ">>$metadata_xml_file");
473	print METADATA_XML_FILE "\n</DirectoryMetadata>\n";
474	close(METADATA_XML_FILE);
475	}
476
477
478	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: