Context Navigation

source: trunk/gsdl/perllib/plugins/ISISPlug.pm@ 11335

Last change on this file since 11335 was 11335, checked in by mdewsnip, 18 years ago
Now sends messages to the GLI telling the user the CDS/ISIS file is being processed.
Property svn:keywords set to `Author Date Id Revision`
File size: 11.3 KB

Rev	Line
[6107]	1	###########################################################################
	2	#
	3	# ISISPlug.pm -- A plugin for CDS/ISIS databases
	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
[7686]	9	# Copyright 1999-2004 New Zealand Digital Library Project
[6107]	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
	27	package ISISPlug;
	28
	29
	30	use multiread;
	31	use SplitPlug;
	32
[10254]	33	use strict;
	34	no strict 'refs'; # allow filehandles to be variables and viceversa
[6107]	35
	36	# ISISPlug is a sub-class of SplitPlug.
	37	sub BEGIN {
[10254]	38	@ISISPlug::ISA = ('SplitPlug');
[6107]	39	}
	40
	41
	42	my $arguments =
[7686]	43	[ { 'name' => "process_exp",
[6408]	44	'desc' => "{BasPlug.process_exp}",
	45	'type' => "regexp",
	46	'reqd' => "no",
[6107]	47	'deft' => &get_default_process_exp() },
	48	{ 'name' => "block_exp",
	49	'desc' => "{BasPlug.block_exp}",
[6408]	50	'type' => "regexp",
[7686]	51	'reqd' => "no",
[11329]	52	'deft' => &get_default_block_exp(),
	53	'hiddengli' => "yes" },
[7686]	54	{ 'name' => "split_exp",
	55	'desc' => "{SplitPlug.split_exp}",
	56	'type' => "regexp",
	57	'reqd' => "no",
[11295]	58	'deft' => &get_default_split_exp(),
	59	'hiddengli' => "yes" },
[7686]	60
	61	# The interesting options
	62	{ 'name' => "entry_separator",
	63	'desc' => "{ISISPlug.entry_separator}",
	64	'type' => "string",
	65	'reqd' => "no",
	66	'deft' => "<br>" },
[6107]	67	{ 'name' => "subfield_separator",
	68	'desc' => "{ISISPlug.subfield_separator}",
	69	'type' => "string",
	70	'reqd' => "no",
[8563]	71	'deft' => ", " }
[6408]	72	];
[6107]	73
	74	my $options = { 'name' => "ISISPlug",
	75	'desc' => "{ISISPlug.desc}",
[6408]	76	'abstract' => "no",
	77	'inherits' => "yes",
[8762]	78	'explodes' => "yes",
[6107]	79	'args' => $arguments };
	80
	81
	82	# This plugin processes files with the suffix ".mst"
	83	sub get_default_process_exp {
	84	return q^(?i)(\.mst)$^;
	85	}
	86
	87
	88	# This plugin blocks files with the suffix ".fdt" and ".xrf"
	89	sub get_default_block_exp {
	90	return q^(?i)(\.fdt\|\.xrf)$^;
	91	}
	92
	93
	94	# This plugin splits the input text at the "----------" lines
	95	sub get_default_split_exp {
[9998]	96	return q^\r?\n----------\r?\n^;
[6107]	97	}
	98
	99
[8563]	100	sub new
	101	{
[10218]	102	my ($class) = shift (@_);
	103	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	104	push(@$pluginlist, $class);
[6107]	105
[10218]	106	if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
	107	if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};
[6107]	108
[10218]	109	my $self = (defined $hashArgOptLists)? new SplitPlug($pluginlist,$inputargs,$hashArgOptLists): new SplitPlug($pluginlist,$inputargs);
[6107]	110
	111	return bless $self, $class;
	112	}
	113
	114
[8563]	115	sub read_file
[7686]	116	{
[6107]	117	my $self = shift (@_);
	118	my ($filename, $encoding, $language, $textref) = @_;
[11334]	119	my $outhandle = $self->{'outhandle'};
[6107]	120
[11334]	121	my ($database_file_path_root) = ($filename =~ /(.*)\.mst$/i);
	122	my $mst_file_path_relative = $filename;
	123	$mst_file_path_relative =~ s/^.+import.(.*?)$/$1/;
[6107]	124
[7048]	125	# Check the associated .fdt and .xrf files exist
	126	# These files must have a lowercase extension for the IsisGdl program to work
	127	# Bailing out because of this is kind of crappy but it is only an issue on Unix
[11334]	128	$self->{'fdt_file_path'} = $database_file_path_root . ".fdt";
	129	if (!-e $self->{'fdt_file_path'}) {
	130	print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS FDT file $self->{'fdt_file_path'}'>\n" if ($self->{'gli'});
	131	print $outhandle "Error: Could not find ISIS FDT file " . $self->{'fdt_file_path'} . ".\n";
	132	return;
[7048]	133	}
[11334]	134	$self->{'xrf_file_path'} = $database_file_path_root . ".xrf";
	135	if (!-e $self->{'xrf_file_path'}) {
	136	print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS XRF file $self->{'xrf_file_path'}'>\n" if ($self->{'gli'});
	137	print $outhandle "Error: Could not find ISIS XRF file " . $self->{'xrf_file_path'} . ".\n";
	138	return;
[7048]	139	}
	140
[6107]	141	# The text to split is exported from the database by the IsisGdl program
[7021]	142	open(FILE, "IsisGdl \"$filename\" \|");
[6107]	143
	144	my $reader = new multiread();
[8563]	145	$reader->set_handle('ISISPlug::FILE');
	146	$reader->set_encoding($encoding);
	147	$reader->read_file($textref);
[6107]	148
	149	close(FILE);
	150
	151	# Parse the associated ISIS database Field Definition Table file (.fdt)
[11334]	152	my %fdt_mapping = &parse_field_definition_table($self->{'fdt_file_path'}, $encoding);
[11332]	153	$self->{'fdt_mapping'} = \%fdt_mapping;
[6107]	154
[7686]	155	# Remove the line at the start so it is split and processed properly
	156	$$textref =~ s/^----------\n//;
[6107]	157	}
	158
	159
	160	sub process
	161	{
	162	my $self = shift (@_);
[6332]	163	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[6107]	164	my $outhandle = $self->{'outhandle'};
	165
[11298]	166	my $section = $doc_obj->get_top_section();
	167	my $fdt_mapping = $self->{'fdt_mapping'};
[6107]	168	my $subfield_separator = $self->{'subfield_separator'};
	169	my $entry_separator = $self->{'entry_separator'};
	170
	171	# Report that we're processing the file
[11335]	172	print STDERR "\n<Processing n='$file' p='ISISPlug'>\n" if ($gli);
[11298]	173	print $outhandle "IsisPlug: processing $file\n" if ($self->{'verbosity'}) > 1;
[6107]	174
	175	# Process each line of the ISIS record, one at a time
[10254]	176	foreach my $line (split(/\n/, $$textref)) {
[8646]	177	$line =~ /^tag=(.*) data=(.+)$/;
[11298]	178	my $tag = $1;
	179	my $tag_data = $2;
	180	# print STDERR "\nTag: $tag, Data: $tag_data\n";
[6107]	181
[11298]	182	# Convert the tag number into a name, and remove any invalid characters
	183	my $raw_metadata_name = $fdt_mapping->{$tag}{'name'} \|\| "";
[11300]	184	$raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
[11298]	185	next if ($raw_metadata_name eq "");
	186
[6107]	187	# Metadata field names: title case, then remove spaces
[11298]	188	my $metadata_name = "";
	189	foreach my $word (split(/\s+/, $raw_metadata_name)) {
[6107]	190	substr($word, 0, 1) =~ tr/a-z/A-Z/;
[11298]	191	$metadata_name .= $word;
[6107]	192	}
	193
[11298]	194	my $all_metadata_name = $metadata_name . "^all";
	195	my $all_metadata_value = "";
[6123]	196
[11298]	197	# Handle repeatable fields
	198	if ($fdt_mapping->{$tag}{'repeatable'}) {
	199	# Multiple values are separated using the '%' character
	200	foreach my $raw_metadata_value (split(/%/, $tag_data)) {
	201	my $metadata_value = "";
[6107]	202
[11298]	203	# Handle subfields
	204	while ($raw_metadata_value ne "") {
	205	# If there is a subfield specifier, parse it off
	206	my $sub_metadata_name = $metadata_name;
[11299]	207	if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
[11298]	208	$sub_metadata_name .= "^$1";
	209	}
	210
	211	# Parse the value off and add it as metadata
	212	$raw_metadata_value =~ s/^([^\^]*)//;
	213	my $sub_metadata_value = $1;
	214
	215	# Escape any '<' and '>' characters so they appear correctly in the final collection
	216	$sub_metadata_value =~ s/\</</g;
	217	$sub_metadata_value =~ s/\>/>/g;
	218
	219	# print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
	220	if ($sub_metadata_name ne $metadata_name) {
	221	$doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
	222	}
	223
	224	$metadata_value .= $subfield_separator unless ($metadata_value eq "");
	225	$metadata_value .= $sub_metadata_value;
	226	}
	227
	228	# Add the metadata value
	229	# print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
	230	$doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);
	231
	232	$all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
	233	$all_metadata_value .= $metadata_value;
	234	}
	235	}
	236
	237	# Handle non-repeatable fields
	238	else {
	239	my $raw_metadata_value = $tag_data;
	240	my $metadata_value = "";
	241
	242	# Handle subfields
	243	while ($raw_metadata_value ne "") {
[6107]	244	# If there is a subfield specifier, parse it off
[11298]	245	my $sub_metadata_name = $metadata_name;
	246	if ($raw_metadata_value =~ s/^(\^[a-z])//) {
	247	$sub_metadata_name .= $1;
[6107]	248	}
	249
[11298]	250	# Parse the value off and add it as metadata
	251	$raw_metadata_value =~ s/^([^\^]*)//;
	252	my $sub_metadata_value = $1;
	253
	254	# Deal with the case when multiple values are specified using <...>
	255	if ($sub_metadata_value =~ /\<(.*)\>$/) {
	256	my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
	257	my $tmp_sub_metadata_value = $sub_metadata_value;
	258	while ($tmp_sub_metadata_value =~ s/\<(.*?)\>//) {
	259	my $sub_sub_metadata_value = $1;
	260	$doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
[6107]	261	}
	262	}
	263
[9998]	264	# Escape any '<' and '>' characters so they appear correctly in the final collection
[11298]	265	$sub_metadata_value =~ s/\</</g;
	266	$sub_metadata_value =~ s/\>/>/g;
[9998]	267
[11298]	268	# print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
	269	if ($sub_metadata_name ne $metadata_name) {
	270	$doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
[6107]	271	}
	272
[11298]	273	$metadata_value .= $subfield_separator unless ($metadata_value eq "");
	274	$metadata_value .= $sub_metadata_value;
[6107]	275	}
	276
[11298]	277	# Add the metadata value
	278	# print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
	279	$doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);
	280
	281	$all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
	282	$all_metadata_value .= $metadata_value;
[6107]	283	}
	284
[11298]	285	# Add the "^all" metadata value
	286	# print STDERR "All metadata name: $all_metadata_name, value: $all_metadata_value\n";
	287	$doc_obj->add_utf8_metadata($section, $all_metadata_name, $all_metadata_value);
[6107]	288	}
[8563]	289
[6107]	290	# Add the full record as the document text
	291	$$textref =~ s/\</</g;
	292	$$textref =~ s/\>/>/g;
[11298]	293	$doc_obj->add_utf8_text($section, $$textref);
[6107]	294
[8563]	295	# Add FileFormat metadata
[11298]	296	$doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");
[7686]	297
[11298]	298	# Record was processed successfully
[6107]	299	return 1;
	300	}
	301
	302
	303	sub parse_field_definition_table
	304	{
[7686]	305	my $fdtfilename = shift(@_);
[11262]	306	my $encoding = shift(@_);
[6107]	307
[7686]	308	my %fdtmapping = ();
[6107]	309
	310	open(FDT_FILE, "<$fdtfilename") \|\| die "Error: Could not open file $fdtfilename.\n";
	311
[11262]	312	my $fdtfiletext = "";
	313	my $reader = new multiread();
	314	$reader->set_handle('ISISPlug::FDT_FILE');
	315	$reader->set_encoding($encoding);
	316	$reader->read_file($fdtfiletext);
	317
[7686]	318	my $amongstdefinitions = 0;
[11262]	319	foreach my $fdtfileline (split(/\n/, $$fdtfiletext)) {
[6107]	320	$fdtfileline =~ s/(\s*)$//; # Remove any nasty spaces at the end of the lines
	321
	322	if ($amongstdefinitions) {
[11298]	323	my $fieldname = substr($fdtfileline, 0, 30);
[7686]	324	my $fieldsubfields = substr($fdtfileline, 30, 20);
	325	my $fieldspecs = substr($fdtfileline, 50);
[6107]	326
	327	# Remove extra spaces
[11298]	328	$fieldname =~ s/(\s*)$//;
[6107]	329	$fieldsubfields =~ s/(\s*)$//;
[11298]	330	$fieldspecs =~ s/(\s*)$//;
[6107]	331
[11298]	332	# Map from tag number to metadata field title, subfields, and repeatability
	333	my $fieldtag = (split(/ /, $fieldspecs))[0];
	334	my $fieldrepeatable = (split(/ /, $fieldspecs))[3];
	335	$fdtmapping{$fieldtag} = { 'name' => $fieldname,
	336	'subfields' => $fieldsubfields,
	337	'repeatable' => $fieldrepeatable };
[6107]	338	}
	339	elsif ($fdtfileline eq "***") {
	340	$amongstdefinitions = 1;
	341	}
	342	}
	343
	344	close(FDT_FILE);
	345
	346	return %fdtmapping;
	347	}
	348
	349
[11332]	350	sub clean_up_after_exploding
	351	{
	352	my $self = shift(@_);
	353
	354	# Delete the FDT and XRF files too
[11334]	355	&util::rm($self->{'fdt_file_path'});
	356	&util::rm($self->{'xrf_file_path'});
[11332]	357	}
	358
	359
[6107]	360	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: