Context Navigation

source: gsdl/trunk/perllib/plugins/ISISPlugin.pm@ 16935

Last change on this file since 16935 was 16392, checked in by kjdon, 16 years ago
global block pass: read_block is no more, use can_process_this_file to see whether a file is for us or not. extra arg (block_hash) to read, read_into_doc_obj, metadata_read etc
Property svn:keywords set to `Author Date Id Revision`
File size: 13.8 KB

Rev	Line
[6107]	1	###########################################################################
	2	#
[15872]	3	# ISISPlugin.pm -- A plugin for CDS/ISIS databases
[6107]	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
[7686]	9	# Copyright 1999-2004 New Zealand Digital Library Project
[6107]	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
[15872]	27	package ISISPlugin;
[6107]	28
	29
	30	use multiread;
[15872]	31	use SplitTextFile;
[6107]	32
[10254]	33	use strict;
	34	no strict 'refs'; # allow filehandles to be variables and viceversa
[6107]	35
# Establish the inheritance chain at compile time: ISISPlugin derives
# from SplitTextFile.
BEGIN {
    @ISISPlugin::ISA = qw(SplitTextFile);
}
	40
	41
	42	my $arguments =
[7686]	43	[ { 'name' => "process_exp",
[15872]	44	'desc' => "{BasePlugin.process_exp}",
[6408]	45	'type' => "regexp",
	46	'reqd' => "no",
[6107]	47	'deft' => &get_default_process_exp() },
	48	{ 'name' => "block_exp",
[15872]	49	'desc' => "{BasePlugin.block_exp}",
[6408]	50	'type' => "regexp",
[7686]	51	'reqd' => "no",
[11329]	52	'deft' => &get_default_block_exp(),
	53	'hiddengli' => "yes" },
[7686]	54	{ 'name' => "split_exp",
[15872]	55	'desc' => "{SplitTextFile.split_exp}",
[7686]	56	'type' => "regexp",
	57	'reqd' => "no",
[11295]	58	'deft' => &get_default_split_exp(),
	59	'hiddengli' => "yes" },
[7686]	60
	61	# The interesting options
	62	{ 'name' => "entry_separator",
[15872]	63	'desc' => "{ISISPlugin.entry_separator}",
[7686]	64	'type' => "string",
	65	'reqd' => "no",
	66	'deft' => "<br>" },
[6107]	67	{ 'name' => "subfield_separator",
[15872]	68	'desc' => "{ISISPlugin.subfield_separator}",
[6107]	69	'type' => "string",
	70	'reqd' => "no",
[8563]	71	'deft' => ", " }
[6408]	72	];
[6107]	73
[15872]	74	my $options = { 'name' => "ISISPlugin",
	75	'desc' => "{ISISPlugin.desc}",
[6408]	76	'abstract' => "no",
	77	'inherits' => "yes",
[8762]	78	'explodes' => "yes",
[6107]	79	'args' => $arguments };
	80
	81
	82	# This plugin processes files with the suffix ".mst"
	83	sub get_default_process_exp {
	84	return q^(?i)(\.mst)$^;
	85	}
	86
	87
	88	# This plugin blocks files with the suffix ".fdt" and ".xrf"
	89	sub get_default_block_exp {
[16392]	90	#return q^(?i)(\.fdt\|\.xrf)$^;
	91	return "";
[6107]	92	}
	93
	94
	95	# This plugin splits the input text at the "----------" lines
	96	sub get_default_split_exp {
[9998]	97	return q^\r?\n----------\r?\n^;
[6107]	98	}
	99
	100
# Constructor.
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to SplitTextFile's constructor
#   $hashArgOptLists - hash ref; this plugin's arguments and options are
#                      appended to its "ArgList" and "OptList" entries
# Returns the SplitTextFile object re-blessed into $class.
sub new
{
    my $class = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    # Register this plugin and what it accepts before the parent
    # constructor runs.
    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # When only plugin information is wanted, options are left untouched.
    unless ($self->{'info_only'}) {
        # isis plug doesn't care about encoding - it assumes ascii unless
        # the user has specified an encoding
        $self->{'input_encoding'} = "ascii" if ($self->{'input_encoding'} eq "auto");
    }

    return bless $self, $class;
}
	124
# Record the .fdt and .xrf companions of a master file as blocked.
#   $filename_full_path - path of the .mst file
#   $block_hash         - hash ref; existing companion paths are added as
#                         keys of its 'file_blocks' sub-hash with value 1
# Companion paths are worked out by check_auxiliary_files and only paths
# that exist on disk are recorded.
sub store_block_files
{
    my ($self, $filename_full_path, $block_hash) = @_;

    $self->check_auxiliary_files($filename_full_path);

    my $fdt_path = $self->{'fdt_file_path'};
    $block_hash->{'file_blocks'}->{$fdt_path} = 1 if (-e $fdt_path);

    my $xrf_path = $self->{'xrf_file_path'};
    $block_hash->{'file_blocks'}->{$xrf_path} = 1 if (-e $xrf_path);
}
	143
	144	sub check_auxiliary_files {
	145	my $self = shift (@_);
	146	my ($filename) = @_;
	147
	148	my ($database_file_path_root) = ($filename =~ /(.*)\.mst$/i);
	149	# Check the associated .fdt and .xrf files exist
	150	$self->{'fdt_file_path'} = $database_file_path_root . ".FDT";
	151	if (!-e $self->{'fdt_file_path'}) {
	152	$self->{'fdt_file_path'} = $database_file_path_root . ".fdt";
	153	}
	154	$self->{'xrf_file_path'} = $database_file_path_root . ".XRF";
	155	if (!-e $self->{'xrf_file_path'}) {
	156	$self->{'xrf_file_path'} = $database_file_path_root . ".xrf";
	157	}
	158	}
	159
	160
# Export a CDS/ISIS master file as text and load its field definitions.
#   $filename - path of the .mst file
#   $encoding - encoding handed to the multiread reader and to the FDT parser
#   $language - accepted but not used here
#   $textref  - scalar ref that receives the exported record text
# On success $self->{'fdt_mapping'} holds the parsed field definition
# table.  If the .fdt or .xrf companion is missing, or the IsisGdl export
# cannot be started, an error is reported and the sub returns early
# without touching $$textref.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;
    my $outhandle = $self->{'outhandle'};

    # Path reported in GLI error messages: whatever follows "import" and
    # one more character in the full path.
    my $mst_file_path_relative = $filename;
    $mst_file_path_relative =~ s/^.+import.(.*?)$/$1/;

    # Check the associated .fdt and .xrf files exist
    $self->check_auxiliary_files($filename);

    if (!-e $self->{'fdt_file_path'}) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS FDT file $self->{'fdt_file_path'}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS FDT file " . $self->{'fdt_file_path'} . ".\n";
        return;
    }
    if (!-e $self->{'xrf_file_path'}) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS XRF file $self->{'xrf_file_path'}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS XRF file " . $self->{'xrf_file_path'} . ".\n";
        return;
    }

    # The text to split is exported from the database by the IsisGdl program.
    # A failed pipe open is reported instead of reading from a dead handle.
    if (!open(FILE, "IsisGdl \"$filename\" |")) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not run IsisGdl on $filename'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not run IsisGdl on $filename: $!.\n";
        return;
    }

    # The reader is given the handle by its package-qualified name.
    my $reader = multiread->new();
    $reader->set_handle('ISISPlugin::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);

    close(FILE);

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my %fdt_mapping = &parse_field_definition_table($self->{'fdt_file_path'}, $encoding);
    $self->{'fdt_mapping'} = \%fdt_mapping;

    # Remove the line at the start, and any blank lines, so the data is split and processed properly
    $$textref =~ s/^----------\n//;
    $$textref =~ s/\n\n/\n/g;
}
	203
	204
	205	sub process
	206	{
	207	my $self = shift (@_);
[6332]	208	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[6107]	209	my $outhandle = $self->{'outhandle'};
	210
[11298]	211	my $section = $doc_obj->get_top_section();
	212	my $fdt_mapping = $self->{'fdt_mapping'};
[6107]	213	my $subfield_separator = $self->{'subfield_separator'};
	214	my $entry_separator = $self->{'entry_separator'};
[11466]	215	my $isis_record_html_metadata_value = "<table cellpadding=\"4\" cellspacing=\"0\">";
[6107]	216
	217	# Process each line of the ISIS record, one at a time
[10254]	218	foreach my $line (split(/\n/, $$textref)) {
[11430]	219	$line =~ s/(\s*)$//; # Remove any nasty whitespace (very important for Windows)
[8646]	220	$line =~ /^tag=(.*) data=(.+)$/;
[11298]	221	my $tag = $1;
	222	my $tag_data = $2;
	223	# print STDERR "\nTag: $tag, Data: $tag_data\n";
[6107]	224
[11298]	225	# Convert the tag number into a name, and remove any invalid characters
	226	my $raw_metadata_name = $fdt_mapping->{$tag}{'name'} \|\| "";
[11300]	227	$raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
[11298]	228	next if ($raw_metadata_name eq "");
	229
[6107]	230	# Metadata field names: title case, then remove spaces
[11298]	231	my $metadata_name = "";
	232	foreach my $word (split(/\s+/, $raw_metadata_name)) {
[6107]	233	substr($word, 0, 1) =~ tr/a-z/A-Z/;
[11298]	234	$metadata_name .= $word;
[6107]	235	}
	236
[11298]	237	my $all_metadata_name = $metadata_name . "^all";
	238	my $all_metadata_value = "";
[6123]	239
[11298]	240	# Handle repeatable fields
	241	if ($fdt_mapping->{$tag}{'repeatable'}) {
	242	# Multiple values are separated using the '%' character
	243	foreach my $raw_metadata_value (split(/%/, $tag_data)) {
	244	my $metadata_value = "";
[6107]	245
[11298]	246	# Handle subfields
	247	while ($raw_metadata_value ne "") {
	248	# If there is a subfield specifier, parse it off
	249	my $sub_metadata_name = $metadata_name;
[11299]	250	if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
[11298]	251	$sub_metadata_name .= "^$1";
	252	}
	253
	254	# Parse the value off and add it as metadata
	255	$raw_metadata_value =~ s/^([^\^]*)//;
[12705]	256	my $sub_metadata_value = &escape_metadata_value($1);
[11298]	257
	258	# print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
	259	if ($sub_metadata_name ne $metadata_name) {
	260	$doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
	261	}
	262
[13157]	263	# If this tag has subfields and this is the first, use the value for the CDS/ISIS ^* field
	264	if ($fdt_mapping->{$tag}{'subfields'} ne "" && $metadata_value eq "") {
[12703]	265	$doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
	266	}
	267
[11298]	268	$metadata_value .= $subfield_separator unless ($metadata_value eq "");
	269	$metadata_value .= $sub_metadata_value;
	270	}
	271
	272	# Add the metadata value
	273	# print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
	274	$doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);
	275
	276	$all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
	277	$all_metadata_value .= $metadata_value;
	278	}
	279	}
	280
	281	# Handle non-repeatable fields
	282	else {
	283	my $raw_metadata_value = $tag_data;
	284	my $metadata_value = "";
	285
	286	# Handle subfields
	287	while ($raw_metadata_value ne "") {
[6107]	288	# If there is a subfield specifier, parse it off
[11298]	289	my $sub_metadata_name = $metadata_name;
[11353]	290	if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
[11379]	291	$sub_metadata_name .= "^$1";
[6107]	292	}
	293
[11298]	294	# Parse the value off and add it as metadata
	295	$raw_metadata_value =~ s/^([^\^]*)//;
	296	my $sub_metadata_value = $1;
	297
	298	# Deal with the case when multiple values are specified using <...>
[11545]	299	if ($sub_metadata_value =~ /\<(.+)\>/) {
[11298]	300	my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
	301	my $tmp_sub_metadata_value = $sub_metadata_value;
[11545]	302	while ($tmp_sub_metadata_value =~ s/\<(.+?)\>//) {
[11298]	303	my $sub_sub_metadata_value = $1;
	304	$doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
[6107]	305	}
	306	}
[11546]	307	# Deal with the legacy case when multiple values are specified using /.../
[11545]	308	elsif ($sub_metadata_value =~ /\/(.+)\//) {
	309	my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
	310	my $tmp_sub_metadata_value = $sub_metadata_value;
	311	while ($tmp_sub_metadata_value =~ s/\/(.+?)\///) {
	312	my $sub_sub_metadata_value = $1;
	313	$doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
	314	}
	315	}
[6107]	316
[12705]	317	# Escape the metadata value so it appears correctly in the final collection
	318	$sub_metadata_value = &escape_metadata_value($sub_metadata_value);
[9998]	319
[11298]	320	# print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
	321	if ($sub_metadata_name ne $metadata_name) {
	322	$doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
[6107]	323	}
	324
[13157]	325	# If this tag has subfields and this is the first, use the value for the CDS/ISIS ^* field
	326	if ($fdt_mapping->{$tag}{'subfields'} ne "" && $metadata_value eq "") {
[12703]	327	$doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
	328	}
	329
[11298]	330	$metadata_value .= $subfield_separator unless ($metadata_value eq "");
	331	$metadata_value .= $sub_metadata_value;
[6107]	332	}
	333
[11298]	334	# Add the metadata value
	335	# print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
	336	$doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);
	337
	338	$all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
	339	$all_metadata_value .= $metadata_value;
[6107]	340	}
	341
[11298]	342	# Add the "^all" metadata value
	343	# print STDERR "All metadata name: $all_metadata_name, value: $all_metadata_value\n";
	344	$doc_obj->add_utf8_metadata($section, $all_metadata_name, $all_metadata_value);
[11465]	345
	346	$isis_record_html_metadata_value .= "<tr><td valign=top><nobr><b>" . $fdt_mapping->{$tag}{'name'} . "</b></nobr></td><td valign=top>" . $all_metadata_value . "</td></tr>";
[6107]	347	}
[8563]	348
[11467]	349	# Add a reasonably formatted HTML table view of the record as the document text
[11465]	350	$isis_record_html_metadata_value .= "</table>";
[11467]	351	$doc_obj->add_utf8_text($section, $isis_record_html_metadata_value);
[11465]	352
[11467]	353	# Add the full raw record as metadata
[12705]	354	my $isis_raw_record_metadata_value = &escape_metadata_value($$textref);
	355	$doc_obj->add_utf8_metadata($section, "ISISRawRecord", $isis_raw_record_metadata_value);
[6107]	356
[8563]	357	# Add FileFormat metadata
[11298]	358	$doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");
[7686]	359
[11298]	360	# Record was processed successfully
[6107]	361	return 1;
	362	}
	363
	364
	365	sub parse_field_definition_table
	366	{
[7686]	367	my $fdtfilename = shift(@_);
[11262]	368	my $encoding = shift(@_);
[6107]	369
[7686]	370	my %fdtmapping = ();
[6107]	371
	372	open(FDT_FILE, "<$fdtfilename") \|\| die "Error: Could not open file $fdtfilename.\n";
	373
[11262]	374	my $fdtfiletext = "";
	375	my $reader = new multiread();
[15872]	376	$reader->set_handle('ISISPlugin::FDT_FILE');
[11262]	377	$reader->set_encoding($encoding);
	378	$reader->read_file($fdtfiletext);
	379
[7686]	380	my $amongstdefinitions = 0;
[11262]	381	foreach my $fdtfileline (split(/\n/, $$fdtfiletext)) {
[6107]	382	$fdtfileline =~ s/(\s*)$//; # Remove any nasty spaces at the end of the lines
	383
	384	if ($amongstdefinitions) {
[13298]	385	my $fieldname = &unicode::substr($fdtfileline, 0, 30);
	386	my $fieldsubfields = &unicode::substr($fdtfileline, 30, 20);
	387	my $fieldspecs = &unicode::substr($fdtfileline, 50, 50);
[6107]	388
	389	# Remove extra spaces
[11298]	390	$fieldname =~ s/(\s*)$//;
[6107]	391	$fieldsubfields =~ s/(\s*)$//;
[11298]	392	$fieldspecs =~ s/(\s*)$//;
[6107]	393
[11298]	394	# Map from tag number to metadata field title, subfields, and repeatability
	395	my $fieldtag = (split(/ /, $fieldspecs))[0];
	396	my $fieldrepeatable = (split(/ /, $fieldspecs))[3];
	397	$fdtmapping{$fieldtag} = { 'name' => $fieldname,
	398	'subfields' => $fieldsubfields,
	399	'repeatable' => $fieldrepeatable };
[6107]	400	}
	401	elsif ($fdtfileline eq "***") {
	402	$amongstdefinitions = 1;
	403	}
	404	}
	405
	406	close(FDT_FILE);
	407
	408	return %fdtmapping;
	409	}
	410
	411
# Escape a metadata value so it appears correctly in the final collection.
#   $value - the raw string
# Returns a copy with "<" replaced by "&lt;", ">" replaced by "&gt;" and
# every backslash doubled.  Ampersands are left untouched.
sub escape_metadata_value
{
    my $value = shift(@_);
    # Angle brackets become HTML entities; replacing them with themselves
    # would leave the value unescaped.
    $value =~ s/\</&lt;/g;
    $value =~ s/\>/&gt;/g;
    $value =~ s/\\/\\\\/g;
    return $value;
}
	420
	421
# Remove the companion files recorded in $self->{'fdt_file_path'} and
# $self->{'xrf_file_path'} by passing each, in that order, to util::rm.
sub clean_up_after_exploding
{
    my ($self) = @_;

    # Delete the FDT and XRF files too
    my ($fdt_path, $xrf_path) = @{$self}{'fdt_file_path', 'xrf_file_path'};
    &util::rm($fdt_path);
    &util::rm($xrf_path);
}
	430
	431
[6107]	432	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: