Context Navigation

source: main/trunk/greenstone2/perllib/plugins/ISISPlugin.pm@ 31480

Last change on this file since 31480 was 31480, checked in by kjdon, 7 years ago
util::block_file moved to EncodingUtil::block_raw_filename
Property svn:keywords set to `Author Date Id Revision`
File size: 15.0 KB

Rev	Line
[6107]	1	###########################################################################
	2	#
[15872]	3	# ISISPlugin.pm -- A plugin for CDS/ISIS databases
[6107]	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
[7686]	9	# Copyright 1999-2004 New Zealand Digital Library Project
[6107]	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
[15872]	27	package ISISPlugin;
[6107]	28
[27502]	29	use Encode;
[6107]	30
	31	use multiread;
[15872]	32	use SplitTextFile;
[24547]	33	use MetadataRead;
[28563]	34	use FileUtils;
[6107]	35
[10254]	36	use strict;
	37	no strict 'refs'; # allow filehandles to be variables and viceversa
[6107]	38
# ISISPlugin inherits from both MetadataRead and SplitTextFile.
# When both parents define a method with the same signature, the one from the
# class listed first in @ISA (MetadataRead) takes precedence.
BEGIN {
    @ISISPlugin::ISA = qw(MetadataRead SplitTextFile);
}
	44
	45
	46	my $arguments =
[7686]	47	[ { 'name' => "process_exp",
[15872]	48	'desc' => "{BasePlugin.process_exp}",
[6408]	49	'type' => "regexp",
	50	'reqd' => "no",
[6107]	51	'deft' => &get_default_process_exp() },
	52	{ 'name' => "block_exp",
[15872]	53	'desc' => "{BasePlugin.block_exp}",
[6408]	54	'type' => "regexp",
[7686]	55	'reqd' => "no",
[11329]	56	'deft' => &get_default_block_exp(),
	57	'hiddengli' => "yes" },
[7686]	58	{ 'name' => "split_exp",
[15872]	59	'desc' => "{SplitTextFile.split_exp}",
[7686]	60	'type' => "regexp",
	61	'reqd' => "no",
[11295]	62	'deft' => &get_default_split_exp(),
	63	'hiddengli' => "yes" },
[7686]	64
	65	# The interesting options
	66	{ 'name' => "entry_separator",
[15872]	67	'desc' => "{ISISPlugin.entry_separator}",
[7686]	68	'type' => "string",
	69	'reqd' => "no",
	70	'deft' => "<br>" },
[6107]	71	{ 'name' => "subfield_separator",
[15872]	72	'desc' => "{ISISPlugin.subfield_separator}",
[6107]	73	'type' => "string",
	74	'reqd' => "no",
[8563]	75	'deft' => ", " }
[6408]	76	];
[6107]	77
# Plugin-level option record; new() pushes it onto the caller's "OptList".
# 'args' points at the argument list declared above.
my $options = {
    'name'     => 'ISISPlugin',
    'desc'     => '{ISISPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
	84
	85
	86	# This plugin processes files with the suffix ".mst"
	87	sub get_default_process_exp {
	88	return q^(?i)(\.mst)$^;
	89	}
	90
	91
	92	# This plugin blocks files with the suffix ".fdt" and ".xrf"
	93	sub get_default_block_exp {
[17479]	94	return q^(?i)(\.fdt\|\.xrf)$^;
	95	#return "";
[6107]	96	}
	97
	98
	99	# This plugin splits the input text at the "----------" lines
	100	sub get_default_split_exp {
[9998]	101	return q^\r?\n----------\r?\n^;
[6107]	102	}
	103
	104
# Constructor.
#
# Arguments (after the class name): the plugin list, the input arguments and
# the hash holding the "ArgList" / "OptList" arrays.  This class name is
# appended to the plugin list and this plugin's arguments and options are
# appended to the two lists before the SplitTextFile constructor is called.
#
# Returns the object built by SplitTextFile, re-blessed into $class.
sub new {
    my ($class, @constructor_args) = @_;
    my ($pluginlist, $inputargs, $hashArgOptLists) = @constructor_args;

    push @{$pluginlist}, $class;
    push @{ $hashArgOptLists->{"ArgList"} }, @{$arguments};
    push @{ $hashArgOptLists->{"OptList"} }, $options;

    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # In 'info_only' mode the options are left untouched.  Otherwise this
    # plugin does not care about encoding: it assumes ascii unless the user
    # has specified one.
    my $configure_options = !$self->{'info_only'};
    if ($configure_options && $self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'} = "ascii";
    }

    return bless $self, $class;
}
	128
# Blocks the .fdt and .xrf files that correspond to the given .mst file.
#
# History: blocking per file used to be a pain on Windows (it would block
# xxx.FDT, but if the actual file was xxx.fdt the build complained that no
# plugin could process it), so the plugin was put back to using block_exp.
# Blocking is now case-insensitive on Windows, so this approach works.
# However, it is still a pain for GLI, which will not know which plugin
# processes the .fdt and .xrf files; adding them to the process expression
# causes more problems.
# NOTE(review): the "_tmp" suffix presumably keeps this from being picked up
# as the real store_block_files hook -- confirm before renaming.
#
# Arguments: the full path of the .mst file, and the block hash that is
# handed on to block_raw_filename().
sub store_block_files_tmp {
    my ($self, $filename_full_path, $block_hash) = @_;

    print STDERR "in store block files\n";
    $self->check_auxiliary_files($filename_full_path);

    # Same treatment for both auxiliary files, FDT first.
    foreach my $path_key ('fdt_file_path', 'xrf_file_path') {
        next unless (-e $self->{$path_key});

        print STDERR "$self->{$path_key}\n";
        my $auxiliary_file = $self->{$path_key};
        $self->block_raw_filename($block_hash, $auxiliary_file);
    }
}
	155
	156	sub check_auxiliary_files {
	157	my $self = shift (@_);
	158	my ($filename) = @_;
	159
	160	my ($database_file_path_root) = ($filename =~ /(.*)\.mst$/i);
	161	# Check the associated .fdt and .xrf files exist
	162	$self->{'fdt_file_path'} = $database_file_path_root . ".FDT";
	163	if (!-e $self->{'fdt_file_path'}) {
	164	$self->{'fdt_file_path'} = $database_file_path_root . ".fdt";
	165	}
	166	$self->{'xrf_file_path'} = $database_file_path_root . ".XRF";
	167	if (!-e $self->{'xrf_file_path'}) {
	168	$self->{'xrf_file_path'} = $database_file_path_root . ".xrf";
	169	}
	170	}
	171
	172
# Reads a CDS/ISIS master file by running the external IsisGdl program on it
# and stores the exported text in $$textref.
#
# Arguments:
#   $filename  - path of the .mst file
#   $encoding  - encoding handed to the multiread reader (also used for the
#                .fdt file)
#   $language  - not used here
#   $textref   - reference to the scalar that receives the exported text
#
# Side effects: sets $self->{'fdt_file_path'} and $self->{'xrf_file_path'}
# (via check_auxiliary_files) and $self->{'fdt_mapping'}.
#
# If the .fdt or .xrf file is missing, or IsisGdl cannot be started, an
# error is printed to the plugin's outhandle (and a <ProcessingError> line
# to STDERR when running under GLI) and the sub returns without touching
# $$textref.
sub read_file {
    my $self = shift(@_);
    my ($filename, $encoding, $language, $textref) = @_;
    my $outhandle = $self->{'outhandle'};

    # Path of the .mst file relative to the import directory; only used in
    # the GLI error messages below.
    my $mst_file_path_relative = $filename;
    $mst_file_path_relative =~ s/^.+import.(.*?)$/$1/;

    # Check the associated .fdt and .xrf files exist
    $self->check_auxiliary_files($filename);
    foreach my $auxiliary (['FDT', 'fdt_file_path'], ['XRF', 'xrf_file_path']) {
        my ($label, $path_key) = @{$auxiliary};
        next if (-e $self->{$path_key});

        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS $label file $self->{$path_key}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS $label file " . $self->{$path_key} . ".\n";
        return;
    }

    # The text to split is exported from the database by the IsisGdl program.
    # The handle stays a package bareword because multiread is given the
    # handle by name below.
    if (!open(FILE, "IsisGdl \"$filename\" |")) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not run IsisGdl on $filename'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not run IsisGdl on $filename: $!\n";
        return;
    }

    my $reader = multiread->new();
    $reader->set_handle('ISISPlugin::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);

    # At this point $$textref is a binary byte string
    # => turn it into a Unicode aware string, so full
    # Unicode aware pattern matching can be used.
    # For instance: 's/\x{0101}//g' or '[[:upper:]]'
    $$textref = decode("utf8", $$textref);
    close(FILE);

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my %fdt_mapping = parse_field_definition_table($self->{'fdt_file_path'}, $encoding);
    $self->{'fdt_mapping'} = \%fdt_mapping;

    # Remove the line at the start, and any blank lines, so the data is split and processed properly
    $$textref =~ s/^----------\r?\n//;
    $$textref =~ s/(\r|\n)\n/\n/g;
}
	222
	223
	224	sub process
	225	{
	226	my $self = shift (@_);
[6332]	227	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[6107]	228	my $outhandle = $self->{'outhandle'};
	229
[20778]	230	# store the auxiliary files so we know which ones were used
	231	# (mst file becomes the source file)
	232	$doc_obj->associate_source_file($self->{'fdt_file_path'});
	233	$doc_obj->associate_source_file($self->{'xrf_file_path'});
	234
[11298]	235	my $section = $doc_obj->get_top_section();
	236	my $fdt_mapping = $self->{'fdt_mapping'};
[6107]	237	my $subfield_separator = $self->{'subfield_separator'};
	238	my $entry_separator = $self->{'entry_separator'};
[11466]	239	my $isis_record_html_metadata_value = "<table cellpadding=\"4\" cellspacing=\"0\">";
[6107]	240
	241	# Process each line of the ISIS record, one at a time
[10254]	242	foreach my $line (split(/\n/, $$textref)) {
[11430]	243	$line =~ s/(\s*)$//; # Remove any nasty whitespace (very important for Windows)
[8646]	244	$line =~ /^tag=(.*) data=(.+)$/;
[11298]	245	my $tag = $1;
	246	my $tag_data = $2;
	247	# print STDERR "\nTag: $tag, Data: $tag_data\n";
[6107]	248
[11298]	249	# Convert the tag number into a name, and remove any invalid characters
	250	my $raw_metadata_name = $fdt_mapping->{$tag}{'name'} \|\| "";
[11300]	251	$raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
[11298]	252	next if ($raw_metadata_name eq "");
	253
[6107]	254	# Metadata field names: title case, then remove spaces
[11298]	255	my $metadata_name = "";
	256	foreach my $word (split(/\s+/, $raw_metadata_name)) {
[6107]	257	substr($word, 0, 1) =~ tr/a-z/A-Z/;
[11298]	258	$metadata_name .= $word;
[6107]	259	}
	260
[11298]	261	my $all_metadata_name = $metadata_name . "^all";
	262	my $all_metadata_value = "";
[6123]	263
[11298]	264	# Handle repeatable fields
	265	if ($fdt_mapping->{$tag}{'repeatable'}) {
	266	# Multiple values are separated using the '%' character
	267	foreach my $raw_metadata_value (split(/%/, $tag_data)) {
	268	my $metadata_value = "";
[6107]	269
[11298]	270	# Handle subfields
	271	while ($raw_metadata_value ne "") {
	272	# If there is a subfield specifier, parse it off
	273	my $sub_metadata_name = $metadata_name;
[11299]	274	if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
[11298]	275	$sub_metadata_name .= "^$1";
	276	}
	277
	278	# Parse the value off and add it as metadata
	279	$raw_metadata_value =~ s/^([^\^]*)//;
[12705]	280	my $sub_metadata_value = &escape_metadata_value($1);
[11298]	281
	282	# print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
	283	if ($sub_metadata_name ne $metadata_name) {
	284	$doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
	285	}
	286
[13157]	287	# If this tag has subfields and this is the first, use the value for the CDS/ISIS ^* field
	288	if ($fdt_mapping->{$tag}{'subfields'} ne "" && $metadata_value eq "") {
[12703]	289	$doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
	290	}
	291
[11298]	292	$metadata_value .= $subfield_separator unless ($metadata_value eq "");
	293	$metadata_value .= $sub_metadata_value;
	294	}
	295
	296	# Add the metadata value
	297	# print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
	298	$doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);
	299
	300	$all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
	301	$all_metadata_value .= $metadata_value;
	302	}
	303	}
	304
	305	# Handle non-repeatable fields
	306	else {
	307	my $raw_metadata_value = $tag_data;
	308	my $metadata_value = "";
	309
	310	# Handle subfields
	311	while ($raw_metadata_value ne "") {
[6107]	312	# If there is a subfield specifier, parse it off
[11298]	313	my $sub_metadata_name = $metadata_name;
[11353]	314	if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
[11379]	315	$sub_metadata_name .= "^$1";
[6107]	316	}
	317
[11298]	318	# Parse the value off and add it as metadata
	319	$raw_metadata_value =~ s/^([^\^]*)//;
	320	my $sub_metadata_value = $1;
	321
	322	# Deal with the case when multiple values are specified using <...>
[11545]	323	if ($sub_metadata_value =~ /\<(.+)\>/) {
[11298]	324	my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
	325	my $tmp_sub_metadata_value = $sub_metadata_value;
[11545]	326	while ($tmp_sub_metadata_value =~ s/\<(.+?)\>//) {
[11298]	327	my $sub_sub_metadata_value = $1;
	328	$doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
[6107]	329	}
	330	}
[11546]	331	# Deal with the legacy case when multiple values are specified using /.../
[11545]	332	elsif ($sub_metadata_value =~ /\/(.+)\//) {
	333	my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
	334	my $tmp_sub_metadata_value = $sub_metadata_value;
	335	while ($tmp_sub_metadata_value =~ s/\/(.+?)\///) {
	336	my $sub_sub_metadata_value = $1;
	337	$doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
	338	}
	339	}
[6107]	340
[12705]	341	# Escape the metadata value so it appears correctly in the final collection
	342	$sub_metadata_value = &escape_metadata_value($sub_metadata_value);
[9998]	343
[11298]	344	# print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
	345	if ($sub_metadata_name ne $metadata_name) {
	346	$doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
[6107]	347	}
	348
[13157]	349	# If this tag has subfields and this is the first, use the value for the CDS/ISIS ^* field
	350	if ($fdt_mapping->{$tag}{'subfields'} ne "" && $metadata_value eq "") {
[12703]	351	$doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
	352	}
	353
[11298]	354	$metadata_value .= $subfield_separator unless ($metadata_value eq "");
	355	$metadata_value .= $sub_metadata_value;
[6107]	356	}
	357
[11298]	358	# Add the metadata value
	359	# print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
	360	$doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);
	361
	362	$all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
	363	$all_metadata_value .= $metadata_value;
[6107]	364	}
	365
[11298]	366	# Add the "^all" metadata value
	367	# print STDERR "All metadata name: $all_metadata_name, value: $all_metadata_value\n";
	368	$doc_obj->add_utf8_metadata($section, $all_metadata_name, $all_metadata_value);
[11465]	369
	370	$isis_record_html_metadata_value .= "<tr><td valign=top><nobr><b>" . $fdt_mapping->{$tag}{'name'} . "</b></nobr></td><td valign=top>" . $all_metadata_value . "</td></tr>";
[6107]	371	}
[8563]	372
[11467]	373	# Add a reasonably formatted HTML table view of the record as the document text
[11465]	374	$isis_record_html_metadata_value .= "</table>";
[11467]	375	$doc_obj->add_utf8_text($section, $isis_record_html_metadata_value);
[11465]	376
[11467]	377	# Add the full raw record as metadata
[12705]	378	my $isis_raw_record_metadata_value = &escape_metadata_value($$textref);
	379	$doc_obj->add_utf8_metadata($section, "ISISRawRecord", $isis_raw_record_metadata_value);
[6107]	380
[8563]	381	# Add FileFormat metadata
[11298]	382	$doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");
[7686]	383
[11298]	384	# Record was processed successfully
[6107]	385	return 1;
	386	}
	387
	388
	389	sub parse_field_definition_table
	390	{
[7686]	391	my $fdtfilename = shift(@_);
[11262]	392	my $encoding = shift(@_);
[6107]	393
[7686]	394	my %fdtmapping = ();
[6107]	395
	396	open(FDT_FILE, "<$fdtfilename") \|\| die "Error: Could not open file $fdtfilename.\n";
	397
[11262]	398	my $fdtfiletext = "";
	399	my $reader = new multiread();
[15872]	400	$reader->set_handle('ISISPlugin::FDT_FILE');
[11262]	401	$reader->set_encoding($encoding);
	402	$reader->read_file($fdtfiletext);
	403
[7686]	404	my $amongstdefinitions = 0;
[11262]	405	foreach my $fdtfileline (split(/\n/, $$fdtfiletext)) {
[6107]	406	$fdtfileline =~ s/(\s*)$//; # Remove any nasty spaces at the end of the lines
	407
	408	if ($amongstdefinitions) {
[13298]	409	my $fieldname = &unicode::substr($fdtfileline, 0, 30);
	410	my $fieldsubfields = &unicode::substr($fdtfileline, 30, 20);
	411	my $fieldspecs = &unicode::substr($fdtfileline, 50, 50);
[6107]	412
	413	# Remove extra spaces
[11298]	414	$fieldname =~ s/(\s*)$//;
[6107]	415	$fieldsubfields =~ s/(\s*)$//;
[11298]	416	$fieldspecs =~ s/(\s*)$//;
[6107]	417
[11298]	418	# Map from tag number to metadata field title, subfields, and repeatability
	419	my $fieldtag = (split(/ /, $fieldspecs))[0];
	420	my $fieldrepeatable = (split(/ /, $fieldspecs))[3];
	421	$fdtmapping{$fieldtag} = { 'name' => $fieldname,
	422	'subfields' => $fieldsubfields,
	423	'repeatable' => $fieldrepeatable };
[6107]	424	}
	425	elsif ($fdtfileline eq "***") {
	426	$amongstdefinitions = 1;
	427	}
	428	}
	429
	430	close(FDT_FILE);
	431
	432	return %fdtmapping;
	433	}
	434
	435
# Escapes a metadata value so it appears correctly in the final collection:
# "<" becomes "&lt;", ">" becomes "&gt;" and every backslash is doubled.
# Returns the escaped copy; the caller's value is not modified.
sub escape_metadata_value {
    my $value = shift(@_);
    # Replace the angle brackets with their HTML entities; substituting a
    # bracket with itself would leave the value unescaped.
    $value =~ s/\</&lt;/g;
    $value =~ s/\>/&gt;/g;
    $value =~ s/\\/\\\\/g;
    return $value;
}
	444
	445
# Called after the database has been exploded: removes the .fdt and .xrf
# files recorded in $self->{'fdt_file_path'} and $self->{'xrf_file_path'}.
# Returns whatever the last FileUtils::removeFiles() call returns.
sub clean_up_after_exploding {
    my ($self) = @_;

    # Delete the FDT and XRF files too
    my ($fdt_path, $xrf_path) = @{$self}{ 'fdt_file_path', 'xrf_file_path' };
    FileUtils::removeFiles($fdt_path);
    return FileUtils::removeFiles($xrf_path);
}
	454
	455
[6107]	456	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: