Context Navigation

source: main/trunk/greenstone2/perllib/plugins/ISISPlugin.pm@ 27502

Last change on this file since 27502 was 27502, checked in by kjdon, 11 years ago
trying to fix double encoding issue for isis files. not sure that I have it yet
Property svn:keywords set to `Author Date Id Revision`
File size: 14.9 KB

Rev	Line
[6107]	1	###########################################################################
	2	#
[15872]	3	# ISISPlugin.pm -- A plugin for CDS/ISIS databases
[6107]	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
[7686]	9	# Copyright 1999-2004 New Zealand Digital Library Project
[6107]	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
[15872]	27	package ISISPlugin;
[6107]	28
[27502]	29	use Encode;
[6107]	30
	31	use multiread;
[15872]	32	use SplitTextFile;
[24547]	33	use MetadataRead;
[6107]	34
[10254]	35	use strict;
	36	no strict 'refs'; # allow filehandles to be variables and viceversa
[6107]	37
# ISISPlugin inherits from both MetadataRead and SplitTextFile.
# Where both parents provide a method with the same signature, the parent
# listed first in @ISA wins.
BEGIN {
    @ISISPlugin::ISA = qw(MetadataRead SplitTextFile);
}
	43
	44
	45	my $arguments =
[7686]	46	[ { 'name' => "process_exp",
[15872]	47	'desc' => "{BasePlugin.process_exp}",
[6408]	48	'type' => "regexp",
	49	'reqd' => "no",
[6107]	50	'deft' => &get_default_process_exp() },
	51	{ 'name' => "block_exp",
[15872]	52	'desc' => "{BasePlugin.block_exp}",
[6408]	53	'type' => "regexp",
[7686]	54	'reqd' => "no",
[11329]	55	'deft' => &get_default_block_exp(),
	56	'hiddengli' => "yes" },
[7686]	57	{ 'name' => "split_exp",
[15872]	58	'desc' => "{SplitTextFile.split_exp}",
[7686]	59	'type' => "regexp",
	60	'reqd' => "no",
[11295]	61	'deft' => &get_default_split_exp(),
	62	'hiddengli' => "yes" },
[7686]	63
	64	# The interesting options
	65	{ 'name' => "entry_separator",
[15872]	66	'desc' => "{ISISPlugin.entry_separator}",
[7686]	67	'type' => "string",
	68	'reqd' => "no",
	69	'deft' => "<br>" },
[6107]	70	{ 'name' => "subfield_separator",
[15872]	71	'desc' => "{ISISPlugin.subfield_separator}",
[6107]	72	'type' => "string",
	73	'reqd' => "no",
[8563]	74	'deft' => ", " }
[6408]	75	];
[6107]	76
# Plugin description record. It is appended to the caller's "OptList" in
# new() and carries the argument definitions above under the 'args' key.
my $options = {
    'name'     => 'ISISPlugin',
    'desc'     => '{ISISPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'explodes' => 'yes',
    'args'     => $arguments,
};
	83
	84
	85	# This plugin processes files with the suffix ".mst"
	86	sub get_default_process_exp {
	87	return q^(?i)(\.mst)$^;
	88	}
	89
	90
	91	# This plugin blocks files with the suffix ".fdt" and ".xrf"
	92	sub get_default_block_exp {
[17479]	93	return q^(?i)(\.fdt\|\.xrf)$^;
	94	#return "";
[6107]	95	}
	96
	97
	98	# This plugin splits the input text at the "----------" lines
	99	sub get_default_split_exp {
[9998]	100	return q^\r?\n----------\r?\n^;
[6107]	101	}
	102
	103
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the SplitTextFile constructor
#   $hashArgOptLists - hash ref; this plugin's argument definitions are appended
#                      to its "ArgList" entry and its option record to "OptList"
#
# Returns the object created by SplitTextFile, re-blessed into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Explicit class-method call: the indirect-object form
    # "new SplitTextFile(...)" is ambiguous to the parser, particularly inside
    # a package that defines its own new().
    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);
    bless $self, $class;

    # When only plugin information is wanted, don't worry about any options etc
    return $self if ($self->{'info_only'});

    # isis plug doesn't care about encoding - it assumes ascii unless the user
    # has specified an encoding
    if (defined $self->{'input_encoding'} && $self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'} = "ascii";
    }

    return $self;
}
	127
# Block the .fdt and .xrf files that belong to the given .mst file.
#
# History, from earlier revisions of this file:
# - This was a pain on Windows: it blocked xxx.FDT, but if the actual file
#   was xx.fdt there was a complaint that no plugin could process it, so the
#   plugin went back to using block_exp.
# - It works now, as case insensitive blocking is done on Windows. However it
#   is a pain for GLI, which will not know what plugin processes the fdt and
#   xrf files; adding them to the process expression causes more problems.
#
# Arguments: the full path of the .mst file, and the block hash that is
# handed on to util::block_filename.
sub store_block_files_tmp {
    my ($self, $filename_full_path, $block_hash) = @_;

    print STDERR "in store block files\n";

    # Work out where the auxiliary files should be
    $self->check_auxiliary_files($filename_full_path);

    # Block each auxiliary file that is actually present: FDT first, then XRF
    foreach my $path_key ('fdt_file_path', 'xrf_file_path') {
        my $auxiliary_file = $self->{$path_key};
        next unless (-e $auxiliary_file);

        print STDERR "$auxiliary_file\n";
        util::block_filename($block_hash, $auxiliary_file);
    }
}
	154
	155	sub check_auxiliary_files {
	156	my $self = shift (@_);
	157	my ($filename) = @_;
	158
	159	my ($database_file_path_root) = ($filename =~ /(.*)\.mst$/i);
	160	# Check the associated .fdt and .xrf files exist
	161	$self->{'fdt_file_path'} = $database_file_path_root . ".FDT";
	162	if (!-e $self->{'fdt_file_path'}) {
	163	$self->{'fdt_file_path'} = $database_file_path_root . ".fdt";
	164	}
	165	$self->{'xrf_file_path'} = $database_file_path_root . ".XRF";
	166	if (!-e $self->{'xrf_file_path'}) {
	167	$self->{'xrf_file_path'} = $database_file_path_root . ".xrf";
	168	}
	169	}
	170
	171
# Export the records of an ISIS database as text, ready for splitting.
#
# Arguments:
#   $filename - path of the .mst file
#   $encoding - encoding handed to the multiread reader (and to the FDT parser)
#   $language - not used here
#   $textref  - scalar ref that receives the exported text
#
# Side effects: sets $self->{'fdt_file_path'} and $self->{'xrf_file_path'}
# (via check_auxiliary_files) and $self->{'fdt_mapping'}.
#
# Returns nothing. If the FDT or XRF file is missing, or the IsisGdl command
# cannot be started, an error is printed and the sub returns early.
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;
    my $outhandle = $self->{'outhandle'};

    # File name relative to the import directory, used in GLI error messages
    my $mst_file_path_relative = $filename;
    $mst_file_path_relative =~ s/^.+import.(.*?)$/$1/;

    # Check the associated .fdt and .xrf files exist
    $self->check_auxiliary_files($filename);

    if (!-e $self->{'fdt_file_path'}) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS FDT file $self->{'fdt_file_path'}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS FDT file " . $self->{'fdt_file_path'} . ".\n";
        return;
    }
    if (!-e $self->{'xrf_file_path'}) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS XRF file $self->{'xrf_file_path'}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS XRF file " . $self->{'xrf_file_path'} . ".\n";
        return;
    }

    # The text to split is exported from the database by the IsisGdl program.
    # The bareword handle is kept because the reader is given the handle by
    # name below.
    # NOTE(review): $filename is interpolated into a shell command line; a file
    # name containing quotes or other shell metacharacters would be
    # misinterpreted. The list form of pipe open avoids the shell but is not
    # available on every platform/perl version, so it is not used here.
    if (!open(FILE, "IsisGdl \"$filename\" |")) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not run IsisGdl on $filename'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not run IsisGdl on $filename: $!\n";
        return;
    }

    my $reader = multiread->new();
    $reader->set_handle('ISISPlugin::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);

    # At this point $$textref is a binary byte string
    # => turn it into a Unicode aware string, so full
    # Unicode aware pattern matching can be used.
    # For instance: 's/\x{0101}//g' or '[[:upper:]]'
    $$textref = decode("utf8",$$textref);
    close(FILE);

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my %fdt_mapping = &parse_field_definition_table($self->{'fdt_file_path'}, $encoding);
    $self->{'fdt_mapping'} = \%fdt_mapping;

    # Remove the line at the start, and any blank lines, so the data is split
    # and processed properly. Runs of newlines are collapsed in one go; replacing
    # only pairs would leave a blank line behind for every odd-length run.
    $$textref =~ s/^----------\n//;
    $$textref =~ s/\n{2,}/\n/g;

    return;
}
	221
	222
	223	sub process
	224	{
	225	my $self = shift (@_);
[6332]	226	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[6107]	227	my $outhandle = $self->{'outhandle'};
	228
[20778]	229	# store the auxiliary files so we know which ones were used
	230	# (mst file becomes the source file)
	231	$doc_obj->associate_source_file($self->{'fdt_file_path'});
	232	$doc_obj->associate_source_file($self->{'xrf_file_path'});
	233
[11298]	234	my $section = $doc_obj->get_top_section();
	235	my $fdt_mapping = $self->{'fdt_mapping'};
[6107]	236	my $subfield_separator = $self->{'subfield_separator'};
	237	my $entry_separator = $self->{'entry_separator'};
[11466]	238	my $isis_record_html_metadata_value = "<table cellpadding=\"4\" cellspacing=\"0\">";
[6107]	239
	240	# Process each line of the ISIS record, one at a time
[10254]	241	foreach my $line (split(/\n/, $$textref)) {
[11430]	242	$line =~ s/(\s*)$//; # Remove any nasty whitespace (very important for Windows)
[8646]	243	$line =~ /^tag=(.*) data=(.+)$/;
[11298]	244	my $tag = $1;
	245	my $tag_data = $2;
	246	# print STDERR "\nTag: $tag, Data: $tag_data\n";
[6107]	247
[11298]	248	# Convert the tag number into a name, and remove any invalid characters
	249	my $raw_metadata_name = $fdt_mapping->{$tag}{'name'} \|\| "";
[11300]	250	$raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
[11298]	251	next if ($raw_metadata_name eq "");
	252
[6107]	253	# Metadata field names: title case, then remove spaces
[11298]	254	my $metadata_name = "";
	255	foreach my $word (split(/\s+/, $raw_metadata_name)) {
[6107]	256	substr($word, 0, 1) =~ tr/a-z/A-Z/;
[11298]	257	$metadata_name .= $word;
[6107]	258	}
	259
[11298]	260	my $all_metadata_name = $metadata_name . "^all";
	261	my $all_metadata_value = "";
[6123]	262
[11298]	263	# Handle repeatable fields
	264	if ($fdt_mapping->{$tag}{'repeatable'}) {
	265	# Multiple values are separated using the '%' character
	266	foreach my $raw_metadata_value (split(/%/, $tag_data)) {
	267	my $metadata_value = "";
[6107]	268
[11298]	269	# Handle subfields
	270	while ($raw_metadata_value ne "") {
	271	# If there is a subfield specifier, parse it off
	272	my $sub_metadata_name = $metadata_name;
[11299]	273	if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
[11298]	274	$sub_metadata_name .= "^$1";
	275	}
	276
	277	# Parse the value off and add it as metadata
	278	$raw_metadata_value =~ s/^([^\^]*)//;
[12705]	279	my $sub_metadata_value = &escape_metadata_value($1);
[11298]	280
	281	# print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
	282	if ($sub_metadata_name ne $metadata_name) {
	283	$doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
	284	}
	285
[13157]	286	# If this tag has subfields and this is the first, use the value for the CDS/ISIS ^* field
	287	if ($fdt_mapping->{$tag}{'subfields'} ne "" && $metadata_value eq "") {
[12703]	288	$doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
	289	}
	290
[11298]	291	$metadata_value .= $subfield_separator unless ($metadata_value eq "");
	292	$metadata_value .= $sub_metadata_value;
	293	}
	294
	295	# Add the metadata value
	296	# print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
	297	$doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);
	298
	299	$all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
	300	$all_metadata_value .= $metadata_value;
	301	}
	302	}
	303
	304	# Handle non-repeatable fields
	305	else {
	306	my $raw_metadata_value = $tag_data;
	307	my $metadata_value = "";
	308
	309	# Handle subfields
	310	while ($raw_metadata_value ne "") {
[6107]	311	# If there is a subfield specifier, parse it off
[11298]	312	my $sub_metadata_name = $metadata_name;
[11353]	313	if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
[11379]	314	$sub_metadata_name .= "^$1";
[6107]	315	}
	316
[11298]	317	# Parse the value off and add it as metadata
	318	$raw_metadata_value =~ s/^([^\^]*)//;
	319	my $sub_metadata_value = $1;
	320
	321	# Deal with the case when multiple values are specified using <...>
[11545]	322	if ($sub_metadata_value =~ /\<(.+)\>/) {
[11298]	323	my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
	324	my $tmp_sub_metadata_value = $sub_metadata_value;
[11545]	325	while ($tmp_sub_metadata_value =~ s/\<(.+?)\>//) {
[11298]	326	my $sub_sub_metadata_value = $1;
	327	$doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
[6107]	328	}
	329	}
[11546]	330	# Deal with the legacy case when multiple values are specified using /.../
[11545]	331	elsif ($sub_metadata_value =~ /\/(.+)\//) {
	332	my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
	333	my $tmp_sub_metadata_value = $sub_metadata_value;
	334	while ($tmp_sub_metadata_value =~ s/\/(.+?)\///) {
	335	my $sub_sub_metadata_value = $1;
	336	$doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
	337	}
	338	}
[6107]	339
[12705]	340	# Escape the metadata value so it appears correctly in the final collection
	341	$sub_metadata_value = &escape_metadata_value($sub_metadata_value);
[9998]	342
[11298]	343	# print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
	344	if ($sub_metadata_name ne $metadata_name) {
	345	$doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
[6107]	346	}
	347
[13157]	348	# If this tag has subfields and this is the first, use the value for the CDS/ISIS ^* field
	349	if ($fdt_mapping->{$tag}{'subfields'} ne "" && $metadata_value eq "") {
[12703]	350	$doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
	351	}
	352
[11298]	353	$metadata_value .= $subfield_separator unless ($metadata_value eq "");
	354	$metadata_value .= $sub_metadata_value;
[6107]	355	}
	356
[11298]	357	# Add the metadata value
	358	# print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
	359	$doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);
	360
	361	$all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
	362	$all_metadata_value .= $metadata_value;
[6107]	363	}
	364
[11298]	365	# Add the "^all" metadata value
	366	# print STDERR "All metadata name: $all_metadata_name, value: $all_metadata_value\n";
	367	$doc_obj->add_utf8_metadata($section, $all_metadata_name, $all_metadata_value);
[11465]	368
	369	$isis_record_html_metadata_value .= "<tr><td valign=top><nobr><b>" . $fdt_mapping->{$tag}{'name'} . "</b></nobr></td><td valign=top>" . $all_metadata_value . "</td></tr>";
[6107]	370	}
[8563]	371
[11467]	372	# Add a reasonably formatted HTML table view of the record as the document text
[11465]	373	$isis_record_html_metadata_value .= "</table>";
[11467]	374	$doc_obj->add_utf8_text($section, $isis_record_html_metadata_value);
[11465]	375
[11467]	376	# Add the full raw record as metadata
[12705]	377	my $isis_raw_record_metadata_value = &escape_metadata_value($$textref);
	378	$doc_obj->add_utf8_metadata($section, "ISISRawRecord", $isis_raw_record_metadata_value);
[6107]	379
[8563]	380	# Add FileFormat metadata
[11298]	381	$doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");
[7686]	382
[11298]	383	# Record was processed successfully
[6107]	384	return 1;
	385	}
	386
	387
	388	sub parse_field_definition_table
	389	{
[7686]	390	my $fdtfilename = shift(@_);
[11262]	391	my $encoding = shift(@_);
[6107]	392
[7686]	393	my %fdtmapping = ();
[6107]	394
	395	open(FDT_FILE, "<$fdtfilename") \|\| die "Error: Could not open file $fdtfilename.\n";
	396
[11262]	397	my $fdtfiletext = "";
	398	my $reader = new multiread();
[15872]	399	$reader->set_handle('ISISPlugin::FDT_FILE');
[11262]	400	$reader->set_encoding($encoding);
	401	$reader->read_file($fdtfiletext);
	402
[7686]	403	my $amongstdefinitions = 0;
[11262]	404	foreach my $fdtfileline (split(/\n/, $$fdtfiletext)) {
[6107]	405	$fdtfileline =~ s/(\s*)$//; # Remove any nasty spaces at the end of the lines
	406
	407	if ($amongstdefinitions) {
[13298]	408	my $fieldname = &unicode::substr($fdtfileline, 0, 30);
	409	my $fieldsubfields = &unicode::substr($fdtfileline, 30, 20);
	410	my $fieldspecs = &unicode::substr($fdtfileline, 50, 50);
[6107]	411
	412	# Remove extra spaces
[11298]	413	$fieldname =~ s/(\s*)$//;
[6107]	414	$fieldsubfields =~ s/(\s*)$//;
[11298]	415	$fieldspecs =~ s/(\s*)$//;
[6107]	416
[11298]	417	# Map from tag number to metadata field title, subfields, and repeatability
	418	my $fieldtag = (split(/ /, $fieldspecs))[0];
	419	my $fieldrepeatable = (split(/ /, $fieldspecs))[3];
	420	$fdtmapping{$fieldtag} = { 'name' => $fieldname,
	421	'subfields' => $fieldsubfields,
	422	'repeatable' => $fieldrepeatable };
[6107]	423	}
	424	elsif ($fdtfileline eq "***") {
	425	$amongstdefinitions = 1;
	426	}
	427	}
	428
	429	close(FDT_FILE);
	430
	431	return %fdtmapping;
	432	}
	433
	434
# Escape a metadata value so it appears correctly in the final collection:
# '<' and '>' become the HTML entities &lt; and &gt;, and every backslash is
# doubled.
#
# Takes the value as its only argument and returns the escaped copy (the
# argument itself is not modified). An undefined value is returned unchanged.
sub escape_metadata_value
{
    my $value = shift(@_);
    return $value unless (defined $value);

    # Replacing '<' with '<' (and '>' with '>') would be a no-op; the angle
    # brackets have to be turned into entities to be escaped at all.
    $value =~ s/</&lt;/g;
    $value =~ s/>/&gt;/g;
    $value =~ s/\\/\\\\/g;
    return $value;
}
	443
	444
# Remove the auxiliary files of the database once it has been exploded:
# the FDT file first, then the XRF file, using the paths recorded on $self.
# Returns whatever util::rm returns for the XRF file.
sub clean_up_after_exploding
{
    my ($self) = @_;

    # Delete the FDT and XRF files too
    util::rm($self->{'fdt_file_path'});
    return util::rm($self->{'xrf_file_path'});
}
	453
	454
[6107]	455	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: