Context Navigation

source: main/trunk/greenstone2/perllib/plugins/ISISPlugin.pm@ 24547

Last change on this file since 24547 was 24547, checked in by ak19, 13 years ago
Added new abstract plugin MetadataRead that defines can_process_this_file_for_metadata that MetadataPlugin subclasses can inherit (if MetadataRead is listed first in the ISA inheritance list) and which will then override the one defined in BasePlugin. For now committing MARC, ISIS and OAIPlugins which now additionally inherit from MetadataRead. Other metadataPlugins also need to be committed.
Property svn:keywords set to `Author Date Id Revision`
File size: 14.7 KB

Rev	Line
[6107]	1	###########################################################################
	2	#
[15872]	3	# ISISPlugin.pm -- A plugin for CDS/ISIS databases
[6107]	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
[7686]	9	# Copyright 1999-2004 New Zealand Digital Library Project
[6107]	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
[15872]	27	package ISISPlugin;
[6107]	28
	29
	30	use multiread;
[15872]	31	use SplitTextFile;
[24547]	32	use MetadataRead;
[6107]	33
[10254]	34	use strict;
	35	no strict 'refs'; # allow filehandles to be variables and viceversa
[6107]	36
# ISISPlugin inherits from both MetadataRead and SplitTextFile.
# Perl searches @ISA from left to right, so when both parents define a method
# with the same name the MetadataRead version is the one that gets called.
BEGIN {
    @ISISPlugin::ISA = qw(MetadataRead SplitTextFile);
}
	42
	43
	44	my $arguments =
[7686]	45	[ { 'name' => "process_exp",
[15872]	46	'desc' => "{BasePlugin.process_exp}",
[6408]	47	'type' => "regexp",
	48	'reqd' => "no",
[6107]	49	'deft' => &get_default_process_exp() },
	50	{ 'name' => "block_exp",
[15872]	51	'desc' => "{BasePlugin.block_exp}",
[6408]	52	'type' => "regexp",
[7686]	53	'reqd' => "no",
[11329]	54	'deft' => &get_default_block_exp(),
	55	'hiddengli' => "yes" },
[7686]	56	{ 'name' => "split_exp",
[15872]	57	'desc' => "{SplitTextFile.split_exp}",
[7686]	58	'type' => "regexp",
	59	'reqd' => "no",
[11295]	60	'deft' => &get_default_split_exp(),
	61	'hiddengli' => "yes" },
[7686]	62
	63	# The interesting options
	64	{ 'name' => "entry_separator",
[15872]	65	'desc' => "{ISISPlugin.entry_separator}",
[7686]	66	'type' => "string",
	67	'reqd' => "no",
	68	'deft' => "<br>" },
[6107]	69	{ 'name' => "subfield_separator",
[15872]	70	'desc' => "{ISISPlugin.subfield_separator}",
[6107]	71	'type' => "string",
	72	'reqd' => "no",
[8563]	73	'deft' => ", " }
[6408]	74	];
[6107]	75
[15872]	76	my $options = { 'name' => "ISISPlugin",
	77	'desc' => "{ISISPlugin.desc}",
[6408]	78	'abstract' => "no",
	79	'inherits' => "yes",
[8762]	80	'explodes' => "yes",
[6107]	81	'args' => $arguments };
	82
	83
	84	# This plugin processes files with the suffix ".mst"
	85	sub get_default_process_exp {
	86	return q^(?i)(\.mst)$^;
	87	}
	88
	89
	90	# This plugin blocks files with the suffix ".fdt" and ".xrf"
	91	sub get_default_block_exp {
[17479]	92	return q^(?i)(\.fdt\|\.xrf)$^;
	93	#return "";
[6107]	94	}
	95
	96
	97	# This plugin splits the input text at the "----------" lines
	98	sub get_default_split_exp {
[9998]	99	return q^\r?\n----------\r?\n^;
[6107]	100	}
	101
	102
# Constructor.
#
# Arguments:
#   $class           - class name to bless the new plugin into
#   $pluginlist      - array ref; $class is appended to it
#   $inputargs       - passed through unchanged to SplitTextFile->new
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object built by SplitTextFile->new, re-blessed into $class.
sub new
{
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Arrow-style class method call: the indirect-object form
    # ("new SplitTextFile(...)") is ambiguous to the parser.
    my $self = SplitTextFile->new($pluginlist, $inputargs, $hashArgOptLists);
    bless $self, $class;

    # Don't worry about any options etc. when 'info_only' is set
    return $self if ($self->{'info_only'});

    # The ISIS plugin doesn't care about encoding - it assumes ascii unless
    # the user has specified an encoding
    if ($self->{'input_encoding'} eq "auto") {
        $self->{'input_encoding'} = "ascii";
    }

    return $self;
}
	126
# Blocks the .fdt and .xrf files that belong to the given .mst file.
#
# History, from the original notes: blocking by exact name was a pain on
# Windows (it blocked xxx.FDT, but if the actual file was xxx.fdt there was a
# complaint that no plugin could process it), so the plugin went back to
# using the block expression.  Blocking is now case-insensitive on Windows so
# this works, but GLI then does not know which plugin processes the fdt and
# xrf files, and adding them to the process expression causes more problems.
#
# Arguments:
#   $filename_full_path - path of the .mst file
#   $block_hash         - passed to util::block_filename for each file blocked
sub store_block_files_tmp {
    my ($self, $filename_full_path, $block_hash) = @_;

    print STDERR "in store block files\n";

    # Fills in $self->{'fdt_file_path'} and $self->{'xrf_file_path'}
    $self->check_auxiliary_files($filename_full_path);

    foreach my $path_key ('fdt_file_path', 'xrf_file_path') {
        # Only files that actually exist are blocked
        next unless (-e $self->{$path_key});

        print STDERR "$self->{$path_key}\n";
        my $auxiliary_file = $self->{$path_key};
        &util::block_filename($block_hash, $auxiliary_file);
    }
}
	153
	154	sub check_auxiliary_files {
	155	my $self = shift (@_);
	156	my ($filename) = @_;
	157
	158	my ($database_file_path_root) = ($filename =~ /(.*)\.mst$/i);
	159	# Check the associated .fdt and .xrf files exist
	160	$self->{'fdt_file_path'} = $database_file_path_root . ".FDT";
	161	if (!-e $self->{'fdt_file_path'}) {
	162	$self->{'fdt_file_path'} = $database_file_path_root . ".fdt";
	163	}
	164	$self->{'xrf_file_path'} = $database_file_path_root . ".XRF";
	165	if (!-e $self->{'xrf_file_path'}) {
	166	$self->{'xrf_file_path'} = $database_file_path_root . ".xrf";
	167	}
	168	}
	169
	170
# Exports the records of an ISIS database as text and loads its field
# definitions.
#
# The text is produced by running the external IsisGdl program on the .mst
# file and is stored in $$textref.  The accompanying .fdt file is parsed and
# the resulting tag mapping is stored in $self->{'fdt_mapping'}.
#
# Arguments:
#   $filename - path of the .mst file
#   $encoding - encoding handed to multiread for both the export and the .fdt
#   $language - not used here
#   $textref  - scalar ref that receives the exported text
#
# Returns early, leaving $$textref untouched, if the .fdt or .xrf file is
# missing or IsisGdl cannot be started; an error is printed to the plugin's
# output handle (and a <ProcessingError> line to STDERR when 'gli' is set).
sub read_file
{
    my $self = shift (@_);
    my ($filename, $encoding, $language, $textref) = @_;
    my $outhandle = $self->{'outhandle'};

    # Path shown in GLI error messages: whatever follows "import" plus one
    # separator character in the full path
    my $mst_file_path_relative = $filename;
    $mst_file_path_relative =~ s/^.+import.(.*?)$/$1/;

    # Check the associated .fdt and .xrf files exist
    $self->check_auxiliary_files($filename);

    foreach my $auxiliary_file (['fdt_file_path', "FDT"], ['xrf_file_path', "XRF"]) {
        my ($path_key, $file_type) = @$auxiliary_file;
        next if (-e $self->{$path_key});

        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not find ISIS $file_type file $self->{$path_key}'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not find ISIS $file_type file " . $self->{$path_key} . ".\n";
        return;
    }

    # The text to split is exported from the database by the IsisGdl program.
    # NOTE(review): the file name is interpolated into a shell command line, so
    # a name containing quotes or other shell metacharacters is interpreted by
    # the shell.  The list form of pipe open would avoid that, but it is only
    # available on Windows from Perl 5.22, so it is not used here.
    if (!open(FILE, "IsisGdl \"$filename\" |")) {
        print STDERR "<ProcessingError n='$mst_file_path_relative' r='Could not run IsisGdl on $filename'>\n" if ($self->{'gli'});
        print $outhandle "Error: Could not run IsisGdl on $filename: $!\n";
        return;
    }

    # multiread is given the handle by name
    my $reader = multiread->new();
    $reader->set_handle('ISISPlugin::FILE');
    $reader->set_encoding($encoding);
    $reader->read_file($textref);

    close(FILE);

    # Parse the associated ISIS database Field Definition Table file (.fdt)
    my %fdt_mapping = &parse_field_definition_table($self->{'fdt_file_path'}, $encoding);
    $self->{'fdt_mapping'} = \%fdt_mapping;

    # Remove the separator line at the start, and every run of blank lines,
    # so the data is split and processed properly
    $$textref =~ s/^----------\r?\n//;
    $$textref =~ s/\n{2,}/\n/g;
}
	213
	214
	215	sub process
	216	{
	217	my $self = shift (@_);
[6332]	218	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[6107]	219	my $outhandle = $self->{'outhandle'};
	220
[20778]	221	# store the auxiliary files so we know which ones were used
	222	# (mst file becomes the source file)
	223	$doc_obj->associate_source_file($self->{'fdt_file_path'});
	224	$doc_obj->associate_source_file($self->{'xrf_file_path'});
	225
[11298]	226	my $section = $doc_obj->get_top_section();
	227	my $fdt_mapping = $self->{'fdt_mapping'};
[6107]	228	my $subfield_separator = $self->{'subfield_separator'};
	229	my $entry_separator = $self->{'entry_separator'};
[11466]	230	my $isis_record_html_metadata_value = "<table cellpadding=\"4\" cellspacing=\"0\">";
[6107]	231
	232	# Process each line of the ISIS record, one at a time
[10254]	233	foreach my $line (split(/\n/, $$textref)) {
[11430]	234	$line =~ s/(\s*)$//; # Remove any nasty whitespace (very important for Windows)
[8646]	235	$line =~ /^tag=(.*) data=(.+)$/;
[11298]	236	my $tag = $1;
	237	my $tag_data = $2;
	238	# print STDERR "\nTag: $tag, Data: $tag_data\n";
[6107]	239
[11298]	240	# Convert the tag number into a name, and remove any invalid characters
	241	my $raw_metadata_name = $fdt_mapping->{$tag}{'name'} \|\| "";
[11300]	242	$raw_metadata_name =~ s/[,&\#\.\-\/]/ /g;
[11298]	243	next if ($raw_metadata_name eq "");
	244
[6107]	245	# Metadata field names: title case, then remove spaces
[11298]	246	my $metadata_name = "";
	247	foreach my $word (split(/\s+/, $raw_metadata_name)) {
[6107]	248	substr($word, 0, 1) =~ tr/a-z/A-Z/;
[11298]	249	$metadata_name .= $word;
[6107]	250	}
	251
[11298]	252	my $all_metadata_name = $metadata_name . "^all";
	253	my $all_metadata_value = "";
[6123]	254
[11298]	255	# Handle repeatable fields
	256	if ($fdt_mapping->{$tag}{'repeatable'}) {
	257	# Multiple values are separated using the '%' character
	258	foreach my $raw_metadata_value (split(/%/, $tag_data)) {
	259	my $metadata_value = "";
[6107]	260
[11298]	261	# Handle subfields
	262	while ($raw_metadata_value ne "") {
	263	# If there is a subfield specifier, parse it off
	264	my $sub_metadata_name = $metadata_name;
[11299]	265	if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
[11298]	266	$sub_metadata_name .= "^$1";
	267	}
	268
	269	# Parse the value off and add it as metadata
	270	$raw_metadata_value =~ s/^([^\^]*)//;
[12705]	271	my $sub_metadata_value = &escape_metadata_value($1);
[11298]	272
	273	# print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
	274	if ($sub_metadata_name ne $metadata_name) {
	275	$doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
	276	}
	277
[13157]	278	# If this tag has subfields and this is the first, use the value for the CDS/ISIS ^* field
	279	if ($fdt_mapping->{$tag}{'subfields'} ne "" && $metadata_value eq "") {
[12703]	280	$doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
	281	}
	282
[11298]	283	$metadata_value .= $subfield_separator unless ($metadata_value eq "");
	284	$metadata_value .= $sub_metadata_value;
	285	}
	286
	287	# Add the metadata value
	288	# print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
	289	$doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);
	290
	291	$all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
	292	$all_metadata_value .= $metadata_value;
	293	}
	294	}
	295
	296	# Handle non-repeatable fields
	297	else {
	298	my $raw_metadata_value = $tag_data;
	299	my $metadata_value = "";
	300
	301	# Handle subfields
	302	while ($raw_metadata_value ne "") {
[6107]	303	# If there is a subfield specifier, parse it off
[11298]	304	my $sub_metadata_name = $metadata_name;
[11353]	305	if ($raw_metadata_value =~ s/^\^// && $raw_metadata_value =~ s/^([a-z])//) {
[11379]	306	$sub_metadata_name .= "^$1";
[6107]	307	}
	308
[11298]	309	# Parse the value off and add it as metadata
	310	$raw_metadata_value =~ s/^([^\^]*)//;
	311	my $sub_metadata_value = $1;
	312
	313	# Deal with the case when multiple values are specified using <...>
[11545]	314	if ($sub_metadata_value =~ /\<(.+)\>/) {
[11298]	315	my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
	316	my $tmp_sub_metadata_value = $sub_metadata_value;
[11545]	317	while ($tmp_sub_metadata_value =~ s/\<(.+?)\>//) {
[11298]	318	my $sub_sub_metadata_value = $1;
	319	$doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
[6107]	320	}
	321	}
[11546]	322	# Deal with the legacy case when multiple values are specified using /.../
[11545]	323	elsif ($sub_metadata_value =~ /\/(.+)\//) {
	324	my $sub_sub_metadata_name = $sub_metadata_name . "^sub";
	325	my $tmp_sub_metadata_value = $sub_metadata_value;
	326	while ($tmp_sub_metadata_value =~ s/\/(.+?)\///) {
	327	my $sub_sub_metadata_value = $1;
	328	$doc_obj->add_utf8_metadata($section, $sub_sub_metadata_name, $sub_sub_metadata_value);
	329	}
	330	}
[6107]	331
[12705]	332	# Escape the metadata value so it appears correctly in the final collection
	333	$sub_metadata_value = &escape_metadata_value($sub_metadata_value);
[9998]	334
[11298]	335	# print STDERR "Sub metadata name: $sub_metadata_name, value: $sub_metadata_value\n";
	336	if ($sub_metadata_name ne $metadata_name) {
	337	$doc_obj->add_utf8_metadata($section, $sub_metadata_name, $sub_metadata_value);
[6107]	338	}
	339
[13157]	340	# If this tag has subfields and this is the first, use the value for the CDS/ISIS ^* field
	341	if ($fdt_mapping->{$tag}{'subfields'} ne "" && $metadata_value eq "") {
[12703]	342	$doc_obj->add_utf8_metadata($section, $metadata_name . "^*", $sub_metadata_value);
	343	}
	344
[11298]	345	$metadata_value .= $subfield_separator unless ($metadata_value eq "");
	346	$metadata_value .= $sub_metadata_value;
[6107]	347	}
	348
[11298]	349	# Add the metadata value
	350	# print STDERR "Metadata name: $metadata_name, value: $metadata_value\n";
	351	$doc_obj->add_utf8_metadata($section, $metadata_name, $metadata_value);
	352
	353	$all_metadata_value .= $entry_separator unless ($all_metadata_value eq "");
	354	$all_metadata_value .= $metadata_value;
[6107]	355	}
	356
[11298]	357	# Add the "^all" metadata value
	358	# print STDERR "All metadata name: $all_metadata_name, value: $all_metadata_value\n";
	359	$doc_obj->add_utf8_metadata($section, $all_metadata_name, $all_metadata_value);
[11465]	360
	361	$isis_record_html_metadata_value .= "<tr><td valign=top><nobr><b>" . $fdt_mapping->{$tag}{'name'} . "</b></nobr></td><td valign=top>" . $all_metadata_value . "</td></tr>";
[6107]	362	}
[8563]	363
[11467]	364	# Add a reasonably formatted HTML table view of the record as the document text
[11465]	365	$isis_record_html_metadata_value .= "</table>";
[11467]	366	$doc_obj->add_utf8_text($section, $isis_record_html_metadata_value);
[11465]	367
[11467]	368	# Add the full raw record as metadata
[12705]	369	my $isis_raw_record_metadata_value = &escape_metadata_value($$textref);
	370	$doc_obj->add_utf8_metadata($section, "ISISRawRecord", $isis_raw_record_metadata_value);
[6107]	371
[8563]	372	# Add FileFormat metadata
[11298]	373	$doc_obj->add_utf8_metadata($section, "FileFormat", "CDS/ISIS");
[7686]	374
[11298]	375	# Record was processed successfully
[6107]	376	return 1;
	377	}
	378
	379
	380	sub parse_field_definition_table
	381	{
[7686]	382	my $fdtfilename = shift(@_);
[11262]	383	my $encoding = shift(@_);
[6107]	384
[7686]	385	my %fdtmapping = ();
[6107]	386
	387	open(FDT_FILE, "<$fdtfilename") \|\| die "Error: Could not open file $fdtfilename.\n";
	388
[11262]	389	my $fdtfiletext = "";
	390	my $reader = new multiread();
[15872]	391	$reader->set_handle('ISISPlugin::FDT_FILE');
[11262]	392	$reader->set_encoding($encoding);
	393	$reader->read_file($fdtfiletext);
	394
[7686]	395	my $amongstdefinitions = 0;
[11262]	396	foreach my $fdtfileline (split(/\n/, $$fdtfiletext)) {
[6107]	397	$fdtfileline =~ s/(\s*)$//; # Remove any nasty spaces at the end of the lines
	398
	399	if ($amongstdefinitions) {
[13298]	400	my $fieldname = &unicode::substr($fdtfileline, 0, 30);
	401	my $fieldsubfields = &unicode::substr($fdtfileline, 30, 20);
	402	my $fieldspecs = &unicode::substr($fdtfileline, 50, 50);
[6107]	403
	404	# Remove extra spaces
[11298]	405	$fieldname =~ s/(\s*)$//;
[6107]	406	$fieldsubfields =~ s/(\s*)$//;
[11298]	407	$fieldspecs =~ s/(\s*)$//;
[6107]	408
[11298]	409	# Map from tag number to metadata field title, subfields, and repeatability
	410	my $fieldtag = (split(/ /, $fieldspecs))[0];
	411	my $fieldrepeatable = (split(/ /, $fieldspecs))[3];
	412	$fdtmapping{$fieldtag} = { 'name' => $fieldname,
	413	'subfields' => $fieldsubfields,
	414	'repeatable' => $fieldrepeatable };
[6107]	415	}
	416	elsif ($fdtfileline eq "***") {
	417	$amongstdefinitions = 1;
	418	}
	419	}
	420
	421	close(FDT_FILE);
	422
	423	return %fdtmapping;
	424	}
	425
	426
# Escapes a metadata value so it appears correctly in the final collection:
# "<" becomes "&lt;", ">" becomes "&gt;" and every backslash is doubled.
#
# Arguments:
#   $value - the string to escape
#
# Returns the escaped copy; the caller's string is not modified.
sub escape_metadata_value
{
    my $value = shift(@_);
    $value =~ s/</&lt;/g;
    $value =~ s/>/&gt;/g;
    $value =~ s/\\/\\\\/g;
    return $value;
}
	435
	436
# Removes the auxiliary files of the database: util::rm is called on the
# stored FDT path and then on the stored XRF path.
sub clean_up_after_exploding
{
    my ($self) = @_;

    # Delete the FDT and XRF files too
    foreach my $path_key ('fdt_file_path', 'xrf_file_path') {
        &util::rm($self->{$path_key});
    }
}
	445
	446
[6107]	447	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: