Context Navigation

source: trunk/gsdl/perllib/plugins/ISISPlug.pm@ 9961

Last change on this file since 9961 was 8762, checked in by mdewsnip, 20 years ago
The files this plugin processes can be exploded by the explode_metadata_database.pl script.
Property svn:keywords set to `Author Date Id Revision`
File size: 8.3 KB

Rev	Line
[6107]	1	###########################################################################
	2	#
	3	# ISISPlug.pm -- A plugin for CDS/ISIS databases
	4	#
	5	# A component of the Greenstone digital library software
	6	# from the New Zealand Digital Library Project at the
	7	# University of Waikato, New Zealand.
	8	#
[7686]	9	# Copyright 1999-2004 New Zealand Digital Library Project
[6107]	10	#
	11	# This program is free software; you can redistribute it and/or modify
	12	# it under the terms of the GNU General Public License as published by
	13	# the Free Software Foundation; either version 2 of the License, or
	14	# (at your option) any later version.
	15	#
	16	# This program is distributed in the hope that it will be useful,
	17	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	18	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	19	# GNU General Public License for more details.
	20	#
	21	# You should have received a copy of the GNU General Public License
	22	# along with this program; if not, write to the Free Software
	23	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	24	#
	25	###########################################################################
	26
	27	package ISISPlug;
	28
	29
	30	use multiread;
	31	use SplitPlug;
	32
	33
	34	# ISISPlug is a sub-class of SplitPlug.
	35	sub BEGIN {
	36	@ISA = ('SplitPlug');
	37	}
	38
	39
	40	my $arguments =
[7686]	41	[ { 'name' => "process_exp",
[6408]	42	'desc' => "{BasPlug.process_exp}",
	43	'type' => "regexp",
	44	'reqd' => "no",
[6107]	45	'deft' => &get_default_process_exp() },
	46	{ 'name' => "block_exp",
	47	'desc' => "{BasPlug.block_exp}",
[6408]	48	'type' => "regexp",
[7686]	49	'reqd' => "no",
[6107]	50	'deft' => &get_default_block_exp() },
[7686]	51	{ 'name' => "split_exp",
	52	'desc' => "{SplitPlug.split_exp}",
	53	'type' => "regexp",
	54	'reqd' => "no",
	55	'deft' => &get_default_split_exp() },
	56
	57	# The interesting options
	58	{ 'name' => "entry_separator",
	59	'desc' => "{ISISPlug.entry_separator}",
	60	'type' => "string",
	61	'reqd' => "no",
	62	'deft' => "<br>" },
[6107]	63	{ 'name' => "subfield_separator",
	64	'desc' => "{ISISPlug.subfield_separator}",
	65	'type' => "string",
	66	'reqd' => "no",
[8563]	67	'deft' => ", " }
[6408]	68	];
[6107]	69
	70	my $options = { 'name' => "ISISPlug",
	71	'desc' => "{ISISPlug.desc}",
[6408]	72	'abstract' => "no",
	73	'inherits' => "yes",
[8762]	74	'explodes' => "yes",
[6107]	75	'args' => $arguments };
	76
	77
	78	# This plugin processes files with the suffix ".mst"
	79	sub get_default_process_exp {
	80	return q^(?i)(\.mst)$^;
	81	}
	82
	83
	84	# This plugin blocks files with the suffix ".fdt" and ".xrf"
	85	sub get_default_block_exp {
	86	return q^(?i)(\.fdt\|\.xrf)$^;
	87	}
	88
	89
	90	# This plugin splits the input text at the "----------" lines
	91	sub get_default_split_exp {
	92	return q^\n----------\n^;
	93	}
	94
	95
[8563]	96	sub new
	97	{
[6107]	98	my $class = shift(@_);
	99
	100	my $self = new SplitPlug($class, @_);
	101	if (!parsargv::parse(\@_,
	102	q^subfield_separator/.*/, ^, \$self->{'subfield_separator'},
	103	q^entry_separator/.*/<br>^, \$self->{'entry_separator'},
	104	"allow_extra_options")) {
	105	print STDERR "\nIncorrect options passed to ISISPlug, check your collect.cfg configuration file\n";
	106	die "\n";
	107	}
	108
	109	# To allow for proper inheritance of arguments
	110	my $option_list = $self->{'option_list'};
	111	push(@{$option_list}, $options);
[6138]	112	$self->{'plugin_type'} = "ISISPlug";
[6107]	113
	114	return bless $self, $class;
	115	}
	116
	117
[8563]	118	sub read_file
[7686]	119	{
[6107]	120	my $self = shift (@_);
	121	my ($filename, $encoding, $language, $textref) = @_;
	122
[8563]	123	my ($databasename) = ($filename =~ /(.*)\.mst$/i);
[6107]	124
[7048]	125	# Check the associated .fdt and .xrf files exist
	126	# These files must have a lowercase extension for the IsisGdl program to work
	127	# Bailing out because of this is kind of crappy but it is only an issue on Unix
	128	my $fdtfilename = $databasename . ".fdt";
	129	if (! -e $fdtfilename) {
[7049]	130	die "Error: Could not find ISIS FDT file $fdtfilename.\n";
[7048]	131	}
	132	my $xrffilename = $databasename . ".xrf";
	133	if (! -e $xrffilename) {
[7049]	134	die "Error: Could not find ISIS XRF file $xrffilename.\n";
[7048]	135	}
	136
[6107]	137	# The text to split is exported from the database by the IsisGdl program
[7021]	138	open(FILE, "IsisGdl \"$filename\" \|");
[6107]	139
	140	my $reader = new multiread();
[8563]	141	$reader->set_handle('ISISPlug::FILE');
	142	$reader->set_encoding($encoding);
	143	$reader->read_file($textref);
[6107]	144
	145	close(FILE);
	146
	147	# Parse the associated ISIS database Field Definition Table file (.fdt)
	148	my %fdtmapping = &parse_field_definition_table($fdtfilename);
	149
	150	# Map the tag numbers to tag names, using the FDT mapping
	151	$$textref =~ s/\ntag=(\d+) /\ntag=$fdtmapping{$1}{'title'} /g;
	152
[7686]	153	# Remove the line at the start so it is split and processed properly
	154	$$textref =~ s/^----------\n//;
[6107]	155	}
	156
	157
	158	sub process
	159	{
	160	my $self = shift (@_);
[6332]	161	my ($textref, $pluginfo, $base_dir, $file, $metadata, $doc_obj, $gli) = @_;
[6107]	162	my $outhandle = $self->{'outhandle'};
	163
	164	my $subfield_separator = $self->{'subfield_separator'};
	165	my $entry_separator = $self->{'entry_separator'};
	166
	167	# Report that we're processing the file
[6332]	168	print STDERR "<Processing n='$file' p='ISISPlug'>\n" if ($gli);
[6107]	169	print $outhandle "IsisPlug: processing $file\n"
	170	if ($self->{'verbosity'}) > 1;
	171
	172	# Process each line of the ISIS record, one at a time
	173	foreach $line (split(/\n/, $$textref)) {
[8646]	174	$line =~ /^tag=(.*) data=(.+)$/;
[7686]	175	my $rawtagname = $1;
	176	my $rawtagdata = $2;
[8646]	177	# print STDERR "Raw tag: $rawtagname, Raw data: $rawtagdata\n";
	178	next if ($rawtagname eq "");
[6107]	179
	180	# Metadata field names: title case, then remove spaces
[7686]	181	my $tagname = "";
[6107]	182	foreach $word (split(/\s+/, $rawtagname)) {
	183	substr($word, 0, 1) =~ tr/a-z/A-Z/;
	184	$tagname .= $word;
	185	}
	186
[6123]	187	# Make sure there is nothing bad in the tag names
	188	$tagname =~ s/&//g;
	189
[6107]	190	# Handle each piece of metadata ('%' separated)
[7686]	191	my $completetagvalue = "";
[6107]	192	foreach $rawtagvalue (split(/%/, $rawtagdata)) {
	193	$completetagvalue .= $entry_separator unless ($completetagvalue eq "");
	194
	195	# Metadata field values: take care with subfields
[7686]	196	my $completeentryvalue = "";
[6107]	197	while ($rawtagvalue ne "") {
	198	# If there is a subfield specifier, parse it off
[7686]	199	my $subfieldname = "";
[8646]	200	if ($rawtagvalue =~ s/^\^// && $rawtagvalue =~ s/([a-z])//) {
[8563]	201	$subfieldname = "^$1";
[6107]	202	}
	203
	204	# Parse the metadata value off
	205	$rawtagvalue =~ s/^([^\^]*)//;
[7686]	206	my $metadatafieldname = $tagname . $subfieldname;
	207	my $metadatafieldvalue = $1;
[6107]	208
[8749]	209	# Escape any '<' and '>' characters so they appear correctly in the final collection
	210	$metadatafieldvalue =~ s/\</</g;
	211	$metadatafieldvalue =~ s/\>/>/g;
	212
[6107]	213	# Handle Keywords specially
	214	if ($metadatafieldname eq "Keywords") {
[7686]	215	my $keywordmetadatavalue = $metadatafieldvalue;
	216	my $keywordlist = "";
[6107]	217	while ($keywordmetadatavalue =~ s/\<([^\>]+)\>//) {
[7686]	218	my $keyword = $1;
[6107]	219	$doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $keyword);
	220	$keywordlist .= ", " unless ($keywordlist eq "");
	221	$keywordlist .= $keyword;
	222	}
	223
	224	$metadatafieldvalue = $keywordlist;
	225	}
	226
	227	else {
	228	$doc_obj->add_utf8_metadata($cursection, $metadatafieldname, $metadatafieldvalue);
	229	}
	230
	231	$completeentryvalue .= $subfield_separator unless ($completeentryvalue eq "");
	232	$completeentryvalue .= $metadatafieldvalue;
	233	}
	234
	235	$completetagvalue .= $completeentryvalue;
	236	}
	237
[8563]	238	$doc_obj->add_utf8_metadata($cursection, $tagname . "^all", $completetagvalue);
[6107]	239	}
[8563]	240
[6107]	241	# Add the full record as the document text
	242	$$textref =~ s/\</</g;
	243	$$textref =~ s/\>/>/g;
[8563]	244	$doc_obj->add_utf8_text($cursection, $$textref);
[6107]	245
[8563]	246	# Add FileFormat metadata
	247	$doc_obj->add_utf8_metadata($cursection, "FileFormat", "CDS/ISIS");
[7686]	248
	249	# Record was processed successfully (and there was no document obtained)
[6107]	250	return 1;
	251	}
	252
	253
	254	sub parse_field_definition_table
	255	{
[7686]	256	my $fdtfilename = shift(@_);
[6107]	257
[7686]	258	my %fdtmapping = ();
[6107]	259
	260	open(FDT_FILE, "<$fdtfilename") \|\| die "Error: Could not open file $fdtfilename.\n";
	261
[7686]	262	my $amongstdefinitions = 0;
[6107]	263	foreach $fdtfileline (<FDT_FILE>) {
	264	$fdtfileline =~ s/(\s*)$//; # Remove any nasty spaces at the end of the lines
	265
	266	if ($amongstdefinitions) {
[7686]	267	my $fieldtitle = substr($fdtfileline, 0, 30);
	268	my $fieldsubfields = substr($fdtfileline, 30, 20);
	269	my $fieldspecs = substr($fdtfileline, 50);
[6107]	270
	271	# Remove extra spaces
	272	$fieldtitle =~ s/(\s*)$//;
	273	$fieldsubfields =~ s/(\s*)$//;
	274
	275	# Map from tag number to metadata field title and subfields
[7686]	276	my $fieldtag = (split(/ /, $fieldspecs))[0];
[6107]	277	$fdtmapping{$fieldtag} = { 'title' => $fieldtitle,
	278	'subfields' => $fieldsubfields };
	279	}
	280	elsif ($fdtfileline eq "***") {
	281	$amongstdefinitions = 1;
	282	}
	283	}
	284
	285	close(FDT_FILE);
	286
	287	return %fdtmapping;
	288	}
	289
	290
	291	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: