Context Navigation

source: gsdl/trunk/perllib/plugins/GMLPlugin.pm@ 15918

Last change on this file since 15918 was 15872, checked in by kjdon, 16 years ago
plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Alos, I haven't really tested anything yet...
Property svn:executable set to ``* Property svn:keywords set to `Author Date Id Revision`
File size: 7.1 KB

Rev	Line
[537]	1	###########################################################################
	2	#
[15872]	3	# GMLPlugin.pm --
[537]	4	# A component of the Greenstone digital library software
	5	# from the New Zealand Digital Library Project at the
	6	# University of Waikato, New Zealand.
	7	#
	8	# Copyright (C) 1999 New Zealand Digital Library Project
	9	#
	10	# This program is free software; you can redistribute it and/or modify
	11	# it under the terms of the GNU General Public License as published by
	12	# the Free Software Foundation; either version 2 of the License, or
	13	# (at your option) any later version.
	14	#
	15	# This program is distributed in the hope that it will be useful,
	16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
	17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	18	# GNU General Public License for more details.
	19	#
	20	# You should have received a copy of the GNU General Public License
	21	# along with this program; if not, write to the Free Software
	22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
	23	#
	24	###########################################################################
[4]	25
[537]	26	# plugin which processes a GML format document
[840]	27	# assumes that gml tags are all in lower-case.
[537]	28
[3540]	29	# 12/05/02 Added usage datastructure - John Thompson
	30
[15872]	31	package GMLPlugin;
[4]	32
[15872]	33	use BasePlugin;
[4]	34	use util;
	35	use doc;
	36
[10254]	37	use strict;
	38	no strict 'refs'; # allow filehandles to be variables and viceversa
	39
[4]	40	sub BEGIN {
[15872]	41	@GMLPlugin::ISA = ('BasePlugin');
[4]	42	}
	43
[6408]	44	my $arguments =
	45	[ { 'name' => "process_exp",
[15872]	46	'desc' => "{BasePlugin.process_exp}",
[6408]	47	'type' => "regexp",
	48	'deft' => &get_default_process_exp() }
	49	];
	50
[15872]	51	my $options = { 'name' => "GMLPlugin",
	52	'desc' => "{GMLPlugin.desc}",
[6408]	53	'abstract' => "no",
	54	'inherits' => "yes",
	55	'args' => $arguments };
[3540]	56
[4]	57	sub new {
[10218]	58	my ($class) = shift (@_);
	59	my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
	60	push(@$pluginlist, $class);
[4]	61
[15872]	62	push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
	63	push(@{$hashArgOptLists->{"OptList"}},$options);
[3540]	64
[15872]	65	my $self = new BasePlugin($pluginlist, $inputargs, $hashArgOptLists);
[4]	66
[10218]	67	return bless $self, $class;
	68	}
	69
[1244]	70	sub get_default_process_exp {
[4]	71	my $self = shift (@_);
	72
[1269]	73	return q^(?i)\.gml?$^;
[4]	74	}
	75
[317]	76	# return number of files processed, undef if can't process
[4]	77	# Note that $base_dir might be "" and that $file might
	78	# include directories
	79	sub read {
	80	my $self = shift (@_);
[9853]	81	my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
[1424]	82	my $outhandle = $self->{'outhandle'};
[4]	83
[11090]	84	#check process and block exps, smart block, etc
	85	my ($block_status,$filename) = $self->read_block(@_);
	86	return $block_status if ((!defined $block_status) \|\| ($block_status==0));
[2795]	87
[1244]	88	$file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
[4]	89
[15872]	90	print STDERR "<Processing n='$file' p='GMLPlugin'>\n" if ($gli);
	91	print $outhandle "GMLPlugin: processing $file\n";
[4]	92
[1244]	93	my $parent_dir = $file;
	94	$parent_dir =~ s/[^\\\/]*$//;
	95	$parent_dir = &util::filename_cat ($base_dir, $parent_dir);
	96
[1269]	97	if (!open (INFILE, $filename)) {
[9584]	98	if ($gli) {
	99	print STDERR "<ProcessingError n='$file' r='Could not read $filename'>\n";
	100	}
[15872]	101	print $outhandle "GMLPlugin::read - couldn't read $filename\n";
[7362]	102	return -1;
[433]	103	}
	104
[840]	105	undef $/;
[862]	106	my $gml = <INFILE>;
[840]	107	$/ = "\n";
[4]	108	close (INFILE);
[862]	109
[840]	110	my @gml_sections = split("</gsdlsection>",$gml);
	111	$gml = shift(@gml_sections);
[4]	112
[840]	113	my $no_docs = 0;
	114
[862]	115	while (1) {
[840]	116	# create a new document
	117	my $doc_obj = new doc ();
	118	my $section = $doc_obj->get_top_section();
	119
	120	# process the document
	121	my $firstsection = 1;
[862]	122	while (1) {
	123	my ($tags, $text) = ("", "");
[840]	124
[862]	125	my @indenting_sections = split("<gsdlsection", $gml);
[2267]	126	shift(@indenting_sections); # skips over xml header if present
[840]	127
[862]	128	foreach $gml (@indenting_sections) {
[840]	129
[862]	130	if ($gml =~ /^\s([^>])>(.*)$/so) {
	131	$tags = $1 if defined $1;
[15872]	132	$text = &GMLPlugin::_unescape_text($2);
[862]	133
	134	} else {
[15872]	135	print $outhandle "GMLPlugin::read - error in file $filename\n";
[1424]	136	print $outhandle "text: \"$gml\"\n";
[840]	137	last;
	138	}
[4]	139
[840]	140	# create the section (unless this is the first section)
[862]	141	if ($firstsection) {
[863]	142	$firstsection = 0;
[862]	143	# $tags =~ /gsdlsourcefilename\s=\s(?:\"([^\"]*)\")/o;
	144	# $src_filename = $2 \|\| $3;
[840]	145
[862]	146	} else {
	147
	148	$tags =~ s/gsdlnum\s=\s\"?(\d+)\"?//o;
	149	if (defined $1) {
[840]	150	$section .= ".$1";
	151	$doc_obj->create_named_section($section);
[862]	152	} else {
[840]	153	$section = $doc_obj->insert_section($doc_obj->get_end_child($section));
	154	}
	155	}
	156
[2267]	157	# add the metadata
	158	# could be stored as either attributes or ....
[863]	159	while ((defined $tags) && ($tags =~ s/^\s(\S+)=\"([^\"])\"//o)) {
[15872]	160	$doc_obj->add_utf8_metadata($section, $1, &GMLPlugin::_unescape_text($2))
[840]	161	if (defined $1 and defined $2);
[863]	162
[840]	163	}
	164
[2267]	165	# ... or tags (xml compliant)
	166	if ($text =~ m/^\s*<metadata>/)
	167	{
[2326]	168	my ($metadata, $tagname, $tagvalue);
[2267]	169	($metadata,$text)
	170	= ($text =~ m/\s<metadata>\s(<.)\s<\/metadata>(.*)$/s);
[2363]	171
	172	# note: \1 refers to 1st match within regexp, so we can
	173	# handle the unescaped text here...
[2267]	174	while ((defined $metadata)
[2363]	175	&& ($metadata =~ s/<(.?)>(.?)<\/\1>//s))
[2267]	176	{
[2326]	177	if (defined $1 && defined $2)
	178	{
	179	$tagname = $1;
	180	$tagvalue = $2;
	181
	182	# if tagname begins with '/' it will be escaped
	183	$tagname =~ s/^&\#47;/\//;
	184
[15872]	185	$doc_obj->add_utf8_metadata($section, $tagname, &GMLPlugin::_unescape_text($tagvalue));
[2326]	186	}
[2267]	187	}
	188	}
	189
[840]	190	# add the text
[2267]	191
[840]	192	$doc_obj->add_utf8_text($section, $text)
	193	if ((defined $text) && ($text ne ""));
[4]	194	}
	195
[840]	196	$gml = shift(@gml_sections); # get next bit of data
[862]	197	last unless defined $gml;
[863]	198	last if $section eq ""; # back to top level again (more than one document in gml file)
[4]	199	$section = $doc_obj->get_parent_section ($section);
[1244]	200	} # while (1) section level
[8121]	201
	202	# add the FileFormat as the metadata
	203	$doc_obj->add_metadata($doc_obj->get_top_section(), "FileFormat", "GML");
[4]	204
[862]	205	# add the associated files
[1244]	206	my $assoc_files = $doc_obj->get_metadata($doc_obj->get_top_section(), "gsdlassocfile");
[1401]	207	my ($assoc_file_info);
[8121]	208
[1401]	209	foreach $assoc_file_info (@$assoc_files)
	210	{
[862]	211	my ($assoc_file, $mime_type, $dir) = split (":", $assoc_file_info);
[1401]	212	my $real_dir = &util::filename_cat($parent_dir, $assoc_file),
	213	my $assoc_dir = (defined $dir && $dir ne "")
	214	? &util::filename_cat($dir, $assoc_file) : $assoc_file;
	215	$doc_obj->associate_file($real_dir, $assoc_dir, $mime_type);
	216
[840]	217	}
	218	$doc_obj->delete_metadata($doc_obj->get_top_section(), "gsdlassocfile");
[1401]	219
[863]	220	# add metadata passed in from elsewhere
	221	$self->extra_metadata ($doc_obj, $doc_obj->get_top_section(), $metadata);
[862]	222
[1244]	223	# do any automatic metadata extraction
	224	$self->auto_extract_metadata ($doc_obj);
	225
	226	# assume the document has an OID already
[1401]	227
[840]	228	# process the document
	229	$processor->process($doc_obj, $file);
[862]	230
[840]	231	$no_docs++;
[9853]	232	last if ($maxdocs > -1 && ($total_count+$no_docs) >= $maxdocs);
[918]	233	last unless defined $gml && $gml =~ /\w/;
[1244]	234	} # while(1) document level
[1401]	235
[840]	236	return $no_docs; # no of docs processed
[4]	237	}
	238
[1244]	239	sub _unescape_text {
	240	my ($text) = @_;
[4]	241
[1244]	242	# special characters in the gml encoding
	243	$text =~ s/</</g;
	244	$text =~ s/>/>/g;
	245	$text =~ s/"/\"/g;
	246	$text =~ s/&/&/g; # this has to be last...
	247
	248	return $text;
	249	}
	250
[4]	251	1;

Note: See TracBrowser for help on using the repository browser.

Download in other formats: