Context Navigation

source: gsdl/trunk/perllib/plugins/OpenDocumentPlugin.pm@ 15880

Last change on this file since 15880 was 15872, checked in by kjdon, 16 years ago
plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Also, I haven't really tested anything yet...
Property svn:keywords set to `Author Date Id Revision`
File size: 9.2 KB

Line
1	###########################################################################
2	#
3	# OpenDocumentPlugin.pm -- The Open Document plugin
4	# A component of the Greenstone digital library software
5	# from the New Zealand Digital Library Project at the
6	# University of Waikato, New Zealand.
7	#
8	# Copyright (C) 2001 New Zealand Digital Library Project
9	#
10	# This program is free software; you can redistribute it and/or modify
11	# it under the terms of the GNU General Public License as published by
12	# the Free Software Foundation; either version 2 of the License, or
13	# (at your option) any later version.
14	#
15	# This program is distributed in the hope that it will be useful,
16	# but WITHOUT ANY WARRANTY; without even the implied warranty of
17	# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18	# GNU General Public License for more details.
19	#
20	# You should have received a copy of the GNU General Public License
21	# along with this program; if not, write to the Free Software
22	# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23	#
24	###########################################################################
25
26	# Processes OASIS Open Document format.
27	# Word processing document: .odt, template: .ott
28	# Spreadsheet document: .ods, template: .ots
29	# Presentation document: .odp, template: .otp
30	# Graphics document: .odg, template: .otg
31	# Formulas document: .odf, template: .otf (not supported)
32
33	#This basically extracts any text out of the document, but not much else.
34
35	# this inherits ReadXMLFile, and therefore offers -xslt option, but does
36	# nothing with it.
37
38	package OpenDocumentPlugin;
39
40	use strict;
41	no strict 'refs'; # allow filehandles to be variables and viceversa
42
43	use ReadXMLFile;
44	use XML::XPath;
45	use XML::XPath::XMLParser;
46	use Cwd;
47	use util;
48	use ghtml;
49
# Set up inheritance at compile time: this plugin is a ReadXMLFile.
BEGIN {
    @OpenDocumentPlugin::ISA = qw(ReadXMLFile);
}
53
# Names of the XML members of an unzipped OpenDocument archive that are
# handed to the XML parser.
our @filesProcess = qw(content.xml meta.xml);

# Arguments this plugin adds to the argument list of its base classes.
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasPlug.process_exp}',
        'type' => 'regexp',
        'deft' => get_default_process_exp(),
    },
];

# Description record for this plugin, pushed onto the option list in new().
my $options = {
    'name'     => 'OpenDocumentPlugin',
    'desc'     => '{OpenDocumentPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
68
# Returns the default process expression (a regular expression held in a
# string) selecting the files this plugin handles: .odt/.ods/.odp/.odg
# documents and the corresponding .ott/.ots/.otp/.otg templates, matched
# case-insensitively. Formula files (.odf/.otf) are deliberately excluded.
sub get_default_process_exp {
    # The alternation bars must NOT be backslash-escaped: "\|" matches a
    # literal pipe character, which would stop every real file name from
    # matching.
    return q^(?i)\.o(?:d|t)(?:t|s|p|g)$^;
}
70
# Constructor.
#
# Parameters:
#   $class           - class name being constructed
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to ReadXMLFile->new
#   $hashArgOptLists - hash ref; this plugin's arguments are appended to its
#                      "ArgList" entry and its options to "OptList"
#
# Returns the ReadXMLFile object re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Use an explicit class-method call. The indirect-object form
    # "new ReadXMLFile(...)" is ambiguous to the parser inside a package
    # that defines its own new().
    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # Per-document parsing state, initially empty.
    $self->{'section'} = "";
    $self->{'office:meta'} = "";

    return bless $self, $class;
}
86
# Reports the document type this plugin expects.
# Always returns the string "manifest:manifest".
sub get_doctype {
    my ($self) = @_;

    my $expected_doctype = "manifest:manifest";
    return $expected_doctype;
}
92
# DOCTYPE handler: dies unless the declared document type name is
# "manifest:manifest".
#
# Parameters: the invocant, followed by $expat, $name, $sysid, $pubid and
# $internal; only $name is examined.
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    if ($name ne "manifest:manifest") {
        die "The only valid doctype is manifest, $name is not valid";
    }
}
98
# Start-tag handler, called for every opening tag. $_ holds a copy of the
# tag and %_ holds the element's attributes.
#
# State kept in $self->{'office:meta'}:
#   "Start"      - set when <office:meta> opens; the next element to open
#                  replaces it with its own name
#   element name - the element that opened while the state was "Start"
sub xml_start_tag {
    my ($self, $expat, $element) = @_;
    my %attributes = %_;

    # The state was "Start": remember which element has just opened.
    if ($self->{'office:meta'} eq "Start") {
        $self->{'office:meta'} = $element;
    }

    # Every attribute of the statistics element becomes top-level metadata.
    if ($element eq 'meta:document-statistic') {
        for my $name (keys %attributes) {
            $self->{'doc_obj'}->add_utf8_metadata("", $name, $attributes{$name});
        }
        return;
    }

    # Only the text body and the metadata container reset the text buffer.
    return unless $element eq 'office:text' || $element eq 'office:meta';

    $self->{'collectedText'} = "";
    $self->{'office:meta'} = "Start" if $element eq 'office:meta';
    return;
}
118
# End-tag handler.
#
#  * </office:text>: the collected text becomes the document text.
#  * end of the element currently named in $self->{'office:meta'}: any
#    collected text is stored as metadata under that element's name, and
#    additionally as Title / Language / GENERATOR when the name ends in
#    :title / :language / :generator. The state goes back to "Start".
#  * </office:meta>: the state is cleared.
#  * </office:body>: collected text is used as the document text, but only
#    if the document has no text yet.
sub xml_end_tag {
    my ($self, $expat, $element) = @_;

    if ($element eq 'office:text') {
        $self->{'doc_obj'}->add_utf8_text("", $self->{'collectedText'});
        $self->{'collectedText'} = "";
        return;
    }

    if ($element eq $self->{'office:meta'}) {
        if ($self->{'collectedText'} ne "") {
            $self->{'doc_obj'}->add_utf8_metadata("", $self->{'office:meta'}, $self->{'collectedText'});

            # Well-known fields are duplicated under a fixed metadata name.
            my @aliases = (
                [ qr/:title$/,     "Title" ],
                [ qr/:language$/,  "Language" ],
                [ qr/:generator$/, "GENERATOR" ],
            );
            for my $alias (@aliases) {
                my ($suffix, $label) = @{$alias};
                next unless $self->{'office:meta'} =~ $suffix;
                $self->{'doc_obj'}->add_utf8_metadata("", $label, $self->{'collectedText'});
            }
        }
        $self->{'collectedText'} = "";
        $self->{'office:meta'} = "Start";
        return;
    }

    if ($element eq 'office:meta') {
        $self->{'office:meta'} = "";
        return;
    }

    return unless $element eq 'office:body';

    # some documents have text in other places that should probably be
    # indexed if we can't find any doc text
    my $have_leftover = defined $self->{'collectedText'} && $self->{'collectedText'} ne "";
    if ($have_leftover && $self->{'doc_obj'}->get_text("") eq "") {
        $self->{'doc_obj'}->add_utf8_text("", $self->{'collectedText'});
    }
    return;
}
146
# Character-data handler: the text chunk arrives in $_. Chunks without any
# word character are ignored; the others are appended to
# $self->{'collectedText'}, separated from earlier text by "<br/>".
sub xml_text {
    my ($self, $expat) = @_;

    return unless $_ =~ m/\w/i;

    my $separator = ($self->{'collectedText'} ne "") ? "<br/>" : "";
    $self->{'collectedText'} .= $separator . "$_";
    return;
}
155
# Deliberate no-ops: start and end of document are trapped here so that our
# doc_obj does not get closed too soon.
sub xml_start_document {
    return;
}

sub xml_end_document {
    return;
}
159
# Plugin entry point: processes one OpenDocument file.
#
# The file is copied into a scratch directory and unzipped there. Every
# member listed in @OpenDocumentPlugin::filesProcess that exists is fed to
# the XML parser, and close_document() finishes the document.
#
# Parameters: $pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs,
# $total_count, $gli (the whole list is also forwarded to read_block).
#
# Returns:
#   the status from read_block when it is undefined or 0
#   -1 when an error occurred during processing
#    1 on success
sub read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # check process and block exps, smart block, associate_ext etc
    my ($block_status, $filename) = $self->read_block(@_);
    return $block_status if ((!defined $block_status) || ($block_status == 0));

    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
    $self->{'file'} = $file;
    $self->{'filename'} = $filename;
    $self->{'processor'} = $processor;
    $self->{'metadata'} = $metadata;

    my ($file_only) = $file =~ /([^\\\/]*)$/;

    # Declared outside the eval so that the clean-up below can run no
    # matter where processing stopped.
    my ($tmpdir, $cwd);

    eval {
        $tmpdir = &util::get_tmp_filename ();
        &util::mk_all_dir ($tmpdir);

        $self->open_document();

        # save current working directory
        $cwd = getcwd();
        chdir ($tmpdir) || die "Unable to change to $tmpdir";
        &util::cp ($filename, $tmpdir);

        # The name is passed wrapped in double quotes, as unzip() expects.
        $self->unzip ("\"$file_only\"");
        foreach my $xmlFile (@OpenDocumentPlugin::filesProcess) {
            if (-e $xmlFile) {
                $self->{'parser'}->parsefile($xmlFile);
            }
        }
        $self->close_document($filename, $file_only);
    };
    # Capture the error straight away; the clean-up must not clobber it.
    my $error = $@;

    # Always go back to the original directory and remove the scratch
    # directory, even after a failure. Otherwise one bad document would
    # leave the process sitting in a temporary directory for every file
    # that follows.
    my $restored = 1;
    if (defined $cwd) {
        $restored = chdir ($cwd);
        $error ||= "Unable to change back to $cwd" unless $restored;
    }
    if ($restored && defined $tmpdir && -d $tmpdir) {
        &util::rm_r ($tmpdir);
    }

    if ($error) {

        # parsefile may either croak somewhere in XML::Parser (e.g. because
        # the document is not well formed) or die somewhere in ReadXMLFile or a
        # derived plugin (e.g. because we're attempting to process a
        # document whose DOCTYPE is not meant for this plugin). Every error
        # is echoed to STDERR; a croak message is additionally reported on
        # the plugin's output handle.

        print STDERR "**** Error is: $error\n";

        my ($msg) = $error =~ /Carp::croak\(\'(.*?)\'\)/;
        if (defined $msg) {
            my $outhandle = $self->{'outhandle'};
            my $plugin_name = ref ($self);
            print $outhandle "$plugin_name failed to process $file ($msg)\n";
        }

        # reset ourself for the next document
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1; # error during processing
    }

    return 1;
}
225
# Unpacks an archive located in the current directory, then deletes the
# archive file itself.
#
# Parameters:
#   $file - archive file name; one pair of surrounding double quotes (as
#           passed by read()) is accepted and stripped
sub unzip {
    my $self = shift (@_);
    my ($file) = @_;

    # The name arrives quoted because it used to be pasted into a shell
    # command line. Recover the real name so that the existence test and
    # the removal below act on the actual file.
    $file =~ s/^"(.*)"$/$1/s;

    # List form: the name goes to unzip as a single argument without going
    # through a shell, so spaces and shell metacharacters in it are harmless.
    my $status = system ("unzip", $file);
    if ($status == -1) {
        print STDERR "Warning: unable to run unzip on $file: $!\n";
    }

    &util::rm ($file) if -e $file;
}
233
# Finishes the current document: associates the source file and its
# thumbnail, adds the standard metadata, and passes the document to the
# processor.
#
# Parameters:
#   $filename  - path of the source file (associated with the document and
#                measured for FileSize)
#   $file_only - file name used for the associated file, the source link
#                and the title fallback
#
# Returns 1.
sub close_document {
    my $self = shift(@_);
    my ($filename, $file_only) = @_;

    my $doc_obj = $self->{'doc_obj'};

    my $mimetype = $self->get_mimetype();

    $doc_obj->associate_file($filename, $file_only, $mimetype, "");
    # Relative path of the preview image (looked up from wherever the
    # process is currently running).
    $doc_obj->associate_file("Thumbnails/thumbnail.png", "thumbnail.png", "image/png", "");

    # We use set instead of add here because we only want one value
    $doc_obj->set_utf8_metadata_element("", "FileFormat", "Open Document");

    # set up the link to the source document, shown as the thumbnail
    my $doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/[archivedir]/$file_only\">";
    $doc_obj->add_utf8_metadata ("", "srclink", $doclink);
    $doc_obj->add_utf8_metadata ("", "srcicon", "<img border=\"0\" align=\"absmiddle\" src=\"_httpprefix_/collect/[collection]/index/assoc/[archivedir]/thumbnail.png\" alt=\"View the Open document\" title=\"View the Open document\">");
    $doc_obj->add_utf8_metadata ("", "/srclink", "</a>");
    $self->set_Source_metadata($doc_obj, $file_only);
    $doc_obj->set_utf8_metadata_element("", "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj,
                           $doc_obj->get_top_section(),
                           $self->{'metadata'});

    # add a Title if none has been found yet
    $self->title_fallback($doc_obj, "", $file_only);

    # add an OID
    $self->add_OID($doc_obj);

    $doc_obj->add_utf8_metadata("", "Plugin", "$self->{'plugin_type'}");

    # process the document
    $self->{'processor'}->process($doc_obj);

    $self->{'num_processed'} ++;
    return 1;
}
278
# Reads the MIME type of the document from the file named "mimetype" in
# the current directory.
#
# Returns the complete contents of that file, or the string
# "Unknown OpenDocument Format" (after a warning on STDERR) when the file
# cannot be opened.
sub get_mimetype {
    my $filename = "mimetype";

    # Three-argument open with a lexical handle: the mode cannot be
    # confused with the file name, and the handle is private to this call.
    my $fh;
    if (!open ($fh, '<', $filename)) {
        print STDERR "Warning: unable to open the $filename\n";
        return "Unknown OpenDocument Format";
    }

    # join over the line list yields "" for an empty file.
    my $text = join ('', <$fh>);
    close ($fh);

    return $text;
}
293	1;
294
295
296
297

Note: See TracBrowser for help on using the repository browser.

Download in other formats: