source: gsdl/trunk/perllib/plugins/OpenDocumentPlugin.pm@ 15880

Last change on this file since 15880 was 15872, checked in by kjdon, 16 years ago

plugin overhaul: plugins renamed to xxPlugin, and in some cases the names are made more sensible. They now use the new base plugins. Hopefully we have better code reuse. Some of the plugins still need work done as I didn't want to spend another month doing this before committing it. Also, I haven't really tested anything yet...

  • Property svn:keywords set to Author Date Id Revision
File size: 9.2 KB
Line 
1###########################################################################
2#
3# OpenDocumentPlugin.pm -- The Open Document plugin
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes OASIS Open Document format.
27# Word processing document: .odt, template: .ott
28# Spreadsheet document: .ods, template: .ots
29# Presentation document: .odp, template: .otp
30# Graphics document: .odg, template: .otg
31# Formulas document: .odf, template: .otf (not supported)
32
33#This basically extracts any text out of the document, but not much else.
34
35# this inherits ReadXMLFile, and therefore offers -xslt option, but does
36# nothing with it.
37
38package OpenDocumentPlugin;
39
40use strict;
41no strict 'refs'; # allow filehandles to be variables and viceversa
42
43use ReadXMLFile;
44use XML::XPath;
45use XML::XPath::XMLParser;
46use Cwd;
47use util;
48use ghtml;
49
# Establish inheritance at compile time: this plugin derives from ReadXMLFile.
BEGIN {
    @OpenDocumentPlugin::ISA = qw(ReadXMLFile);
}
53
# Names of the XML members of the unzipped archive that read() hands to the
# parser, in the order they are parsed.
our @filesProcess = qw(content.xml meta.xml);

# Arguments declared by this plugin; new() appends them to the shared
# "ArgList". The default for process_exp comes from
# get_default_process_exp().
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasPlug.process_exp}",
        'type' => "regexp",
        'deft' => &get_default_process_exp(),
    },
];

# Plugin description record; new() appends it to the shared "OptList".
my $options = {
    'name'     => "OpenDocumentPlugin",
    'desc'     => "{OpenDocumentPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
68
# Returns the default process_exp pattern (as a string, not a compiled regex).
# It matches, case-insensitively, file names ending in .odt/.ods/.odp/.odg
# and the template variants .ott/.ots/.otp/.otg. Formula files (.odf/.otf)
# are deliberately not matched.
sub get_default_process_exp {
    my $pattern = q^(?i)\.o(?:d|t)(?:t|s|p|g)$^;
    return $pattern;
}
70
# Constructor.
#
# Parameters:
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to ReadXMLFile->new
#   $hashArgOptLists - hash ref; this plugin's argument list is appended to
#                      its "ArgList" entry and its option record to "OptList"
#
# Returns the object built by ReadXMLFile->new, re-blessed into $class, with
# the parser state used by the xml_* handlers initialised:
#   'office:meta'   - "" outside <office:meta>; "Start" while inside it and
#                     waiting for a child element; otherwise the name of the
#                     child element currently being collected
#   'collectedText' - text accumulated by xml_text; initialised to "" so the
#                     string comparisons in the handlers never see undef
#   'section'       - initialised to ""; not used by the code in this file
sub new {
    my ($class) = shift (@_);
    my ($pluginlist,$inputargs,$hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    # Direct method call rather than the ambiguous indirect-object form
    # "new ReadXMLFile(...)".
    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    $self->{'section'} = "";
    $self->{'office:meta'} = "";
    $self->{'collectedText'} = "";

    return bless $self, $class;
}
86
# Returns the document type name this plugin declares: "manifest:manifest".
sub get_doctype {
    my ($self) = @_;

    my $doctype = "manifest:manifest";
    return $doctype;
}
92
# DOCTYPE handler. Dies unless the declared doctype name is exactly
# "manifest:manifest"; the remaining arguments are accepted but unused.
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    unless ($name eq "manifest:manifest") {
        die "The only valid doctype is manifest, $name is not valid";
    }
}
98
# Start-tag handler. On entry the global %_ holds the element's attributes
# (a private copy is taken straight away).
#
# Behaviour:
#   - if we are inside <office:meta> and waiting for a child ("Start"), the
#     current element name is remembered as the metadata name being collected
#   - <office:text> resets the collected text
#   - <office:meta> resets the collected text and enters the "Start" state
#   - <meta:document-statistic> stores each attribute as metadata on the
#     top-level section of the document object
sub xml_start_tag {
    my ($self, $expat, $element) = @_;
    my %attributes = %_;

    # This must happen before the element-specific handling below.
    if ($self->{'office:meta'} eq "Start") {
        $self->{'office:meta'} = $element;
    }

    if ($element eq 'office:text') {
        $self->{'collectedText'} = "";
        return;
    }

    if ($element eq 'office:meta') {
        $self->{'collectedText'} = "";
        $self->{'office:meta'} = "Start";
        return;
    }

    if ($element eq 'meta:document-statistic') {
        foreach my $name (keys %attributes) {
            $self->{'doc_obj'}->add_utf8_metadata("", $name, $attributes{$name});
        }
    }
}
118
# End-tag handler.
#
#   - </office:text>: the collected text becomes the text of the top-level
#     section and the buffer is cleared
#   - end of the metadata element currently being collected: if any text was
#     collected it is stored under the element's own name, and additionally
#     as Title / Language / GENERATOR when the name ends in :title /
#     :language / :generator; the state then returns to "Start"
#   - </office:meta>: leaves metadata mode
#   - </office:body>: if the document still has no text, any text collected
#     elsewhere is used as the document text instead
sub xml_end_tag {
    my ($self, $expat, $element) = @_;
    my $doc_obj = $self->{'doc_obj'};

    if ($element eq 'office:text') {
        $doc_obj->add_utf8_text("", $self->{'collectedText'});
        $self->{'collectedText'} = "";
        return;
    }

    if ($element eq $self->{'office:meta'}) {
        my $meta_name  = $self->{'office:meta'};
        my $meta_value = $self->{'collectedText'};

        if ($meta_value ne "") {
            $doc_obj->add_utf8_metadata("", $meta_name, $meta_value);
            if ($meta_name =~ m/:title$/) {
                $doc_obj->add_utf8_metadata("", "Title", $meta_value);
            }
            if ($meta_name =~ m/:language$/) {
                $doc_obj->add_utf8_metadata("", "Language", $meta_value);
            }
            if ($meta_name =~ m/:generator$/) {
                $doc_obj->add_utf8_metadata("", "GENERATOR", $meta_value);
            }
        }

        $self->{'collectedText'} = "";
        $self->{'office:meta'} = "Start";
        return;
    }

    if ($element eq 'office:meta') {
        $self->{'office:meta'} = "";
        return;
    }

    if ($element eq 'office:body') {
        # some documents have text in other places that should probably be
        # indexed if we can't find any doc text
        my $leftover = $self->{'collectedText'};
        if (defined $leftover && $leftover ne "" && $doc_obj->get_text("") eq "") {
            $doc_obj->add_utf8_text("", $leftover);
        }
    }
}
146
# Character-data handler. The text chunk is in $_. Chunks that contain no
# word character are ignored; other chunks are appended to the collected
# text, separated from earlier content by "<br/>".
sub xml_text {
    my ($self, $expat) = @_;

    return unless ($_ =~ m/\w/i);

    if ($self->{'collectedText'} ne "") {
        $self->{'collectedText'} .= "<br/>";
    }
    $self->{'collectedText'} .= "$_";
}
155
# The start- and end-of-document handlers are overridden with no-ops so that
# parsing an individual XML file does not close our doc_obj too soon; read()
# calls open_document() and close_document() itself around the parsing.
sub xml_start_document {
    return;
}

sub xml_end_document {
    return;
}
159
# Processes one OpenDocument file.
#
# The archive is copied into a fresh temporary directory, unzipped there,
# and each file named in @OpenDocumentPlugin::filesProcess that exists is
# run through the XML parser. close_document() then finishes the document
# object and passes it to the processor.
#
# Parameters (as passed by the plugin framework):
#   $pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs,
#   $total_count, $gli
# Only $file, $metadata, $processor and $gli are used directly here; the
# full argument list is forwarded to read_block().
#
# Returns:
#   the status from read_block() when it is undefined or 0,
#   -1 if an error occurred during processing,
#    1 on success.
#
# The working directory is always restored and the temporary directory
# removed, whether or not processing succeeded.
sub read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # check process and block exps, smart block, associate_ext etc
    my ($block_status,$filename) = $self->read_block(@_);
    return $block_status if ((!defined $block_status) || ($block_status==0));

    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
    $self->{'file'} = $file;
    $self->{'filename'} = $filename;
    $self->{'processor'} = $processor;
    $self->{'metadata'} = $metadata;

    # Declared outside the eval so the cleanup below can see them even when
    # the eval block dies part-way through.
    my $cwd;
    my $tmpdir;
    my $in_tmpdir = 0;

    eval{
        my ($file_only) = $file =~ /([^\\\/]*)$/;
        $tmpdir = &util::get_tmp_filename ();
        &util::mk_all_dir ($tmpdir);

        $self->open_document();

        # save current working directory
        $cwd = getcwd();

        # Copy before changing directory so that a relative $filename is
        # still resolved against the original working directory.
        &util::cp ($filename, $tmpdir);

        chdir ($tmpdir) || die "Unable to change to $tmpdir";
        $in_tmpdir = 1;

        $self->unzip ("\"$file_only\"");
        foreach my $xmlFile (@OpenDocumentPlugin::filesProcess) {
            if (-e $xmlFile) {
                $self->{'parser'}->parsefile($xmlFile);
            }
        }
        $self->close_document($filename,$file_only);
    };
    # Capture the error at once: the cleanup below could clobber $@.
    my $error = $@;

    # Always leave the temporary directory again, even after a failure, so
    # that later documents are not processed from inside it.
    if ($in_tmpdir) {
        if (chdir ($cwd)) {
            $in_tmpdir = 0;
        }
        elsif (!$error) {
            $error = "Unable to change back to $cwd\n";
        }
    }

    # Remove the temporary directory unless we are still inside it.
    if (defined $tmpdir && !$in_tmpdir && -e $tmpdir) {
        &util::rm_r ($tmpdir);
    }

    if ($error) {

        # parsefile may either croak somewhere in XML::Parser (e.g. because
        # the document is not well formed) or die somewhere in ReadXMLFile or a
        # derived plugin (e.g. because we're attempting to process a
        # document whose DOCTYPE is not meant for this plugin). For the
        # first case we'll print a warning and continue, for the second
        # we'll just continue quietly

        print STDERR "**** Error is: $error\n";

        my ($msg) = $error =~ /Carp::croak\(\'(.*?)\'\)/;
        if (defined $msg) {
            my $outhandle = $self->{'outhandle'};
            my $plugin_name = ref ($self);
            print $outhandle "$plugin_name failed to process $file ($msg)\n";
        }

        # reset ourself for the next document
        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1; # error during processing
    }

    return 1;
}
225
# Unpacks an archive into the current working directory with the external
# "unzip" program, then deletes the archive file.
#
# Parameters:
#   $file - name of the archive, relative to the current directory. For
#           backward compatibility it may be wrapped in one pair of double
#           quotes (read() passes it that way); the quotes are stripped.
#
# A failing unzip is reported on STDERR but is not fatal.
sub unzip {
    my $self = shift (@_);
    my ($file) = @_;

    # The old implementation interpolated $file into a shell command, so
    # callers quote the name themselves. Strip that quoting: the name is now
    # passed to unzip directly, and the -e test below needs the real name
    # (with the quotes still attached it never matched, so the archive was
    # never removed).
    $file =~ s/^"(.*)"$/$1/s;

    # List form of system: no shell is involved, so spaces, quotes, '$' or
    # backticks in the file name cannot be misinterpreted.
    my $status = system ("unzip", $file);
    if ($status != 0) {
        print STDERR "Warning: unzip of $file failed (status $status)\n";
    }

    &util::rm ($file) if -e $file;
}
233
# Finishes the document object built while parsing and passes it to the
# processor.
#
# Parameters:
#   $filename  - path of the original OpenDocument file; it is associated
#                with the document and its size recorded as FileSize
#   $file_only - the file name without directories; used as the associated
#                file name, in the srclink URL and as the Source / fallback
#                Title
#
# Reads "mimetype" (via get_mimetype) and "Thumbnails/thumbnail.png" relative
# to the current working directory, which read() has set to the directory
# holding the unzipped archive.
#
# Returns 1.
sub close_document {
    my $self = shift(@_);
    my ($filename,$file_only) = @_;

    my $doc_obj = $self->{'doc_obj'};

    my $mimetype = $self->get_mimetype();

    $doc_obj->associate_file($filename, $file_only, $mimetype, "");

    # Not every archive carries a thumbnail; only associate it when present
    # rather than registering a file that does not exist.
    my $thumbnail = "Thumbnails/thumbnail.png";
    if (-e $thumbnail) {
        $doc_obj->associate_file($thumbnail, "thumbnail.png", "image/png", "");
    }

    # We use set instead of add here because we only want one value
    $doc_obj->set_utf8_metadata_element("", "FileFormat", "Open Document");

    # set up the link back to the source document
    my $doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/[archivedir]/$file_only\">";
    $doc_obj->add_utf8_metadata ("", "srclink", $doclink);
    $doc_obj->add_utf8_metadata ("", "srcicon", "<img border=\"0\" align=\"absmiddle\" src=\"_httpprefix_/collect/[collection]/index/assoc/[archivedir]/thumbnail.png\" alt=\"View the Open document\" title=\"View the Open document\">");
    $doc_obj->add_utf8_metadata ("", "/srclink", "</a>");
    $self->set_Source_metadata($doc_obj, $file_only);
    $doc_obj->set_utf8_metadata_element("", "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj,
                           $doc_obj->get_top_section(),
                           $self->{'metadata'});

    # add a Title if none has been found yet
    $self->title_fallback($doc_obj,"",$file_only);

    # add an OID
    $self->add_OID($doc_obj);

    $doc_obj->add_utf8_metadata("", "Plugin", "$self->{'plugin_type'}");

    # process the document
    $self->{'processor'}->process($doc_obj);

    $self->{'num_processed'} ++;
    return 1;
}
278
# Returns the contents of the file named "mimetype" in the current working
# directory (read() changes into the unzipped archive before this is
# called). The contents are returned unmodified.
#
# If the file cannot be opened, a warning is printed to STDERR and the
# string "Unknown OpenDocument Format" is returned instead.
sub get_mimetype {
    my $filename = "mimetype";

    # Lexical handle and three-argument open: the handle is private to this
    # call and is closed explicitly, so no handle stays open between
    # documents.
    my $fh;
    if (!open ($fh, "<", $filename)) {
        print STDERR "Warning: unable to open the $filename\n";
        return "Unknown OpenDocument Format";
    }

    my $text = "";
    while (defined (my $line = <$fh>)) {
        $text .= $line;
    }
    close ($fh);

    return $text;
}
2931;
294
295
296
297
Note: See TracBrowser for help on using the repository browser.