source: main/trunk/greenstone2/perllib/plugins/OpenDocumentPlugin.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 7 years ago

renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes

  • Property svn:keywords set to Author Date Id Revision
File size: 10.1 KB
Line 
1###########################################################################
2#
3# OpenDocumentPlugin.pm -- The Open Document plugin
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 2001 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# Processes OASIS Open Document format.
27# Word processing document: .odt, template: .ott
28# Spreadsheet document: .ods, template: .ots
29# Presentation document: .odp, template: .otp
30# Graphics document: .odg, template: .otg
31# Formulas document: .odf, template: .otf (not supported)
32
# This plugin basically extracts the text from the document, but not much else.
34
# This plugin inherits from ReadXMLFile and therefore offers the -xslt
# option, but does nothing with it.
37
38package OpenDocumentPlugin;
39
40use strict;
41no strict 'refs'; # allow filehandles to be variables and viceversa
42
43use ReadXMLFile;
44use XML::XPath;
45use XML::XPath::XMLParser;
46use Cwd;
47use util;
48use ghtml;
49
# Set up inheritance at compile time: OpenDocumentPlugin is a ReadXMLFile.
BEGIN {
    @OpenDocumentPlugin::ISA = qw(ReadXMLFile);
}
53
# Names of the XML files that read() looks for (in this order) in the
# unzipped archive and hands to the XML parser.
our @filesProcess = qw(content.xml meta.xml);

# Argument descriptions specific to this plugin; new() appends them to the
# caller's "ArgList".
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
    },
];

# Option block describing this plugin; new() appends it to the caller's
# "OptList".
my $options = {
    'name'     => "OpenDocumentPlugin",
    'desc'     => "{OpenDocumentPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
68
# Returns the default process_exp for this plugin: a case-insensitive
# pattern matching file names that end in ".od" or ".ot" followed by one
# of t, s, p or g (so .odf/.otf are not matched).
sub get_default_process_exp {
    my $default_exp = '(?i)\.o(?:d|t)(?:t|s|p|g)$';
    return $default_exp;
}
70
# Constructor.
#
# Arguments:
#   $class            - class name to bless the new object into
#   $pluginlist       - array ref; this class name is pushed onto it
#   $inputargs        - passed through untouched to ReadXMLFile's constructor
#   $hashArgOptLists  - hash ref; this plugin's argument and option
#                       descriptions are pushed onto its "ArgList" and
#                       "OptList" arrays
#
# Returns the object built by ReadXMLFile's constructor, re-blessed into
# $class, with the 'section' and 'office:meta' fields initialised to "".
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    # Register ourselves before delegating to the parent constructor.
    push(@$pluginlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    # Parser state used by the xml_* callbacks.
    $self->{'section'}     = "";
    $self->{'office:meta'} = "";

    return bless($self, $class);
}
86
# We want BaseImporter's version of this check, not ReadXMLFile's, so
# forward every argument to BaseImporter::can_process_this_file and return
# whatever it returns.
sub can_process_this_file {
    my ($self, @args) = @_;

    return $self->BaseImporter::can_process_this_file(@args);
}
93
# Returns the doctype name this plugin declares: "manifest:manifest".
sub get_doctype {
    my ($self) = @_;

    my $doctype = "manifest:manifest";
    return $doctype;
}
99
# Doctype callback: dies unless the declared doctype name ($name) is
# exactly "manifest:manifest". The remaining arguments are accepted but
# not used.
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    unless ($name eq "manifest:manifest") {
        die "The only valid doctype is manifest, $name is not valid";
    }
}
105
# Called for every start tag. The $_ variable will contain a copy of the
# tag and the %_ variable will contain the element's attributes.
#
# State handling:
#   * while 'office:meta' is "Start" (i.e. we are directly inside
#     <office:meta>), the name of the element just opened is remembered
#     there so xml_end_tag can recognise its end;
#   * <office:text> and <office:meta> both clear the collected text, and
#     <office:meta> additionally sets the state to "Start";
#   * every attribute of <meta:document-statistic> is stored as metadata
#     on the top section of the current document.
sub xml_start_tag {
    my ($self, $expat, $element) = @_;
    my %attributes = %_;

    if ($self->{'office:meta'} eq "Start") {
        $self->{'office:meta'} = $element;
    }

    if ($element eq 'office:text') {
        $self->{'collectedText'} = "";
    }
    elsif ($element eq 'office:meta') {
        $self->{'collectedText'} = "";
        $self->{'office:meta'}   = "Start";
    }
    elsif ($element eq 'meta:document-statistic') {
        my $doc_obj = $self->{'doc_obj'};
        foreach my $att_name (keys %attributes) {
            $doc_obj->add_utf8_metadata("", $att_name, $attributes{$att_name});
        }
    }
}
125
# Called for every end tag.
#
#   * </office:text>: the collected text becomes the text of the top
#     section and the buffer is cleared.
#   * end of the element remembered in 'office:meta' (a child of
#     <office:meta>): non-empty collected text is stored as metadata under
#     the element name, plus "Title", "Language" or "GENERATOR" when the
#     name ends in :title, :language or :generator. The buffer is cleared
#     and the state goes back to "Start".
#   * </office:meta>: leave the metadata state.
#   * </office:body>: some documents have text in other places that should
#     probably be indexed if we can't find any doc text, so any collected
#     text is added when the document has no text yet.
sub xml_end_tag {
    my ($self, $expat, $element) = @_;
    my $doc_obj = $self->{'doc_obj'};

    if ($element eq 'office:text') {
        $doc_obj->add_utf8_text("", $self->{'collectedText'});
        $self->{'collectedText'} = "";
    }
    elsif ($element eq $self->{'office:meta'}) {
        my $meta_name  = $self->{'office:meta'};
        my $meta_value = $self->{'collectedText'};

        if ($meta_value ne "") {
            $doc_obj->add_utf8_metadata("", $meta_name, $meta_value);

            # Extra metadata names, keyed by the suffix of the element name.
            my @aliases = (
                [ qr/:title$/,     "Title"     ],
                [ qr/:language$/,  "Language"  ],
                [ qr/:generator$/, "GENERATOR" ],
            );
            foreach my $alias (@aliases) {
                my ($suffix_re, $alias_name) = @$alias;
                if ($meta_name =~ $suffix_re) {
                    $doc_obj->add_utf8_metadata("", $alias_name, $meta_value);
                }
            }
        }

        $self->{'collectedText'} = "";
        $self->{'office:meta'}   = "Start";
    }
    elsif ($element eq 'office:meta') {
        $self->{'office:meta'} = "";
    }
    elsif ($element eq 'office:body') {
        my $pending = $self->{'collectedText'};
        if (defined $pending && $pending ne "" && $doc_obj->get_text("") eq "") {
            $doc_obj->add_utf8_text("", $pending);
        }
    }
}
153
# Character-data callback. The text is read from $_. Text containing at
# least one word character is appended to the 'collectedText' buffer, with
# "<br/>" inserted between successive pieces; anything else is ignored.
sub xml_text {
    my ($self, $expat) = @_;

    if ($_ =~ m/\w/i) {
        if ($self->{'collectedText'} ne "") {
            $self->{'collectedText'} .= "<br/>";
        }
        $self->{'collectedText'} .= "$_";
    }
}
162
# Trap start and end document so we do not get our doc_obj closed too
# soon: read() calls open_document() and close_document() itself, so both
# handlers deliberately do nothing.
sub xml_start_document {
    return;
}

sub xml_end_document {
    return;
}
166
# Import one OpenDocument file.
#
# The file is copied into a freshly created temporary directory, unzipped
# there, and each file named in @OpenDocumentPlugin::filesProcess that
# exists is run through the XML parser. close_document() then finishes the
# document and passes it to $processor.
#
# Arguments used here:
#   $base_dir, $file - combined by util::get_full_filenames to locate the file
#   $metadata        - stored in $self->{'metadata'} for close_document()
#   $processor       - stored in $self->{'processor'} for close_document()
#   $gli             - when true, <Processing>/<ProcessingError> tags are
#                      printed to STDERR
# ($pluginfo, $block_hash, $maxdocs and $total_count are accepted but unused.)
#
# Returns undef if can_process_this_file() rejects the file, -1 if an error
# occurred while processing it, and 1 on success.
sub read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file($filename_full_path);

    my $outhandle = $self->{'outhandle'};
    # Report that we're processing the file
    print STDERR "<Processing n='$file' p='OpenDocumentPlugin'>\n" if ($gli);
    print $outhandle "OpenDocumentPlugin: processing $file\n"
        if ($self->{'verbosity'}) > 1;

    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
    $self->{'file'} = $file;
    $self->{'filename'} = $filename_full_path;
    $self->{'filename_no_path'} = $filename_no_path;
    $self->{'processor'} = $processor;
    $self->{'metadata'} = $metadata;

    # Start every document with clean parser state, so that text or a
    # half-finished metadata element left behind by a document that failed
    # part-way through cannot leak into this one.
    $self->{'collectedText'} = "";
    $self->{'office:meta'} = "";

    # Declared outside the eval so the clean-up below can run no matter
    # where the processing stopped.
    my ($cwd, $tmpdir);

    eval {
        my ($file_only) = $file =~ /([^\\\/]*)$/;
        $tmpdir = &util::get_tmp_filename ();
        &FileUtils::makeAllDirectories ($tmpdir);

        $self->open_document();

        # save current working directory
        $cwd = getcwd();
        chdir ($tmpdir) || die "Unable to change to $tmpdir";
        &FileUtils::copyFiles ($filename_full_path, $tmpdir);

        $self->unzip ("\"$file_only\"");
        foreach my $xmlFile (@OpenDocumentPlugin::filesProcess) {
            if (-e $xmlFile) {
                $self->{'parser'}->parsefile($xmlFile);
            }
        }
        $self->close_document($filename_full_path,$file_only);
    };
    # Capture the error straight away: the clean-up below uses eval too.
    my $error = $@;

    # Always go back to the original directory and remove the temporary
    # one, even when processing died. Otherwise one bad document would
    # leave the whole import running inside a stale temporary directory.
    my $restored_cwd = 1;
    if (defined $cwd && !chdir ($cwd)) {
        $restored_cwd = 0;
        $error ||= "Unable to change back to $cwd";
    }
    if ($restored_cwd && defined $tmpdir && -d $tmpdir) {
        eval { &FileUtils::removeFilesRecursive ($tmpdir); };
        $error ||= $@;
    }

    if ($error) {

        # parsefile may either croak somewhere in XML::Parser (e.g. because
        # the document is not well formed) or die somewhere in ReadXMLFile or a
        # derived plugin (e.g. because we're attempting to process a
        # document whose DOCTYPE is not meant for this plugin). For the
        # first case we'll print a warning and continue, for the second
        # we'll just continue quietly

        print STDERR "**** Error is: $error\n";

        my ($msg) = $error =~ /Carp::croak\(\'(.*?)\'\)/;
        if (defined $msg) {
            my $plugin_name = ref ($self);
            print $outhandle "$plugin_name failed to process $file ($msg)\n";
        }

        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1; # error during processing
    }

    return 1;
}
238
# Unzip the archive $file into the current directory and then delete the
# archive itself.
#
# $file may be wrapped in one pair of double quotes (read() passes it that
# way, a leftover from building a shell command line); they are stripped
# before use. Previously the quoted form was also used for the -e test,
# which looked for a file whose name contained the quote characters and so
# never removed the archive.
#
# unzip is invoked with the list form of system() so the file name is
# passed as a single argument and never interpreted by a shell. A non-zero
# exit status is reported on STDERR but is not fatal.
sub unzip {
    my ($self, $file) = @_;

    my $archive = $file;
    $archive =~ s/^"(.*)"$/$1/s;

    my $status = system ("unzip", $archive);
    if ($status != 0) {
        print STDERR "Warning: unzip of $archive returned status $status\n";
    }

    &FileUtils::removeFiles ($archive) if -e $archive;
}
246
# Finish the current document and hand it to the processor.
#
# Arguments:
#   $filename  - path of the source file; associated with the document and
#                used for the Source and FileSize metadata
#   $file_only - accepted for the caller's convenience but immediately
#                replaced by the document's own associated-file name
#
# Relies on the current directory being the unzipped archive: the mimetype
# is read via get_mimetype() and the thumbnail is referenced by the
# relative path "Thumbnails/thumbnail.png".
#
# Increments $self->{'num_processed'} and returns 1.
sub close_document {
    my $self = shift(@_);
    my ($filename,$file_only) = @_;

    my $doc_obj = $self->{'doc_obj'};
    my $mimetype = $self->get_mimetype();

    $file_only = $doc_obj->get_assocfile_from_sourcefile(); # url-encoded filename in archives
    $doc_obj->associate_file($filename, $file_only, $mimetype, "");
    # NOTE(review): the thumbnail is associated without checking that the
    # archive actually contained one - confirm how missing associated
    # files are handled downstream.
    $doc_obj->associate_file("Thumbnails/thumbnail.png", "thumbnail.png", "image/png", "");

    # We use set instead of add here because we only want one value
    $doc_obj->set_utf8_metadata_element("", "FileFormat", "Open Document");

    #setup to doclink thingi
    # srclink_file is now deprecated because of the "_" in the metadataname. Use srclinkFile
    $doc_obj->add_metadata ("", "srclink_file", $doc_obj->get_sourcefile());
    $doc_obj->add_metadata ("", "srclinkFile", $doc_obj->get_sourcefile());
    $doc_obj->add_utf8_metadata ("", "srcicon", "<img border=\"0\" align=\"absmiddle\" src=\"_httpprefix_/collect/[collection]/index/assoc/[assocfilepath]/thumbnail.png\" alt=\"View the Open document\" title=\"View the Open document\">");

    my $plugin_filename_encoding = $self->{'filename_encoding'};
    my $filename_encoding = $self->deduce_filename_encoding($file_only,$self->{'metadata'},$plugin_filename_encoding);

    $self->set_Source_metadata($doc_obj, $filename, $filename_encoding);
    $doc_obj->set_utf8_metadata_element("", "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj,
                           $doc_obj->get_top_section(),
                           $self->{'metadata'});

    # add a Title if none has been found yet
    $self->title_fallback($doc_obj,"",$file_only);

    # add an OID
    $self->add_OID($doc_obj);

    $doc_obj->add_utf8_metadata("", "Plugin", "$self->{'plugin_type'}");

    # process the document
    $self->{'processor'}->process($doc_obj);

    $self->{'num_processed'} ++;
    return 1;
}
295
# Return the contents of the file named "mimetype" in the current
# directory, exactly as stored (nothing is trimmed).
#
# If the file cannot be opened, a warning is printed to STDERR and the
# string "Unknown OpenDocument Format" is returned instead.
#
# Uses a lexical filehandle and three-argument open so that no global
# bareword handle is shared with other code.
sub get_mimetype {
    my $filename = "mimetype";

    my $fh;
    if (!open ($fh, '<', $filename)) {
        print STDERR "Warning: unable to open the $filename\n";
        return "Unknown OpenDocument Format";
    }

    # join() gives "" rather than undef for an empty file.
    my $text = join ('', <$fh>);
    close ($fh);

    return $text;
}
3111;
312
313
314
315
Note: See TracBrowser for help on using the repository browser.