[10997] | 1 | ###########################################################################
|
---|
| 2 | #
|
---|
[15872] | 3 | # OpenDocumentPlugin.pm -- The Open Document plugin
|
---|
[10997] | 4 | # A component of the Greenstone digital library software
|
---|
| 5 | # from the New Zealand Digital Library Project at the
|
---|
| 6 | # University of Waikato, New Zealand.
|
---|
| 7 | #
|
---|
| 8 | # Copyright (C) 2001 New Zealand Digital Library Project
|
---|
| 9 | #
|
---|
| 10 | # This program is free software; you can redistribute it and/or modify
|
---|
| 11 | # it under the terms of the GNU General Public License as published by
|
---|
| 12 | # the Free Software Foundation; either version 2 of the License, or
|
---|
| 13 | # (at your option) any later version.
|
---|
| 14 | #
|
---|
| 15 | # This program is distributed in the hope that it will be useful,
|
---|
| 16 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 17 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 18 | # GNU General Public License for more details.
|
---|
| 19 | #
|
---|
| 20 | # You should have received a copy of the GNU General Public License
|
---|
| 21 | # along with this program; if not, write to the Free Software
|
---|
| 22 | # Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
|
---|
| 23 | #
|
---|
| 24 | ###########################################################################
|
---|
[13357] | 25 |
|
---|
| 26 | # Processes OASIS Open Document format.
|
---|
| 27 | # Word processing document: .odt, template: .ott
|
---|
| 28 | # Spreadsheet document: .ods, template: .ots
|
---|
| 29 | # Presentation document: .odp, template: .otp
|
---|
| 30 | # Graphics document: .odg, template: .otg
|
---|
| 31 | # Formulas document: .odf, template: .otf (not supported)
|
---|
| 32 |
|
---|
| 33 | #This basically extracts any text out of the document, but not much else.
|
---|
| 34 |
|
---|
[15872] | 35 | # this inherits ReadXMLFile, and therefore offers -xslt option, but does
|
---|
| 36 | # nothing with it.
|
---|
[10997] | 37 |
|
---|
[15872] | 38 | package OpenDocumentPlugin;
|
---|
| 39 |
|
---|
[10997] | 40 | use strict;
|
---|
| 41 | no strict 'refs'; # allow filehandles to be variables and viceversa
|
---|
| 42 |
|
---|
[15872] | 43 | use ReadXMLFile;
|
---|
[10997] | 44 | use XML::XPath;
|
---|
| 45 | use XML::XPath::XMLParser;
|
---|
| 46 | use Cwd;
|
---|
| 47 | use util;
|
---|
| 48 | use ghtml;
|
---|
| 49 |
|
---|
# Set up inheritance at compile time: this plugin derives from ReadXMLFile.
BEGIN {
    @OpenDocumentPlugin::ISA = qw(ReadXMLFile);
}
|
---|
| 53 |
|
---|
# Members of the unpacked archive that read() parses, in this order.
our @filesProcess = qw(content.xml meta.xml);

# Argument descriptions that new() appends to the shared "ArgList".
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BasePlugin.process_exp}",
        'type' => "regexp",
        'deft' => get_default_process_exp(),
    },
];

# Plugin description that new() appends to the shared "OptList".
my $options = {
    'name'     => "OpenDocumentPlugin",
    'desc'     => "{OpenDocumentPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
|
---|
[10997] | 68 |
|
---|
# Returns, as a string, the default regular expression selecting the file
# names this plugin handles: a case-insensitive ".od?" or ".ot?" suffix
# where "?" is one of t, s, p or g.
sub get_default_process_exp {
    my $pattern = q^(?i)\.o(?:d|t)(?:t|s|p|g)$^;
    return $pattern;
}
|
---|
| 70 |
|
---|
# Constructor.
#
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the ReadXMLFile constructor
#   $hashArgOptLists - hash ref; this plugin's argument descriptions are
#                      appended to its "ArgList" array and its option block
#                      to its "OptList" array
#
# Returns the object built by ReadXMLFile, re-blessed into $class, with the
# 'section' and 'office:meta' state fields initialised to "".
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call. The indirect form "new ReadXMLFile(...)"
    # depends on how the parser happens to resolve the two barewords and is
    # discouraged by perlobj.
    my $self = ReadXMLFile->new($pluginlist, $inputargs, $hashArgOptLists);

    $self->{'section'} = "";
    # Parser state for the metadata section: "" outside it, "Start" while
    # waiting for a child element, otherwise the name of the current child.
    $self->{'office:meta'} = "";

    return bless $self, $class;
}
|
---|
| 86 |
|
---|
# Returns the name of the doctype this plugin handles.
sub get_doctype {
    my ($self) = @_;
    return "manifest:manifest";
}
|
---|
[10997] | 92 |
|
---|
# Doctype handler: dies unless the declared doctype name is
# "manifest:manifest". The remaining arguments are accepted but not used.
sub xml_doctype {
    my ($self, $expat, $name, $sysid, $pubid, $internal) = @_;

    if ($name ne "manifest:manifest") {
        die "The only valid doctype is manifest, $name is not valid";
    }
}
|
---|
| 98 |
|
---|
# Start-tag handler, called for every opening element. On entry $_ holds a
# copy of the tag and %_ holds the element's attributes.
#
#   office:text             - begin collecting body text
#   office:meta             - begin collecting metadata; the next element
#                             opened is remembered as the current metadata tag
#   meta:document-statistic - every attribute becomes top-level metadata
sub xml_start_tag {
    my ($self, $expat, $element) = @_;
    my %attributes = %_;

    # First element opened after <office:meta> (or after the previous
    # metadata child closed): remember its name.
    if ($self->{'office:meta'} eq "Start") {
        $self->{'office:meta'} = $element;
    }

    if ($element eq 'office:text' || $element eq 'office:meta') {
        $self->{'collectedText'} = "";
        $self->{'office:meta'} = "Start" if $element eq 'office:meta';
    }
    elsif ($element eq 'meta:document-statistic') {
        foreach my $stat_name (keys %attributes) {
            $self->{'doc_obj'}->add_utf8_metadata("", $stat_name, $attributes{$stat_name});
        }
    }
}
|
---|
| 118 |
|
---|
# End-tag handler.
#
#   office:text        - the collected text becomes the document text
#   current meta child - non-empty collected text is stored as metadata under
#                        the element name, plus "Title", "Language" or
#                        "GENERATOR" when the name ends in :title, :language
#                        or :generator; then wait for the next child
#   office:meta        - leave the metadata section
#   office:body        - if the document still has no text, fall back to
#                        whatever text was collected elsewhere
sub xml_end_tag {
    my ($self, $expat, $element) = @_;

    if ($element eq 'office:text') {
        $self->{'doc_obj'}->add_utf8_text("", $self->{'collectedText'});
        $self->{'collectedText'} = "";
    }
    elsif ($element eq $self->{'office:meta'}) {
        my $meta_name  = $self->{'office:meta'};
        my $meta_value = $self->{'collectedText'};

        if ($meta_value ne "") {
            my $doc = $self->{'doc_obj'};
            $doc->add_utf8_metadata("", $meta_name, $meta_value);

            # Extra metadata names, checked in this fixed order.
            my @aliases = (
                [ qr/:title$/,     "Title" ],
                [ qr/:language$/,  "Language" ],
                [ qr/:generator$/, "GENERATOR" ],
            );
            foreach my $alias (@aliases) {
                my ($suffix, $alias_name) = @$alias;
                $doc->add_utf8_metadata("", $alias_name, $meta_value) if $meta_name =~ $suffix;
            }
        }

        $self->{'collectedText'} = "";
        $self->{'office:meta'} = "Start";
    }
    elsif ($element eq 'office:meta') {
        $self->{'office:meta'} = "";
    }
    elsif ($element eq 'office:body') {
        # some documents have text in other places that should probably be
        # indexed if we can't find any doc text
        my $pending = $self->{'collectedText'};
        if (defined $pending && $pending ne "" && $self->{'doc_obj'}->get_text("") eq "") {
            $self->{'doc_obj'}->add_utf8_text("", $pending);
        }
    }
}
|
---|
| 146 |
|
---|
# Character-data handler. The text of the current node arrives in $_.
#
# Text containing at least one word character is appended to
# $self->{'collectedText'}; successive pieces are separated by "<br/>".
# Text with no word characters is ignored.
sub xml_text {
    my ($self, $expat) = @_;

    return unless defined $_ && $_ =~ m/\w/;

    # new() never initialises collectedText, so it is still undef when text
    # arrives before any office:text / office:meta start tag. Treat that as
    # empty rather than comparing undef (a warning under -w).
    my $collected = $self->{'collectedText'};
    $collected = "" unless defined $collected;

    $collected .= "<br/>" if $collected ne "";
    $self->{'collectedText'} = $collected . $_;

    return;
}
|
---|
| 155 |
|
---|
# Deliberately empty document-level handlers. read() calls open_document()
# and close_document() itself around the parse of each archive member, so
# these hooks are trapped here to keep doc_obj from being closed too soon.
sub xml_start_document {
    return;
}

sub xml_end_document {
    return;
}
|
---|
| 159 |
|
---|
# Import one OpenDocument archive.
#
# The archive is copied into a fresh temporary directory and unzipped there.
# Each member listed in @OpenDocumentPlugin::filesProcess that exists is fed
# to the XML parser, and the resulting document is finished off by
# close_document().
#
# Arguments (after $self): $pluginfo, $base_dir, $file, $block_hash,
# $metadata, $processor, $maxdocs, $total_count, $gli. Only $base_dir, $file,
# $metadata, $processor and $gli are used here.
#
# Returns:
#   undef - this plugin does not handle the file
#   -1    - an error occurred while processing
#    1    - the document was processed
sub read {
    my $self = shift (@_);

    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = util::get_full_filenames($base_dir, $file);
    # An explicit undef is kept as the "not handled here" result.
    return undef unless $self->can_process_this_file($filename_full_path);

    my $outhandle = $self->{'outhandle'};
    # Report that we're processing the file
    print STDERR "<Processing n='$file' p='OpenDocumentPlugin'>\n" if ($gli);
    print $outhandle "OpenDocumentPlugin: processing $file\n"
        if ($self->{'verbosity'}) > 1;

    $file =~ s/^[\/\\]+//; # $file often begins with / so we'll tidy it up
    $self->{'file'} = $file;
    $self->{'filename'} = $filename_full_path;
    $self->{'processor'} = $processor;
    $self->{'metadata'} = $metadata;

    # Declared outside the eval so the clean-up below can run no matter
    # where inside the eval a failure happened.
    my ($cwd, $tmpdir);

    eval {
        my ($file_only) = $file =~ /([^\\\/]*)$/;
        $tmpdir = util::get_tmp_filename();
        util::mk_all_dir($tmpdir);

        $self->open_document();

        # save current working directory
        $cwd = getcwd();
        chdir($tmpdir) || die "Unable to change to $tmpdir";
        util::cp($filename_full_path, $tmpdir);

        $self->unzip("\"$file_only\"");
        foreach my $xmlFile (@OpenDocumentPlugin::filesProcess) {
            if (-e $xmlFile) {
                $self->{'parser'}->parsefile($xmlFile);
            }
        }
        $self->close_document($filename_full_path, $file_only);
    };
    # Capture immediately: the clean-up calls below could reset $@.
    my $error = $@;

    # Always return to the original directory and remove the scratch
    # directory. Doing this only on success would leave the process inside
    # the temporary directory (and leak it) whenever parsing died.
    if (defined $cwd && !chdir($cwd)) {
        $error ||= "Unable to change back to $cwd";
    }
    util::rm_r($tmpdir) if defined $tmpdir && -e $tmpdir;

    if ($error) {

        # parsefile may either croak somewhere in XML::Parser (e.g. because
        # the document is not well formed) or die somewhere in ReadXMLFile or a
        # derived plugin (e.g. because we're attempting to process a
        # document whose DOCTYPE is not meant for this plugin). For the
        # first case we'll print a warning and continue, for the second
        # we'll just continue quietly

        print STDERR "**** Error is: $error\n";

        my ($msg) = $error =~ /Carp::croak\(\'(.*?)\'\)/;
        if (defined $msg) {
            my $plugin_name = ref ($self);
            print $outhandle "$plugin_name failed to process $file ($msg)\n";
        }

        print STDERR "<ProcessingError n='$file'>\n" if ($gli);
        return -1; # error during processing
    }

    return 1;
}
|
---|
| 230 |
|
---|
# Unpack an archive into the current directory with the external "unzip"
# program, then delete the archive file itself.
#
#   $file - archive name, relative to the current directory. The caller in
#           this file passes it wrapped in literal double quotes (it used to
#           be pasted into a shell command line); one such surrounding pair
#           is stripped here, and unquoted names work as well.
#
# A non-zero status from unzip is reported on STDERR; processing continues
# either way.
sub unzip {
    my $self = shift (@_);
    my ($file) = @_;

    my $archive = $file;
    $archive =~ s/^"(.*)"$/$1/s;

    # List form: the name reaches unzip as one argument without passing
    # through a shell, so quotes, spaces, "$" or backticks in a file name
    # cannot be interpreted as shell syntax.
    my $status = system("unzip", $archive);
    if ($status != 0) {
        print STDERR "Warning: unzip of $archive returned status $status\n";
    }

    # Test the real name. With the quote characters still attached, this
    # check could never succeed and the copy was never removed.
    util::rm($archive) if -e $archive;
}
|
---|
| 238 |
|
---|
# Finish the current document and hand it to the processor.
#
#   $filename  - full path of the original archive
#   $file_only - archive name without any directory part
#
# Associates the archive and its thumbnail with the document, sets the
# FileFormat, srclink/srcicon//srclink, Source, FileSize and Plugin metadata,
# merges in metadata passed down from earlier plugins, supplies a fallback
# title and an OID, calls the processor, and increments 'num_processed'.
# Reads "mimetype" and "Thumbnails/thumbnail.png" relative to the current
# directory. Returns 1.
#
# Declared without a prototype: it is only ever invoked as a method, where
# prototypes are ignored, and an empty "()" wrongly suggests it takes no
# arguments.
sub close_document {
    my $self = shift(@_);
    my ($filename, $file_only) = @_;

    my $doc_obj = $self->{'doc_obj'};

    my $mimetype = $self->get_mimetype();

    $doc_obj->associate_file($filename, $file_only, $mimetype, "");
    $doc_obj->associate_file("Thumbnails/thumbnail.png", "thumbnail.png", "image/png", "");

    # We use set instead of add here because we only want one value
    $doc_obj->set_utf8_metadata_element("", "FileFormat", "Open Document");

    # Link to the associated source file, shown as the thumbnail image.
    my $doclink = "<a href=\"_httpprefix_/collect/[collection]/index/assoc/[archivedir]/$file_only\">";
    $doc_obj->add_utf8_metadata ("", "srclink", $doclink);
    $doc_obj->add_utf8_metadata ("", "srcicon", "<img border=\"0\" align=\"absmiddle\" src=\"_httpprefix_/collect/[collection]/index/assoc/[archivedir]/thumbnail.png\" alt=\"View the Open document\" title=\"View the Open document\">");
    $doc_obj->add_utf8_metadata ("", "/srclink", "</a>");
    $self->set_Source_metadata($doc_obj, $file_only);
    $doc_obj->set_utf8_metadata_element("", "FileSize", (-s $filename));

    # include any metadata passed in from previous plugins
    # note that this metadata is associated with the top level section
    $self->extra_metadata ($doc_obj,
                           $doc_obj->get_top_section(),
                           $self->{'metadata'});

    # add a Title if none has been found yet
    $self->title_fallback($doc_obj, "", $file_only);

    # add an OID
    $self->add_OID($doc_obj);

    $doc_obj->add_utf8_metadata("", "Plugin", "$self->{'plugin_type'}");

    # process the document
    $self->{'processor'}->process($doc_obj);

    $self->{'num_processed'} ++;
    return 1;
}
|
---|
| 283 |
|
---|
# Read the archive's declared MIME type from the file "mimetype" in the
# current directory.
#
# Returns the complete contents of that file, unmodified. If the file cannot
# be opened, prints a warning to STDERR and returns the string
# "Unknown OpenDocument Format". Arguments (such as the invocant when called
# as a method) are ignored.
sub get_mimetype {
    my $filename = "mimetype";

    # Three-argument open on a lexical handle: the name cannot be read as an
    # open mode, and no package-global handle is left open after the call.
    my $fh;
    if (!open($fh, '<', $filename)) {
        print STDERR "Warning: unable to open the $filename\n";
        return "Unknown OpenDocument Format";
    }

    my $text = join('', <$fh>);
    close($fh);

    return $text;
}
|
---|
| 298 | 1;
|
---|
| 299 |
|
---|
| 300 |
|
---|
| 301 |
|
---|
| 302 |
|
---|