source: main/trunk/greenstone2/perllib/plugins/ZIPPlugin.pm@ 28836

Last change on this file since 28836 was 27927, checked in by ak19, 11 years ago

Correcting error introduced in earlier commit.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.4 KB
Line 
1###########################################################################
2#
3# ZIPPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# plugin which handles compressed and/or archived input formats
27#
28# currently handled formats and file extensions are:
29#
30# gzip (.gz, .z, .tgz, .taz)
31# bzip (.bz)
32# bzip2 (.bz2)
33# zip (.zip .jar)
34# tar (.tar)
35#
36# this plugin relies on the following utilities being present
37# (if trying to process the corresponding formats)
38#
39# gunzip (for gzip)
40# bunzip (for bzip)
41# bunzip2 (for bzip2)
42# unzip (for zip)
43# tar (for tar)
44
45
46package ZIPPlugin;
47
48use BasePlugin;
49use plugin;
50use util;
51use FileUtils;
52use Cwd;
53
54use strict;
55no strict 'refs'; # allow filehandles to be variables and viceversa
56
# ZIPPlugin inherits from BasePlugin.  The assignment sits in a BEGIN
# block, so it is executed at compile time.
BEGIN {
    @ZIPPlugin::ISA = ('BasePlugin');
}

# Argument descriptions specific to this plugin.  Only process_exp is
# listed; its default is the expression returned by
# get_default_process_exp().
my $arguments = [
    {
        'name' => 'process_exp',
        'desc' => '{BasePlugin.process_exp}',
        'type' => 'string',
        'deft' => &get_default_process_exp(),
        'reqd' => 'no',
    },
];

# Description of the plugin itself; new() appends it to the "OptList"
# entry of the hash it is given.
my $options = {
    'name'     => 'ZIPPlugin',
    'desc'     => '{ZIPPlugin.desc}',
    'abstract' => 'no',
    'inherits' => 'yes',
    'args'     => $arguments,
};
73
# Constructor.
#
# Arguments (after the class name):
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through unchanged to the BasePlugin constructor
#   $hashArgOptLists - hash ref; this plugin's argument and option
#                      descriptions are appended to its "ArgList" and
#                      "OptList" entries
#
# Returns the object built by the BasePlugin constructor, blessed into
# $class.
sub new {
    my ($class) = shift(@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);
    push(@{ $hashArgOptLists->{"ArgList"} }, @{$arguments});
    push(@{ $hashArgOptLists->{"OptList"} }, $options);

    # Explicit class-method call.  The indirect-object form
    # "new BasePlugin(...)" is resolved by parser heuristics and is
    # ambiguous inside a package that defines its own new().
    my $self = BasePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
87
# Decide whether this run is really an incremental import and, if so,
# set $self->{'actually_incremental'}, which read() consults later.
#
# Arguments: $pluginfo, $base_dir, $processor, $maxdocs.  Only $processor
# is used here.
sub begin {
    my ($self, $pluginfo, $base_dir, $processor, $maxdocs) = @_;

    # Are we actually incremental and doing import?  A processor whose
    # class name ends in "buildproc" is not treated as an import.
    my $processor_class = ref $processor;
    my $incremental_import
        = ($processor_class !~ /buildproc$/ && $self->{'incremental'} == 1);

    if ($incremental_import) {
        # Get the infodbtype value for this collection from the arcinfo object.
        my $infodbtype = $processor->getoutputinfo()->{'infodbtype'};
        if ($infodbtype eq "gdbm-txtgz") {
            # In archives, the txtgz version cannot be used.
            $infodbtype = "gdbm";
        }

        my $output_dir   = $processor->getoutputdir();
        my $archives_inf = &dbutil::get_infodb_file_path($infodbtype, "archiveinf-doc", $output_dir);

        # The flag is only raised when the archive info database already exists.
        $self->{'actually_incremental'} = 1 if -e $archives_inf;
    }
}
# ZIPPlugin is a recursive plugin, so this always returns 1.
sub is_recursive {
    my ($self) = @_;

    return 1;
}
114
# Returns the default process_exp as a string: a case-insensitive
# regular expression matching names that end in one of the listed
# archive/compression extensions.
sub get_default_process_exp {
    my @extensions = qw(gz tgz z taz bz bz2 zip jar tar);

    return '(?i)\.(' . join('|', @extensions) . ')$';
}
118
# Extract an archive into a temporary folder and feed the extracted
# contents back through the plugin pipeline.
#
# Returns the number of files processed, or undef if this plugin can't
# process the file.  Note that $base_dir might be "" and that $file
# might include directories.
sub read {
    my $self = shift(@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    # Can we process this file?
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    unless ($self->can_process_this_file($filename_full_path)) {
        return undef;
    }

    # Name of the extraction folder: $file with its first ".xxx" run
    # removed, passed through the configured file_rename_method, and
    # placed under <GSDLCOLLECTDIR>/tmp.
    (my $tmpdir = $file) =~ s/\.[^\.]*//;
    $tmpdir = &util::rename_file($tmpdir, $self->{'file_rename_method'});
    $tmpdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp", $tmpdir);
    &FileUtils::makeAllDirectories($tmpdir);

    if ($self->{'verbosity'} > 1) {
        print $outhandle "ZIPPlugin: extracting $filename_no_path to $tmpdir\n";
    }

    # Remember the current working directory, then work from inside the
    # extraction folder: the extraction helpers are given the bare file name.
    my $original_dir = cwd();
    chdir($tmpdir) or die "Unable to change to $tmpdir";
    &FileUtils::copyFiles($filename_full_path, $tmpdir);

    # Choose the extraction method from the extension; anything not
    # listed explicitly is handed to gunzip.
    my $extractor = ($file =~ /\.bz$/i)             ? 'bunzip'
                  : ($file =~ /\.bz2$/i)            ? 'bunzip2'
                  : ($file =~ /\.(zip|jar|epub)$/i) ? 'unzip'
                  : ($file =~ /\.tar$/i)            ? 'untar'
                  :                                   'gunzip';
    $self->$extractor($filename_no_path);

    chdir($original_dir) or die "Unable to change back to $original_dir";

    # Do the blocking step inside the folder.
    &plugin::file_block_read($pluginfo, "", $tmpdir, $block_hash, $metadata, $gli);

    # If we are incremental, all the files in the tmp folder must be added
    # to the new_files list, otherwise they won't get processed.
    if ($self->{'actually_incremental'}) {
        my @extracted_files = ();
        &inexport::add_dir_contents_to_list($tmpdir, \@extracted_files);
        foreach my $extracted_file (@extracted_files) {
            $block_hash->{'new_files'}->{$extracted_file} = 1;
        }
    }

    # All files in the tmp folder need to get the gsdlzipfilename metadata.
    my $archive_metadata = { "gsdlzipfilename" => $filename_full_path };
    &metadatautil::combine_metadata_structures($archive_metadata, $metadata);

    my $docs_processed = &plugin::read($pluginfo, "", $tmpdir, $block_hash, $archive_metadata, $processor, $maxdocs, $total_count, $gli);
    &FileUtils::removeFilesRecursive($tmpdir);

    $self->{'num_archives'}++;

    return $docs_processed;
}
183
# Decompress a bzip file with the external "bunzip" utility.
#
#   $file - name of the file to decompress; read() passes the bare file
#           name after changing into the extraction folder
#
# If the utility cannot be run or exits with a non-zero status, the file
# is removed via FileUtils::removeFiles.
sub bunzip {
    my $self = shift(@_);
    my ($file) = @_;

    # List form of system(): the name is handed over as one argument
    # instead of being spliced into a shell command line, so quotes, "$"
    # or backticks in the name cannot break or alter the command.
    if (system("bunzip", $file) != 0) {
        &FileUtils::removeFiles($file);
    }
}
193
# Decompress a bzip2 file with the external "bunzip2" utility.
#
#   $file - name of the file to decompress; read() passes the bare file
#           name after changing into the extraction folder
#
# If the utility cannot be run or exits with a non-zero status, the file
# is removed via FileUtils::removeFiles.
sub bunzip2 {
    my $self = shift(@_);
    my ($file) = @_;

    # List form of system(): the name is handed over as one argument
    # instead of being spliced into a shell command line, so quotes, "$"
    # or backticks in the name cannot break or alter the command.
    if (system("bunzip2", $file) != 0) {
        &FileUtils::removeFiles($file);
    }
}
203
# Unpack a zip-format archive with the external "unzip" utility.
#
#   $file - name of the archive; read() passes the bare file name after
#           changing into the extraction folder
#
# The exit status of the utility is not checked.  Afterwards the archive
# itself is removed if it still exists, leaving only whatever was extracted.
sub unzip {
    my $self = shift(@_);
    my ($file) = @_;

    # List form of system(): the name is handed over as one argument
    # instead of being spliced into a shell command line, so quotes, "$"
    # or backticks in the name cannot break or alter the command.
    system("unzip", $file);
    &FileUtils::removeFiles($file) if -e $file;
}
211
# Unpack a tar archive with the external "tar" utility ("tar xf").
#
#   $file - name of the archive; read() passes the bare file name after
#           changing into the extraction folder
#
# The exit status of the utility is not checked.  Afterwards the archive
# itself is removed if it still exists, leaving only whatever was extracted.
sub untar {
    my $self = shift(@_);
    my ($file) = @_;

    # List form of system(): the name is handed over as one argument
    # instead of being spliced into a shell command line, so quotes, "$"
    # or backticks in the name cannot break or alter the command.
    system("tar", "xf", $file);
    &FileUtils::removeFiles($file) if -e $file;
}
219
# Decompress a gzip file with the external "gunzip" utility.  read()
# also falls back to this for any extension it does not recognise.
#
#   $file - name of the file to decompress; read() passes the bare file
#           name after changing into the extraction folder
#
# If the utility cannot be run or exits with a non-zero status, the file
# is removed via FileUtils::removeFiles.
sub gunzip {
    my $self = shift(@_);
    my ($file) = @_;

    # List form of system(): the name is handed over as one argument
    # instead of being spliced into a shell command line, so quotes, "$"
    # or backticks in the name cannot break or alter the command.
    if (system("gunzip", $file) != 0) {
        &FileUtils::removeFiles($file);
    }
}
229
230
231
2321;
Note: See TracBrowser for help on using the repository browser.