source: main/trunk/greenstone2/perllib/plugins/ZIPPlugin.pm@ 31492

Last change on this file since 31492 was 31492, checked in by kjdon, 7 years ago

renamed EncodingUtil to CommonUtil, BasePlugin to BaseImporter. The idea is that only top level plugins that you can specify in your collection get to have plugin in their name. Modified all other plugins to reflect these name changes

  • Property svn:keywords set to Author Date Id Revision
File size: 6.4 KB
Line 
1###########################################################################
2#
3# ZIPPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# plugin which handles compressed and/or archived input formats
27#
28# currently handled formats and file extensions are:
29#
30# gzip (.gz, .z, .tgz, .taz)
31# bzip (.bz)
32# bzip2 (.bz2)
33# zip (.zip .jar)
34# tar (.tar)
35#
36# this plugin relies on the following utilities being present
37# (if trying to process the corresponding formats)
38#
39# gunzip (for gzip)
40# bunzip (for bzip)
41# bunzip2 (for bzip2)
42# unzip (for zip)
43# tar (for tar)
44
45
46package ZIPPlugin;
47
48use BaseImporter;
49use plugin;
50use util;
51use FileUtils;
52use Cwd;
53
54use strict;
55no strict 'refs'; # allow filehandles to be variables and viceversa
56
# Declare BaseImporter as the parent class at compile time.
BEGIN {
    @ZIPPlugin::ISA = qw(BaseImporter);
}

# Argument descriptions contributed by this plugin.  Only process_exp is
# listed; its default comes from get_default_process_exp() below.
# NOTE(review): the "{...}" desc strings look like message-catalogue keys
# resolved elsewhere -- confirm.
my $arguments = [
    {
        'name' => "process_exp",
        'desc' => "{BaseImporter.process_exp}",
        'type' => "string",
        'deft' => &get_default_process_exp(),
        'reqd' => "no",
    },
];

# Option block describing the plugin itself; new() pushes it onto the
# shared "OptList".
my $options = {
    'name'     => "ZIPPlugin",
    'desc'     => "{ZIPPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
73
# Constructor.
#
#   $pluginlist       array ref; this class name is appended to it
#   $inputargs        handed to the BaseImporter constructor unchanged
#   $hashArgOptLists  hash ref; this plugin's argument descriptions are
#                     appended to its "ArgList" array and its option block
#                     to its "OptList" array
#
# Returns the object built by BaseImporter, blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;
    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Explicit class-method call rather than the indirect-object form
    # "new BaseImporter(...)": inside a package that defines its own new()
    # the indirect form is open to being parsed as a plain function call.
    my $self = BaseImporter->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
87
# Called before processing starts.  Works out whether this run is a real
# incremental import and, if so, records it in
# $self->{'actually_incremental'} for read() to consult.
#
#   $pluginfo, $base_dir, $maxdocs  accepted but not used here
#   $processor                      document processor; its class name and
#                                   its getoutputinfo()/getoutputdir()
#                                   methods are consulted
sub begin {
    my ($self, $pluginfo, $base_dir, $processor, $maxdocs) = @_;

    # A processor whose class name ends in "buildproc" means we are not
    # importing, in which case nothing is recorded.
    my $processor_class = ref $processor;
    if ($processor_class !~ /buildproc$/ && $self->{'incremental'} == 1) {
        my $output_info = $processor->getoutputinfo();

        # in archives, cannot use txtgz version
        my $db_type = $output_info->{'infodbtype'};
        if ($db_type eq "gdbm-txtgz") {
            $db_type = "gdbm";
        }

        my $archives_dir = $processor->getoutputdir();
        my $doc_db_path  = &dbutil::get_infodb_file_path($db_type, "archiveinf-doc", $archives_dir);

        # Only count as incremental when the archive info database from
        # an earlier import is already on disk.
        if (-e $doc_db_path) {
            $self->{'actually_incremental'} = 1;
        }
    }
}
# This is a recursive plugin: always answers 1.
sub is_recursive {
    my ($self) = @_;
    return 1;
}
114
# Default process_exp: a case-insensitive pattern matching file names that
# end in one of the compressed/archive extensions listed in it.
sub get_default_process_exp {
    my $pattern = '(?i)\.(gz|tgz|z|taz|bz|bz2|zip|jar|tar)$';
    return $pattern;
}
118
# Unpack one compressed/archived file into a scratch directory under
# $ENV{'GSDLCOLLECTDIR'}/tmp, run the plugin pipeline over the extracted
# contents, then remove the scratch directory.
#
# Arguments:
#   $pluginfo, $block_hash, $metadata, $processor, $maxdocs, $total_count,
#   $gli   - handed on to plugin::file_block_read / plugin::read
#   $base_dir, $file - location of the archive.  Note that $base_dir might
#            be "" and that $file might include directories.
#
# Returns the value of plugin::read for the scratch directory (number of
# files processed), or undef if this plugin can't process the file.
# Dies if the working directory cannot be changed.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file($filename_full_path);

    # Scratch directory name: $file with its first ".xxx" component removed,
    # passed through the configured rename method, under <collectdir>/tmp.
    my $tmpdir = $file;
    $tmpdir =~ s/\.[^\.]*//;
    $tmpdir = &util::rename_file($tmpdir, $self->{'file_rename_method'});
    $tmpdir = &FileUtils::filenameConcatenate($ENV{'GSDLCOLLECTDIR'}, "tmp", $tmpdir);
    &FileUtils::makeAllDirectories ($tmpdir);

    print $outhandle "ZIPPlugin: extracting $filename_no_path to $tmpdir\n"
        if $self->{'verbosity'} > 1;

    # Copy the archive in BEFORE changing directory: if the archive path is
    # relative, it is only valid from the current working directory.
    &FileUtils::copyFiles ($filename_full_path, $tmpdir);

    # save current working directory; the extraction helpers operate on
    # the bare file name, so they must run from inside the scratch directory
    my $cwd = cwd();
    chdir ($tmpdir) || die "Unable to change to $tmpdir";

    if ($file =~ /\.bz$/i) {
        $self->bunzip ($filename_no_path);
    } elsif ($file =~ /\.bz2$/i) {
        $self->bunzip2 ($filename_no_path);
    } elsif ($file =~ /\.(zip|jar|epub)$/i) {
        $self->unzip ($filename_no_path);
    } elsif ($file =~ /\.tar$/i) {
        $self->untar ($filename_no_path);
    } else {
        # everything else accepted by process_exp (.gz, .tgz, .z, .taz)
        $self->gunzip ($filename_no_path);
    }

    chdir ($cwd) || die "Unable to change back to $cwd";

    # do the blocking step inside the folder
    &plugin::file_block_read ($pluginfo, "", $tmpdir,
                              $block_hash, $metadata, $gli);

    # if we are incremental, then we need to add all the files in the tmp
    # folder into the new_files list otherwise they won't get processed.
    if ($self->{'actually_incremental'}) {
        my @file_list = ();
        &inexport::add_dir_contents_to_list($tmpdir, \@file_list);
        foreach my $extracted_file (@file_list) {
            $block_hash->{'new_files'}->{$extracted_file} = 1;
        }
    }

    # all files in the tmp folder need to get the gsdlzipfilename metadata
    my $this_metadata = {};
    $this_metadata->{"gsdlzipfilename"} = $filename_full_path;
    &metadatautil::combine_metadata_structures($this_metadata, $metadata);
    my $numdocs = &plugin::read ($pluginfo, "", $tmpdir, $block_hash, $this_metadata, $processor, $maxdocs, $total_count, $gli);
    &FileUtils::removeFilesRecursive ($tmpdir);

    $self->{'num_archives'} ++;

    return $numdocs;
}
183
# Decompress a bzip file in the current working directory with the
# external "bunzip" utility.
#
#   $file  name of the compressed file
#
# If the utility cannot be run or exits non-zero, the file is removed.
# NOTE(review): presumably so the unreadable archive is not picked up
# again when the directory is imported -- confirm.
sub bunzip {
    my $self = shift (@_);
    my ($file) = @_;

    # List form: the name is passed as a single argument with no shell in
    # between, so quotes, spaces, '$' or ';' in it cannot alter the command.
    if (system ("bunzip", $file) != 0) {
        &FileUtils::removeFiles ($file);
    }
}
193
# Decompress a bzip2 file in the current working directory with the
# external "bunzip2" utility.
#
#   $file  name of the compressed file
#
# If the utility cannot be run or exits non-zero, the file is removed.
# NOTE(review): presumably so the unreadable archive is not picked up
# again when the directory is imported -- confirm.
sub bunzip2 {
    my $self = shift (@_);
    my ($file) = @_;

    # List form: the name is passed as a single argument with no shell in
    # between, so quotes, spaces, '$' or ';' in it cannot alter the command.
    if (system ("bunzip2", $file) != 0) {
        &FileUtils::removeFiles ($file);
    }
}
203
# Unpack a zip-format archive in the current working directory with the
# external "unzip" utility.
#
#   $file  name of the archive
#
# The exit status of unzip is not checked; the archive itself is removed
# afterwards whenever it still exists, leaving only what was extracted.
sub unzip {
    my $self = shift (@_);
    my ($file) = @_;

    # List form: the name is passed as a single argument with no shell in
    # between, so quotes, spaces, '$' or ';' in it cannot alter the command.
    system ("unzip", $file);
    &FileUtils::removeFiles ($file) if -e $file;
}
211
# Unpack a tar archive in the current working directory with the external
# "tar" utility ("tar xf").
#
#   $file  name of the archive
#
# The exit status of tar is not checked; the archive itself is removed
# afterwards whenever it still exists, leaving only what was extracted.
sub untar {
    my $self = shift (@_);
    my ($file) = @_;

    # List form: the name is passed as a single argument with no shell in
    # between, so quotes, spaces, '$' or ';' in it cannot alter the command.
    system ("tar", "xf", $file);
    &FileUtils::removeFiles ($file) if -e $file;
}
219
# Decompress a gzip file in the current working directory with the
# external "gunzip" utility.  read() uses this for every accepted file
# that is not routed to one of the other extraction helpers.
#
#   $file  name of the compressed file
#
# If the utility cannot be run or exits non-zero, the file is removed.
# NOTE(review): presumably so the unreadable archive is not picked up
# again when the directory is imported -- confirm.
sub gunzip {
    my $self = shift (@_);
    my ($file) = @_;

    # List form: the name is passed as a single argument with no shell in
    # between, so quotes, spaces, '$' or ';' in it cannot alter the command.
    if (system ("gunzip", $file) != 0) {
        &FileUtils::removeFiles ($file);
    }
}
229
230
231
2321;
Note: See TracBrowser for help on using the repository browser.