source: main/trunk/greenstone2/perllib/plugins/ZIPPlugin.pm@ 23484

Last change on this file since 23484 was 23280, checked in by kjdon, 13 years ago

fixed this plugin up for incremental import. need to set gsdlzipfilename for all files unpacked into tmp dir. Also, if incremental, need to add all the files unpacked to the new files list so they get processed. If the zip file hasn't changed, then won't get into this read method.

  • Property svn:keywords set to Author Date Id Revision
File size: 6.2 KB
Line 
1###########################################################################
2#
3# ZIPPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# plugin which handles compressed and/or archived input formats
27#
28# currently handled formats and file extensions are:
29#
30# gzip (.gz, .z, .tgz, .taz)
31# bzip (.bz)
32# bzip2 (.bz2)
33# zip (.zip .jar)
34# tar (.tar)
35#
36# this plugin relies on the following utilities being present
37# (if trying to process the corresponding formats)
38#
39# gunzip (for gzip)
40# bunzip (for bzip)
41# bunzip2 (for bzip2)
42# unzip (for zip)
43# tar (for tar)
44
45
46package ZIPPlugin;
47
48use BasePlugin;
49use plugin;
50use util;
51use Cwd;
52
53use strict;
54no strict 'refs'; # allow filehandles to be variables and viceversa
55
# Establish the inheritance chain at compile time: ZIPPlugin is a BasePlugin.
BEGIN {
    @ZIPPlugin::ISA = qw(BasePlugin);
}
59
# Descriptor for the single argument this plugin declares itself:
# process_exp, whose default is the pattern returned by
# get_default_process_exp().
my $process_exp_arg = {
    'name' => "process_exp",
    'desc' => "{BasePlugin.process_exp}",
    'type' => "string",
    'deft' => get_default_process_exp(),
    'reqd' => "no",
};

# Argument list pushed onto the shared "ArgList" by new().
my $arguments = [ $process_exp_arg ];

# Plugin-level option record pushed onto the shared "OptList" by new().
my $options = {
    'name'     => "ZIPPlugin",
    'desc'     => "{ZIPPlugin.desc}",
    'abstract' => "no",
    'inherits' => "yes",
    'args'     => $arguments,
};
72
# Constructor.
#
# Parameters:
#   $class           - class name to bless the new object into
#   $pluginlist      - array ref; this class name is appended to it
#   $inputargs       - passed through untouched to the BasePlugin constructor
#   $hashArgOptLists - hash ref; this plugin's argument descriptors are
#                      appended to its "ArgList" entry and its option record
#                      to its "OptList" entry
#
# Returns the object built by BasePlugin's constructor, re-blessed into $class.
sub new {
    my ($class, $pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Use an explicit class-method call. The indirect-object form
    # ("new BasePlugin(...)") is ambiguous inside a package that defines its
    # own new(), because the parser may treat it as a plain function call.
    my $self = BasePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
86
# Called with the plugin info, base directory, document processor and
# document limit. Its only effect is to decide whether this run is an
# incremental import on top of an existing archives database and, if so, to
# set $self->{'actually_incremental'} = 1.
sub begin {
    my ($self, $pluginfo, $base_dir, $processor, $maxdocs) = @_;

    # A processor whose class name ends in "buildproc" means we are building,
    # not importing.
    my $is_build_phase = (ref($processor) =~ /buildproc$/);

    if (!$is_build_phase && $self->{'incremental'} == 1) {
        # The infodbtype for this collection comes from the processor's
        # output info (arcinfo) object.
        my $db_type = $processor->getoutputinfo()->{'infodbtype'};
        if ($db_type eq "gdbm-txtgz") {
            # in archives, cannot use txtgz version
            $db_type = "gdbm";
        }

        my $archives_dir = $processor->getoutputdir();
        my $archive_db   = &dbutil::get_infodb_file_path($db_type, "archiveinf-doc", $archives_dir);

        # Only treat the run as incremental if the archives database exists.
        if (-e $archive_db) {
            $self->{'actually_incremental'} = 1;
        }
    }
}
107# this is a recursive plugin
# Always returns 1: this plugin is marked as recursive.
sub is_recursive {
    my ($self) = @_;
    return 1;
}
113
# Returns the default process_exp: a case-insensitive pattern (as a string)
# matching file names that end in one of the handled archive/compression
# extensions.
sub get_default_process_exp {
    my @extensions = qw(gz tgz z taz bz bz2 zip jar tar);
    return '(?i)\.(' . join('|', @extensions) . ')$';
}
117
118# return number of files processed, undef if can't process
119# Note that $base_dir might be "" and that $file might
120# include directories
# Unpack one archive/compressed file into a scratch directory under
# $ENV{'GSDLCOLLECTDIR'}/tmp and feed the unpacked contents back through the
# plugin pipeline.
#
# Parameters:
#   $pluginfo, $block_hash, $processor, $maxdocs, $total_count, $gli
#                - passed straight through to plugin::file_block_read /
#                  plugin::read
#   $base_dir    - may be ""
#   $file        - file to process; may include directories
#   $metadata    - inherited metadata; merged with a "gsdlzipfilename" entry
#                  that is set to the archive's full path
#
# Returns whatever plugin::read returns for the scratch directory, or undef
# when can_process_this_file rejects the file.
# Dies if the working directory cannot be changed to the scratch directory
# or back again.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file($filename_full_path);

    # Derive the scratch directory name from $file.
    # NOTE(review): the substitution removes the first ".xxx" segment it
    # finds, which is not necessarily the final extension (e.g. "a.tar.gz"
    # becomes "a.gz") - confirm this is intended.
    my $tmpdir = $file;
    $tmpdir =~ s/\.[^\.]*//;
    $tmpdir = &util::rename_file($tmpdir, $self->{'file_rename_method'});
    $tmpdir = &util::filename_cat($ENV{'GSDLCOLLECTDIR'}, "tmp", $tmpdir);
    &util::mk_all_dir ($tmpdir);

    print $outhandle "ZIPPlugin: extracting $filename_no_path to $tmpdir\n"
        if $self->{'verbosity'} > 1;

    # Copy the archive into the scratch directory BEFORE changing directory:
    # $filename_full_path can be relative (when $base_dir is ""), and a
    # relative path would no longer resolve once the working directory moves.
    &util::cp ($filename_full_path, $tmpdir);

    # save current working directory
    my $cwd = cwd();
    chdir ($tmpdir) || die "Unable to change to $tmpdir";

    # Pick the extraction helper from the file extension; anything not
    # matched explicitly (gz, tgz, z, taz) goes to gunzip.
    if ($file =~ /\.bz$/i) {
        $self->bunzip ($filename_no_path);
    } elsif ($file =~ /\.bz2$/i) {
        $self->bunzip2 ($filename_no_path);
    } elsif ($file =~ /\.(zip|jar)$/i) {
        $self->unzip ($filename_no_path);
    } elsif ($file =~ /\.tar$/i) {
        $self->untar ($filename_no_path);
    } else {
        $self->gunzip ($filename_no_path);
    }

    chdir ($cwd) || die "Unable to change back to $cwd";

    # do the blocking step inside the folder
    &plugin::file_block_read ($pluginfo, "", $tmpdir,
                              $block_hash, $metadata, $gli);

    # if we are incremental, then we need to add all the files in the tmp
    # folder into the new_files list otherwise they won't get processed.
    if ($self->{'actually_incremental'}) {
        my @file_list = ();
        &inexport::add_dir_contents_to_list($tmpdir, \@file_list);
        foreach my $unpacked_file (@file_list) {
            $block_hash->{'new_files'}->{$unpacked_file} = 1;
        }
    }

    # all files in the tmp folder need to get the gsdlzipfilename metadata
    my $this_metadata = {};
    $this_metadata->{"gsdlzipfilename"} = $filename_full_path;
    &metadatautil::combine_metadata_structures($this_metadata, $metadata);

    my $numdocs = &plugin::read ($pluginfo, "", $tmpdir, $block_hash, $this_metadata, $processor, $maxdocs, $total_count, $gli);

    # The scratch directory is no longer needed once its contents are read.
    &util::rm_r ($tmpdir);

    $self->{'num_archives'} ++;

    return $numdocs;
}
182
# Run the external "bunzip" utility on $file (resolved against the current
# working directory). If the command does not exit with status 0, $file is
# removed with util::rm.
sub bunzip {
    my $self = shift (@_);
    my ($file) = @_;

    # List-form system() bypasses the shell, so file names containing
    # quotes, "$", backticks or other shell metacharacters are passed to the
    # utility verbatim instead of being interpreted.
    if (system ("bunzip", $file) != 0) {
        &util::rm ($file);
    }
}
192
# Run the external "bunzip2" utility on $file (resolved against the current
# working directory). If the command does not exit with status 0, $file is
# removed with util::rm.
sub bunzip2 {
    my $self = shift (@_);
    my ($file) = @_;

    # List-form system() bypasses the shell, so file names containing
    # quotes, "$", backticks or other shell metacharacters are passed to the
    # utility verbatim instead of being interpreted.
    if (system ("bunzip2", $file) != 0) {
        &util::rm ($file);
    }
}
202
# Run the external "unzip" utility on $file (resolved against the current
# working directory), then remove $file with util::rm if it still exists.
# The exit status of unzip is not checked.
sub unzip {
    my $self = shift (@_);
    my ($file) = @_;

    # List-form system() bypasses the shell, so file names containing
    # quotes, "$", backticks or other shell metacharacters are passed to the
    # utility verbatim instead of being interpreted.
    system ("unzip", $file);
    &util::rm ($file) if -e $file;
}
210
# Run "tar xf" on $file (resolved against the current working directory),
# then remove $file with util::rm if it still exists. The exit status of tar
# is not checked.
sub untar {
    my $self = shift (@_);
    my ($file) = @_;

    # List-form system() bypasses the shell, so file names containing
    # quotes, "$", backticks or other shell metacharacters are passed to the
    # utility verbatim instead of being interpreted.
    system ("tar", "xf", $file);
    &util::rm ($file) if -e $file;
}
218
# Run the external "gunzip" utility on $file (resolved against the current
# working directory). If the command does not exit with status 0, $file is
# removed with util::rm.
sub gunzip {
    my $self = shift (@_);
    my ($file) = @_;

    # List-form system() bypasses the shell, so file names containing
    # quotes, "$", backticks or other shell metacharacters are passed to the
    # utility verbatim instead of being interpreted.
    if (system ("gunzip", $file) != 0) {
        &util::rm ($file);
    }
}
228
229
230
2311;
Note: See TracBrowser for help on using the repository browser.