root/main/trunk/greenstone2/perllib/plugins/ZIPPlugin.pm @ 23261

Revision 23261, 4.8 KB (checked in by kjdon, 9 years ago)

ZIPPlugin needs to do a block pass on the extracted folder so we don't eg get all html images processed as individual files

  • Property svn:keywords set to Author Date Id Revision
Line 
1###########################################################################
2#
3# ZIPPlugin.pm --
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
26# plugin which handles compressed and/or archived input formats
27#
28# currently handled formats and file extensions are:
29#
30# gzip (.gz, .z, .tgz, .taz)
31# bzip (.bz)
32# bzip2 (.bz2)
33# zip (.zip .jar)
34# tar (.tar)
35#
36# this plugin relies on the following utilities being present
37# (if trying to process the corresponding formats)
38#
39# gunzip (for gzip)
40# bunzip (for bzip)
41# bunzip2 (for bzip2)
42# unzip (for zip)
43# tar (for tar)
44
45
46package ZIPPlugin;
47
48use BasePlugin;
49use plugin;
50use util;
51use Cwd;
52
53use strict;
54no strict 'refs'; # allow filehandles to be variables and viceversa
55
# Establish the inheritance chain at compile time: ZIPPlugin is a BasePlugin.
BEGIN {
    @ZIPPlugin::ISA = qw(BasePlugin);
}
59
# Argument descriptors contributed by this plugin.  The only one is
# process_exp, whose default comes from get_default_process_exp().
# NOTE(review): the "{...}" description strings look like keys into a
# message catalogue -- confirm against BasePlugin.
my $arguments = [
    {
        name => 'process_exp',
        desc => '{BasePlugin.process_exp}',
        type => 'string',
        deft => get_default_process_exp(),
        reqd => 'no',
    },
];

# Option record describing the plugin itself; it carries the argument
# list above under the 'args' key.
my $options = {
    name     => 'ZIPPlugin',
    desc     => '{ZIPPlugin.desc}',
    abstract => 'no',
    inherits => 'yes',
    args     => $arguments,
};
72
# Constructor.
#
# Arguments:
#   $class            - class name being instantiated
#   $pluginlist       - array ref; this class name is appended to it
#   $inputargs        - passed through unchanged to BasePlugin's constructor
#   $hashArgOptLists  - hash ref; this plugin's argument descriptors are
#                       appended to its "ArgList" entry and its option
#                       record to its "OptList" entry
#
# Returns the object built by BasePlugin, re-blessed into $class.
sub new {
    my ($class) = shift (@_);
    my ($pluginlist, $inputargs, $hashArgOptLists) = @_;

    push(@$pluginlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}}, @{$arguments});
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    # Use an explicit class-method call rather than the indirect-object form
    # "new BasePlugin(...)": this package defines its own sub new, and the
    # indirect form relies on parser heuristics to pick the right meaning.
    my $self = BasePlugin->new($pluginlist, $inputargs, $hashArgOptLists);

    return bless $self, $class;
}
86
# ZIPPlugin is a recursive plugin, so this always answers true (1).
sub is_recursive {
    my ($self) = @_;
    return 1;
}
93
# Default pattern (as a string) for the process_exp argument: a
# case-insensitive match on the archive/compression file extensions
# this plugin handles.
sub get_default_process_exp {
    my $default_exp = '(?i)\.(gz|tgz|z|taz|bz|bz2|zip|jar|tar)$';
    return $default_exp;
}
97
# Extract an archive into a temporary directory and hand that directory
# back to the plugin pipeline.
#
# Arguments (after $self):
#   $pluginfo, $block_hash, $metadata, $processor, $maxdocs, $total_count,
#   $gli  - passed through unchanged to plugin::file_block_read and
#           plugin::read
#   $base_dir, $file - location of the archive; $base_dir might be "" and
#           $file might include directories
#
# Returns the number of files processed (the value of plugin::read on the
# extracted directory), or undef if this plugin can't process the file.
# Dies if the working directory cannot be changed.
sub read {
    my $self = shift (@_);
    my ($pluginfo, $base_dir, $file, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli) = @_;
    my $outhandle = $self->{'outhandle'};

    # can we process this file??
    my ($filename_full_path, $filename_no_path) = &util::get_full_filenames($base_dir, $file);
    return undef unless $self->can_process_this_file($filename_full_path);

    my $tmpdir = &util::get_tmp_filename ();
    &util::mk_all_dir ($tmpdir);

    print $outhandle "ZIPPlugin: extracting $filename_no_path to $tmpdir\n"
        if $self->{'verbosity'} > 1;

    # Copy the archive BEFORE changing directory: if the source path is
    # relative (possible when $base_dir is ""), it would no longer resolve
    # once the working directory has moved into $tmpdir.
    &util::cp ($filename_full_path, $tmpdir);

    # save current working directory; the extraction helpers below operate
    # on a bare filename and therefore must run inside $tmpdir
    my $cwd = cwd();
    chdir ($tmpdir) || die "Unable to change to $tmpdir: $!";

    # pick the extractor from the file extension; anything not matched
    # explicitly falls through to gunzip
    if ($file =~ /\.bz$/i) {
        $self->bunzip ($filename_no_path);
    } elsif ($file =~ /\.bz2$/i) {
        $self->bunzip2 ($filename_no_path);
    } elsif ($file =~ /\.(zip|jar)$/i) {
        $self->unzip ($filename_no_path);
    } elsif ($file =~ /\.tar$/i) {
        $self->untar ($filename_no_path);
    } else {
        $self->gunzip ($filename_no_path);
    }

    chdir ($cwd) || die "Unable to change back to $cwd: $!";

    # do the blocking step inside the folder, so that files belonging to
    # another document (e.g. images of an html page) are not processed as
    # individual files
    &plugin::file_block_read ($pluginfo, "", $tmpdir,
                              $block_hash, $metadata, $gli);
    my $numdocs = &plugin::read ($pluginfo, "", $tmpdir, $block_hash, $metadata, $processor, $maxdocs, $total_count, $gli);
    &util::rm_r ($tmpdir);

    $self->{'num_archives'} ++;

    return $numdocs;
}
146
# Decompress a bzip file in the current directory with the external
# "bunzip" utility.  If the utility reports failure (non-zero status),
# the file is removed.
#
# Arguments:
#   $file - name of the compressed file
sub bunzip {
    my $self = shift (@_);
    my ($file) = @_;

    # List-form system bypasses the shell, so quotes, '$', backticks and
    # other metacharacters in the filename are passed through literally
    # instead of being interpreted.
    if (system ("bunzip", $file) != 0)
    {
        &util::rm ($file);
    }
}
156
# Decompress a bzip2 file in the current directory with the external
# "bunzip2" utility.  If the utility reports failure (non-zero status),
# the file is removed.
#
# Arguments:
#   $file - name of the compressed file
sub bunzip2 {
    my $self = shift (@_);
    my ($file) = @_;

    # List-form system bypasses the shell, so quotes, '$', backticks and
    # other metacharacters in the filename are passed through literally
    # instead of being interpreted.
    if (system ("bunzip2", $file) != 0)
    {
        &util::rm ($file);
    }
}
166
# Unpack a zip/jar archive in the current directory with the external
# "unzip" utility, then remove the archive itself if it is still present.
# The utility's exit status is not examined.
#
# Arguments:
#   $file - name of the archive
sub unzip {
    my $self = shift (@_);
    my ($file) = @_;

    # List-form system bypasses the shell, so quotes, '$', backticks and
    # other metacharacters in the filename are passed through literally
    # instead of being interpreted.
    system ("unzip", $file);
    &util::rm ($file) if -e $file;
}
174
# Unpack a tar archive in the current directory with the external "tar"
# utility, then remove the archive itself if it is still present.
# The utility's exit status is not examined.
#
# Arguments:
#   $file - name of the archive
sub untar {
    my $self = shift (@_);
    my ($file) = @_;

    # List-form system bypasses the shell, so quotes, '$', backticks and
    # other metacharacters in the filename are passed through literally
    # instead of being interpreted.
    system ("tar", "xf", $file);
    &util::rm ($file) if -e $file;
}
182
# Decompress a gzip file in the current directory with the external
# "gunzip" utility.  If the utility reports failure (non-zero status),
# the file is removed.
#
# Arguments:
#   $file - name of the compressed file
sub gunzip {
    my $self = shift (@_);
    my ($file) = @_;

    # List-form system bypasses the shell, so quotes, '$', backticks and
    # other metacharacters in the filename are passed through literally
    # instead of being interpreted.
    if (system ("gunzip", $file) != 0)
    {
        &util::rm ($file);
    }
}
192
1931;
Note: See TracBrowser for help on using the browser.