source: main/trunk/greenstone2/perllib/downloaders/MediaWikiDownload.pm@32779

Last change on this file since 32779 was 32779, checked in by ak19, 5 years ago
  1. Bugfix to wget downloading: the call to quotewords() necessarily has its 2nd parameter, keep, set to false/0 in order to remove quotes from cmd args, but this also has the undesirable side-effect of removing single backslashes (while double backslashes get turned into single backslashes). This caused the download tutorial to fail: on Windows the cache dir, which contained backslashes going in, ended up containing no backslashes at all by the time wget was run. The correct place to fix this is before quotewords() gets called in WgetDownload's 2 useWget() methods. Before quotewords() is called, single backslashes are now protected as double backslashes, so that quotewords() with 2nd param keep=0 gets things back to normal by turning double backslashes into single ones again. 2. Minor change to MediaWikiDownload to reuse a variable.
  • Property svn:executable set to *
File size: 9.4 KB
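
A minimal sketch of the backslash protection the log entry describes (the fix itself lives in WgetDownload.pm, not in this file; the command string below is hypothetical):

    use Text::ParseWords qw(quotewords);

    my $cmd = "wget -P \"C:\\gs\\cache\" http://example.org";
    $cmd =~ s/\\/\\\\/g;   # protect each single backslash as a double backslash
    # keep=0 strips the quotes and turns the doubled backslashes back into single ones
    my @args = quotewords('\s+', 0, $cmd);
    # @args is now ('wget', '-P', 'C:\gs\cache', 'http://example.org')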
###########################################################################
#
# MediaWikiDownload.pm -- downloader for wiki pages
# A component of the Greenstone digital library software
# from the New Zealand Digital Library Project at the
# University of Waikato, New Zealand.
#
# Copyright (C) 1999 New Zealand Digital Library Project
#
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation; either version 2 of the License, or
# (at your option) any later version.
#
# This program is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with this program; if not, write to the Free Software
# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
#
###########################################################################

package MediaWikiDownload;

eval {require bytes};

# suppress the annoying "subroutine redefined" warning that various
# plugins cause under perl 5.6
$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};

use WgetDownload;
use FileUtils;

sub BEGIN {
    @MediaWikiDownload::ISA = ('WgetDownload');
}

use strict;       # every perl program should have this!
no strict 'refs'; # make an exception so we can use variables as filehandles
my $arguments =
    [ { 'name' => "url",
	'disp' => "{WebDownload.url_disp}",
	'desc' => "{WebDownload.url}",
	'type' => "string",
	'reqd' => "yes"},
      { 'name' => "depth",
	'disp' => "{WebDownload.depth_disp}",
	'desc' => "{WebDownload.depth}",
	'type' => "int",
	'deft' => "0",
	'range' => "0,",
	'reqd' => "no"},
      { 'name' => "below",
	'disp' => "{WebDownload.below_disp}",
	'desc' => "{WebDownload.below}",
	'type' => "flag",
	'reqd' => "no"},
      { 'name' => "within",
	'disp' => "{WebDownload.within_disp}",
	'desc' => "{WebDownload.within}",
	'type' => "flag",
	'reqd' => "no"},
      { 'name' => "reject_files",
	'disp' => "{MediaWikiDownload.reject_filetype_disp}",
	'desc' => "{MediaWikiDownload.reject_filetype}",
	'type' => "string",
	'reqd' => "no",
	'deft' => "*action=*,*diff=*,*oldid=*,*printable*,*Recentchangeslinked*,*Userlogin*,*Whatlinkshere*,*redirect*,*Special:*,Talk:*,Image:*,*.ppt,*.pdf,*.zip,*.doc"},
      { 'name' => "exclude_directories",
	'disp' => "{MediaWikiDownload.exclude_directories_disp}",
	'desc' => "{MediaWikiDownload.exclude_directories}",
	'type' => "string",
	'reqd' => "no",
	'deft' => "/wiki/index.php/Special:Recentchangeslinked,/wiki/index.php/Special:Whatlinkshere,/wiki/index.php/Talk:Creating_CD"},
    ];

my $options = { 'name'     => "MediaWikiDownload",
		'desc'     => "{MediaWikiDownload.desc}",
		'abstract' => "no",
		'inherits' => "yes",
		'args'     => $arguments };

# wget option string assembled by download() and reused when fetching
# @import'ed stylesheets in check_imported_css()
my $wget_options = "";

sub new
{
    my ($class) = shift (@_);
    my ($getlist,$inputargs,$hashArgOptLists) = @_;
    push(@$getlist, $class);

    push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});
    push(@{$hashArgOptLists->{"OptList"}},$options);

    my $self = new WgetDownload($getlist,$inputargs,$hashArgOptLists);

    return bless $self, $class;
}


sub download
{
    my ($self) = shift (@_);
    my ($hashGeneralOptions) = @_;

    # Download options
    my $strOptions = $self->generateOptionsString();
    my $strWgetOptions = $self->getWgetOptions();

    # Set up the command for using wget
    $wget_options = "-N -k -x -t 2 -P \"". $hashGeneralOptions->{"cache_dir"}."\" $strWgetOptions $strOptions ";
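    # e.g. with hypothetical values, the assembled command might look like:
    #   -N -k -x -t 2 -P "/home/user/gs/cache" <wget opts> -r -E -X ... -R ... http://wiki.example.org/index.php/Main_Page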
    my $cmdWget = $wget_options . $self->{'url'};

    # Download the web pages
    # print "Start download from $self->{'url'}...\n";
    print STDERR "<<Undefined Maximum>>\n";

    my $strResponse = $self->useWget($cmdWget,1);

    # if ($strResponse ne ""){print "$strResponse\n";}

    print STDERR "Finished download from $self->{'url'}\n";

    # check that the css files for the downloaded HTML pages were downloaded as well
    if ($self->{'url'} =~ /http:\/\/([^\/]*)\//) {
	my $base_url = &FileUtils::filenameConcatenate($hashGeneralOptions->{"cache_dir"}, $1);
	&check_file($base_url, $self);
    }

    print STDERR "<<Finished>>\n";

    return 1;
}

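# Recursively walk the download cache; for every HTML file found,
# check_imported_css() below fetches any @import'ed stylesheets that
# wget's link-following missed.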
sub check_file
{
    my $dir = shift;
    my ($self) = shift;

    local *DIR;

    opendir DIR, $dir or return;

    my @contents = grep { $_ ne '.' and $_ ne '..' } readdir DIR;

    closedir DIR;

    foreach (@contents) {
	# depth-first traversal: recurse into subdirectories first
	&check_file("$dir/$_", $self);

	# read in every HTML page and check that the css files referenced by
	# @import statements already exist
	&check_imported_css($self, "$dir/$_");
    }
}

sub check_imported_css
{
    my ($self) = shift(@_);
    my $downloaded_file = shift(@_);

    $downloaded_file =~ /(.+)(\/|\\)cache(\/|\\)([^\/\\]+)(\/|\\)/;

    # external website url, used to make up the stylesheet urls
    my $base_website_url = $4;

    # the cache download file directory
    my $base_dir;
    $base_dir = "$1$2cache$3$4" if defined $base_website_url;

    if($downloaded_file =~ /(.+)\.(html|htm)$/) {
	my $content = "";
	if(open(INPUT, "<$downloaded_file")){
	    while(my $line = <INPUT>){
		$content .= $line;
	    }
	    close(INPUT);
	}

	my @css_files;
	my @css_files_paths;
	my $css_file_count = 0;
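	# e.g. (hypothetical MediaWiki markup) the pattern below matches
	#   <style type="text/css">/*<![CDATA[*/ @import "/skins/monobook/main.css"; /*]]>*/</style>
	# with $2 capturing the stylesheet path "/skins/monobook/main.css"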
	while($content =~ /<style type="text\/css"(.+)?import "(.+)?"/ig){
	    if (defined $2) {
		$css_files[$css_file_count] = $base_website_url . $2;
		$css_files_paths[$css_file_count] = $base_dir . $2;
		$css_file_count++;
	    }
	}

	for($css_file_count=0; $css_file_count<scalar(@css_files); $css_file_count++) {
	    my $css_file = "http://" . $css_files[$css_file_count];
	    my $css_file_path = $css_files_paths[$css_file_count];

	    # trim any query string ("?...") appended to the end of a stylesheet url
	    $css_file =~ s/\?([^\/\.\s]+)$//isg;
	    $css_file_path =~ s/\?([^\/\.\s]+)$//isg;

	    # do nothing if the css file already exists
	    next if(-e $css_file_path);

	    # otherwise use wget to download the css file
	    my $cmdWget = $wget_options . $css_file;
	    my $strResponse = $self->useWget($cmdWget,1);

	    print STDERR "Downloaded associated StyleSheet : $css_file\n";
	}
    }
}


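# Build the MediaWiki-specific wget options string. With the defaults this
# yields, roughly:
#   -r -E -X /wiki/index.php/Special:Recentchangeslinked,... -R *action=*,*diff=*,... -H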
sub generateOptionsString
{
    my ($self) = @_;
    my $strOptions = "";

    (defined $self) || &error("generateOptionsString","No \$self is defined!!\n");
    (defined $self->{'depth'}) || &error("generateOptionsString","No depth is defined!!\n");

    # -r for recursive downloading
    # -E to append an 'html' suffix to a file of type 'application/xhtml+xml' or 'text/html'
    $strOptions .= "-r -E ";

    # -X to exclude directories
    if(defined $self->{'exclude_directories'} && $self->{'exclude_directories'} ne "") {
	$strOptions .= "-X " . $self->{'exclude_directories'} . " ";
    } else {
	$strOptions .= "-X /wiki/index.php/Special:Recentchangeslinked,/wiki/index.php/Special:Whatlinkshere,/wiki/index.php/Talk:Creating_CD ";
    }

    # -R reject list: reject files with any of these patterns in their names
    if(defined $self->{'reject_files'} && $self->{'reject_files'} ne "") {
	$strOptions .= "-R " . $self->{'reject_files'} . " ";
    } else {
	$strOptions .= "-R *action=*,*diff=*,*oldid=*,*printable*,*Recentchangeslinked*,*Userlogin*,*Whatlinkshere*,*redirect*,*Special:*,Talk:*,Image:* ";
    }

    if($self->{'depth'} == 0){
	$strOptions .= " ";
    } elsif($self->{'depth'} > 0) {
	$strOptions .= "-l ".$self->{'depth'}." "; # already got -r
    } else {
	&error("generateOptionsString","Incorrect Depth is defined!!\n");
    }

    if($self->{'below'}) {
	$strOptions .= "-np ";
    }

    # if($self->{'html_only'}) {
    #     $strOptions .= "-A .html,.htm,.shm,.shtml,.asp,.php,.cgi,*?*=* ";
    # } else {
    #     $strOptions .= "-p ";
    # }

    if (!$self->{'within'}){
	$strOptions .= "-H ";
    }

    return $strOptions;
}

sub url_information
{
    my ($self) = shift (@_);

    my $strOptions = $self->getWgetOptions();

    my $strBaseCMD = $strOptions." -q -O - \"$self->{'url'}\"";

    my $strIdentifyText = $self->useWget($strBaseCMD);

    if (!defined $strIdentifyText or $strIdentifyText eq ""){
	print STDERR "Server information is unavailable.\n";
	print STDERR "<<Finished>>\n";
	return;
    }

    while ($strIdentifyText =~ m/^(.*)<title>(.*?)<\/title>(.*)$/s) {
	$strIdentifyText = $1.$3;
	print STDERR "Page Title: $2\n";
    }

    while ($strIdentifyText =~ m/^(.*)<meta (.*?)>(.*)$/s) {
	$strIdentifyText = $1.$3;
	my $strTempString = $2;
	print STDERR "\n";

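	# e.g. for the (hypothetical) tag <meta name="generator" content="MediaWiki 1.15">,
	# the loop below prints each attribute/value pair:
	#   name: generator
	#   content: MediaWiki 1.15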
	while($strTempString =~ m/(.*?)=[\"\'](.*?)[\"\'](.*?)$/s){
	    # Store the information into variables now, since the next
	    # regular expression match will clobber $1, $2, $3...
	    $strTempString = $3;
	    my $strMetaName = $1;
	    my $strMetaContain = $2;

	    # Trim any whitespace from the beginning of the strings.
	    $strMetaName =~ s/^\s+//;
	    $strMetaContain =~ s/^\s+//;

	    print STDERR "$strMetaName: $strMetaContain\n\n";
	}
    }

    print STDERR "<<Finished>>\n";
}


sub error
{
    my ($strFunctionName,$strError) = @_;

    print "Error occurred in MediaWikiDownload.pm\n".
	  "In Function: ".$strFunctionName."\n".
	  "Error Message: ".$strError."\n";
    exit(-1);
}

1;