source: gsdl/branches/gsdl-2.74/perllib/downloaders/MediaWikiDownload.pm@ 14270

Last change on this file since 14270 was 14270, checked in by oranfry, 17 years ago

merged selected changes to the gsdl trunk since r14217 into the 2.74 branch

  • Property svn:executable set to *
File size: 9.6 KB
Line 
1###########################################################################
2#
3# MediaWikiDownload.pm -- downloader for wiki pages
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
package MediaWikiDownload;

# Load the 'bytes' pragma if it is available; failure is deliberately
# ignored (older perls may not provide it).
eval {require bytes};

# suppress the annoying "subroutine redefined" warning that various
# plugins cause under perl 5.6
$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};

use WgetDownload;

sub BEGIN {
    # Inherit all of the generic wget-driven download machinery.
    @MediaWikiDownload::ISA = ('WgetDownload');
}

use strict; # every perl program should have this!
no strict 'refs'; # make an exception so we can use variables as filehandles
42
# Declarations of the arguments this downloader accepts.  'disp'/'desc'
# values are resource-bundle keys resolved elsewhere; 'deft' is the value
# used when the argument is not supplied by the user.
my $arguments =
    [ { 'name' => "url",
	'disp' => "{WebDownload.url_disp}",
	'desc' => "{WebDownload.url}",
	'type' => "string",
	'reqd' => "yes"},
      { 'name' => "depth",
	'disp' => "{WebDownload.depth_disp}",
	'desc' => "{WebDownload.depth}",
	'type' => "int",
	'deft' => "0",
	"range" => "0,",
	'reqd' => "no"},
      { 'name' => "below",
	'disp' => "{WebDownload.below_disp}",
	'desc' => "{WebDownload.below}",
	'type' => "flag",
	'reqd' => "no"},
      { 'name' => "within",
	'disp' => "{WebDownload.within_disp}",
	'desc' => "{WebDownload.within}",
	'type' => "flag",
	'reqd' => "no"},
      # wget -R patterns: skip MediaWiki action/history/special pages and
      # some binary document types.
      { 'name' => "reject_files",
	'disp' => "{MediaWikiDownload.reject_filetype_disp}",
	'desc' => "{MediaWikiDownload.reject_filetype}",
	'type' => "string",
	'reqd' => "no",
	'deft' => "*action=*,*diff=*,*oldid=*,*printable*,*Recentchangeslinked*,*Userlogin*,*Whatlinkshere*,*redirect*,*Special:*,Talk:*,Image:*,*.ppt,*.pdf,*.zip,*.doc"},
      # wget -X directories: wiki paths that should never be descended into.
      { 'name' => "exclude_directories",
	'disp' => "{MediaWikiDownload.exclude_directories_disp}",
	'desc' => "{MediaWikiDownload.exclude_directories}",
	'type' => "string",
	'reqd' => "no",
	'deft' => "/wiki/index.php/Special:Recentchangeslinked,/wiki/index.php/Special:Whatlinkshere,/wiki/index.php/Talk:Creating_CD"},
      ];

# Option-list entry describing this downloader to the framework.
my $options = { 'name' => "MediaWikiDownload",
		'desc' => "{MediaWikiDownload.desc}",
		'abstract' => "no",
		'inherits' => "yes",
		'args' => $arguments };

# Base wget option string; set by download() and later reused by
# check_imported_css() when fetching stylesheets that were missed.
my $wget_options = "";

# NOTE(review): this file-scope $self appears to be dead -- every sub
# below declares or receives its own $self, shadowing this one.  Candidate
# for removal; kept here to avoid any behavioural change.
my $self;
89
# Constructor.  Registers this downloader class, merges this module's
# argument and option declarations into the shared lists consulted by the
# argument parser (and the GLI), and builds the underlying WgetDownload
# object which provides the actual wget invocation machinery.
sub new
{
    my ($class) = shift (@_);
    my ($getlist,$inputargs,$hashArgOptLists) = @_;
    push(@$getlist, $class);

    if(defined $arguments){ push(@{$hashArgOptLists->{"ArgList"}},@{$arguments});}
    if(defined $options) { push(@{$hashArgOptLists->{"OptList"}},$options)};

    # Use direct method-call syntax rather than the ambiguous indirect
    # object notation ("new WgetDownload(...)").
    my $self = (defined $hashArgOptLists)
	? WgetDownload->new($getlist,$inputargs,$hashArgOptLists)
	: WgetDownload->new($getlist,$inputargs);

    return bless $self, $class;
}
103
104
# Mirror the wiki rooted at $self->{'url'} into the cache directory using
# wget, then sweep the downloaded tree for stylesheets referenced via css
# @import statements that wget missed.  Returns 1 on completion.
sub download
{
    my ($self) = shift (@_);
    my ($hashGeneralOptions) = @_;

    # Download options
    my $strOptions = $self->generateOptionsString();
    my $strWgetOptions = $self->getWgetOptions();

    # Base wget command: -N timestamping, -k convert links, -x force
    # directories, -t 2 retries, -P cache prefix.  Saved in $wget_options
    # so check_imported_css() can reuse it for stylesheet downloads.
    $wget_options = "-N -k -x -t 2 -P \"". $hashGeneralOptions->{"cache_dir"}."\" $strWgetOptions $strOptions ";
    my $cmdWget = $wget_options . $self->{'url'};

    # Start the download of the web pages
    print STDERR "<<Undefined Maximum>>\n";

    my $strResponse = $self->useWget($cmdWget,1);

    print STDERR "Finish download from $self->{'url'}\n";

    # Check that css files imported by the downloaded HTML pages were
    # fetched as well.  Guard the match: the old code used $1 unguarded,
    # so a URL that did not match (e.g. "http://host" with no trailing
    # slash, or a non-http scheme) silently reused a stale capture.
    if ($self->{'url'} =~ m/http:\/\/([^\/]+)/) {
	my $base_url = &util::filename_cat($hashGeneralOptions->{"cache_dir"}, $1);
	&check_file($base_url, $self);
    }

    print STDERR "<<Finished>>\n";

    return 1;
}
137
# Recursively walk the cache directory rooted at $dir, running
# check_imported_css() over every entry so that HTML pages get their
# @import'ed stylesheets fetched.  Silently returns if $dir is not a
# readable directory (which is how the recursion bottoms out on files).
sub check_file
{
    my $dir = shift;
    my ($self) = shift;

    # Lexical directory handle instead of a package-global bareword.
    opendir(my $dh, $dir) or return;

    my @contents = grep { $_ ne '.' and $_ ne '..' } readdir $dh;

    closedir $dh;

    foreach (@contents) {
	# depth-first traversal (the original comment claimed
	# "broad-first", but each child is fully recursed before the
	# next sibling is considered)
	&check_file("$dir/$_", $self);

	# read in every HTML page and check that the css associated with
	# its import statements already exists
	&check_imported_css($self, "$dir/$_");
    }
}
159
# Scan one downloaded file: if it is an HTML page, find every stylesheet
# referenced through a css @import inside a <style type="text/css"> block
# and, when the stylesheet is not already present in the cache, download
# it with wget using the option string saved by download().
sub check_imported_css
{
    my ($self) = shift(@_);
    my $downloaded_file = shift(@_);

    # Extract the cache root and the website host directory from a path of
    # the shape ".../cache/<host>/...".  Bail out when the path does not
    # match (the old code used $1..$4 unguarded, risking stale captures).
    return unless $downloaded_file =~ /(.+)(\/|\\)cache(\/|\\)([^\/\\]+)(\/|\\)/;

    # external website host, used to make up the stylesheet urls
    my $base_website_url = $4;

    # the cache download file directory.  NB: declared unconditionally --
    # the original "my $x = EXPR if COND" form has undefined behaviour.
    my $base_dir = "$1$2cache$3$4";

    if($downloaded_file=~/(.+)\.(html|htm)/) {
	# slurp the whole page
	my $content = "";
	if (open(my $input, "<", $downloaded_file)) {   # 3-arg open, lexical handle
	    while (my $line = <$input>) {
		$content .= $line;
	    }
	    close($input);
	}

	# collect every @import'ed stylesheet url and its on-disk path
	# (skipping matches with no capture, which previously left undef
	# holes in these arrays)
	my @css_files;
	my @css_files_paths;
	while ($content =~ /<style type="text\/css"(.+)?import "(.+)?"/ig) {
	    next unless defined $2;
	    push(@css_files,       $base_website_url . $2);
	    push(@css_files_paths, $base_dir . $2);
	}

	for (my $i = 0; $i < scalar(@css_files); $i++) {
	    my $css_file = "http://" . $css_files[$i];
	    my $css_file_path = $css_files_paths[$i];

	    # trim any ?query string appended to the end of a stylesheet url
	    $css_file =~ s/\?([^\/\.\s]+)$//isg;
	    $css_file_path =~ s/\?([^\/\.\s]+)$//isg;

	    # do nothing if the css file already exists
	    next if(-e $css_file_path);

	    # otherwise use wget to download the css file
	    my $cmdWget = $wget_options . $css_file;
	    my $strResponse = $self->useWget($cmdWget,1);

	    print STDERR "Downloaded associated StyleSheet : $css_file\n";
	}
    }
}
210
211
# Build the wget option string for mirroring a MediaWiki site from the
# user-configurable settings in $self (depth, exclude_directories,
# reject_files, below, within).  Dies via error() when $self or the depth
# setting is missing.
sub generateOptionsString
{
    my ($self) = @_;
    my $strOptions;

    (defined $self) || &error("generateOptionsString","No \$self is defined!!\n");
    (defined $self->{'depth'})|| &error("generateOptionsString","No depth is defined!!\n");

    # -r for recursive downloading
    # -E to append a 'html' suffix to a file of type 'application/xhtml+xml' or 'text/html'
    $strOptions .="-r -E ";

    # -X exclude file directories.  Test for a non-empty string: the old
    # code compared the string numerically against 1 ("=="), which is
    # always false for a path list, so user-supplied values were silently
    # ignored.  Also append a trailing space so the next option is not
    # fused onto the value.
    if($self->{'exclude_directories'}) {
	$strOptions .="-X " . $self->{'exclude_directories'} . " ";
    } else {
	$strOptions .="-X /wiki/index.php/Special:Recentchangeslinked,/wiki/index.php/Special:Whatlinkshere,/wiki/index.php/Talk:Creating_CD ";
    }

    # -R reject file list, reject files with these text in their names
    # (same string-vs-number fix as above)
    if($self->{'reject_files'}) {
	$strOptions .="-R " . $self->{'reject_files'} . " ";
    } else {
	$strOptions .="-R *action=*,*diff=*,*oldid=*,*printable*,*Recentchangeslinked*,*Userlogin*,*Whatlinkshere*,*redirect*,*Special:*,Talk:*,Image:* ";
    }

    if($self->{'depth'} == 0){
	$strOptions .= " ";
    } elsif($self->{'depth'} > 0) {
	$strOptions .= "-l ".$self->{'depth'}." "; # already got -r
    } else {
	# error() is a plain function, not a method: calling it as
	# $self->error(...) passed $self as the function name.
	&error("generateOptionsString","Incorrect Depth is defined!!\n");
    }

    # -np: never ascend above the starting directory
    if($self->{'below'}) {
	$strOptions .="-np ";
    }

    # -H: span across hosts unless downloads must stay within the site
    if (!$self->{'within'}){
	$strOptions .="-H ";
    }

    return $strOptions;
}
262
# Fetch $self->{'url'} with wget (-q -O - : quiet, page to stdout) and
# report the page <title> and every <meta> attribute pair on STDERR.
# Used by the GLI to preview a site before downloading it.
sub url_information
{
    my ($self) = shift (@_);

    my $strOptions = $self->getWgetOptions();

    my $strBaseCMD = $strOptions." -q -O - \"$self->{'url'}\"";


    my $strIdentifyText = $self->useWget($strBaseCMD);

    if (!defined $strIdentifyText or $strIdentifyText eq "" ){
	print STDERR "Server information is unavailable.\n";
	print STDERR "<<Finished>>\n";
	return;
    }

    # report (and consume) every <title>...</title> in the page
    while ($strIdentifyText =~ m/^(.*)<title>(.*?)<\/title>(.*)$/s) {
	$strIdentifyText = $1.$3;
	print STDERR "Page Title: $2\n";
    }

    # report (and consume) every <meta ...> tag
    while ($strIdentifyText =~ m/^(.*)<meta (.*?)>(.*)$/s) {
	$strIdentifyText = $1.$3;
	my $strTempString = $2;
	print STDERR "\n";

	# pull out each name="value" pair; the quote class is ["'] --
	# the original ["|'] also (wrongly) treated '|' as a quote char
	while($strTempString =~ m/(.*?)=["'](.*?)["'](.*?)$/s){
	    # Store the information in a variable, since the next
	    # regular expression match will clobber $1, $2, $3...
	    $strTempString = $3;
	    my $strMetaName = $1;
	    my $strMetaContain = $2;

	    # Take out the extra space in the beginning of the string.
	    $strMetaName =~ s/^([" "])+//m;
	    $strMetaContain =~ s/^([" "])+//m;

	    print STDERR "$strMetaName: $strMetaContain\n\n";
	}
    }

    print STDERR "<<Finished>>\n";

}
308
309
# Print an error message and terminate the process with exit code -1.
# NB: this is a plain function, not a method -- call as &error(name, msg).
sub error
{
    my ($strFunctionName,$strError) = @_;
    # Report on STDERR, consistent with every other diagnostic in this
    # module (previously this went to STDOUT), and fix the "occoured" typo.
    print STDERR "Error occurred in MediaWikiDownload.pm\n".
	"In Function:".$strFunctionName."\n".
	"Error Message:".$strError."\n";
    exit(-1);
}
320
3211;
322
Note: See TracBrowser for help on using the repository browser.