source: main/trunk/greenstone2/perllib/downloaders/MediaWikiDownload.pm@ 31920

Last change on this file since 31920 was 28250, checked in by ak19, 11 years ago
  1. Jenny and I fixed an oversight in OAIDownload, thanks to Kathy's suggestion, where an html page's contents used to be inspected for file types specified to be downloaded with GetDocument ONLY if the file extension was htm(l). Now the test is whether the header specifies Content-Type text/html. 2. Deprecated utils functions replaced with their FileUtils equivalents.
  • Property svn:executable set to *
File size: 9.5 KB
Line 
1###########################################################################
2#
3# MediaWikiDownload.pm -- downloader for wiki pages
4# A component of the Greenstone digital library software
5# from the New Zealand Digital Library Project at the
6# University of Waikato, New Zealand.
7#
8# Copyright (C) 1999 New Zealand Digital Library Project
9#
10# This program is free software; you can redistribute it and/or modify
11# it under the terms of the GNU General Public License as published by
12# the Free Software Foundation; either version 2 of the License, or
13# (at your option) any later version.
14#
15# This program is distributed in the hope that it will be useful,
16# but WITHOUT ANY WARRANTY; without even the implied warranty of
17# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
18# GNU General Public License for more details.
19#
20# You should have received a copy of the GNU General Public License
21# along with this program; if not, write to the Free Software
22# Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
23#
24###########################################################################
25
package MediaWikiDownload;

# bytes.pm may be absent on very old perls; load it when available.
eval {require bytes};

# Suppress the annoying "subroutine redefined" warning that various
# plugins cause under perl 5.6.
$SIG{__WARN__} = sub {warn($_[0]) unless ($_[0] =~ /Subroutine\s+\S+\sredefined/)};

use WgetDownload;

sub BEGIN {
    @MediaWikiDownload::ISA = ('WgetDownload');
}

use strict;       # every perl program should have this!
no strict 'refs'; # exception: variables are used as filehandles below
42
# Argument descriptors shown in the Greenstone downloader configuration UI.
# Display/description strings are dictionary keys resolved elsewhere.
my $arguments = [
    { 'name' => "url",
      'disp' => "{WebDownload.url_disp}",
      'desc' => "{WebDownload.url}",
      'type' => "string",
      'reqd' => "yes" },
    { 'name' => "depth",
      'disp' => "{WebDownload.depth_disp}",
      'desc' => "{WebDownload.depth}",
      'type' => "int",
      'deft' => "0",
      "range" => "0,",
      'reqd' => "no" },
    { 'name' => "below",
      'disp' => "{WebDownload.below_disp}",
      'desc' => "{WebDownload.below}",
      'type' => "flag",
      'reqd' => "no" },
    { 'name' => "within",
      'disp' => "{WebDownload.within_disp}",
      'desc' => "{WebDownload.within}",
      'type' => "flag",
      'reqd' => "no" },
    # Wget -R patterns: skip MediaWiki "noise" pages (edit/diff/history
    # views, special pages) and bulky binary formats.
    { 'name' => "reject_files",
      'disp' => "{MediaWikiDownload.reject_filetype_disp}",
      'desc' => "{MediaWikiDownload.reject_filetype}",
      'type' => "string",
      'reqd' => "no",
      'deft' => "*action=*,*diff=*,*oldid=*,*printable*,*Recentchangeslinked*,*Userlogin*,*Whatlinkshere*,*redirect*,*Special:*,Talk:*,Image:*,*.ppt,*.pdf,*.zip,*.doc" },
    # Wget -X list: directories excluded from the recursive crawl.
    { 'name' => "exclude_directories",
      'disp' => "{MediaWikiDownload.exclude_directories_disp}",
      'desc' => "{MediaWikiDownload.exclude_directories}",
      'type' => "string",
      'reqd' => "no",
      'deft' => "/wiki/index.php/Special:Recentchangeslinked,/wiki/index.php/Special:Whatlinkshere,/wiki/index.php/Talk:Creating_CD" },
];

# Downloader metadata registered with the plugin framework.
my $options = { 'name'     => "MediaWikiDownload",
                'desc'     => "{MediaWikiDownload.desc}",
                'abstract' => "no",
                'inherits' => "yes",
                'args'     => $arguments };

# Wget option string built by download() and reused by check_imported_css().
my $wget_options = "";

# NOTE(review): unused file-scope lexical — every sub declares its own
# $self. Kept only to avoid altering file-level declarations.
my $self;
89
# Constructor: registers this downloader's class and argument/option
# descriptors with the framework lists, then builds the object through
# the WgetDownload superclass and reblesses it into this class.
sub new
{
    my $class = shift(@_);
    my ($getlist, $inputargs, $hashArgOptLists) = @_;

    push(@$getlist, $class);
    push(@{$hashArgOptLists->{"ArgList"}}, @$arguments);
    push(@{$hashArgOptLists->{"OptList"}}, $options);

    my $self = WgetDownload->new($getlist, $inputargs, $hashArgOptLists);
    return bless $self, $class;
}
103
104
# Download the wiki site rooted at $self->{'url'} with wget, then sweep the
# cache for HTML pages whose @import-ed stylesheets were missed.
# $hashGeneralOptions->{"cache_dir"} is the local download prefix.
# Returns 1; progress markers are printed to STDERR for the GLI.
sub download
{
    my ($self) = shift (@_);
    my ($hashGeneralOptions) = @_;

    # Download options from this class plus the shared wget options.
    my $strOptions = $self->generateOptionsString();
    my $strWgetOptions = $self->getWgetOptions();

    # -N timestamping, -k convert links, -x force directories, -t 2 retries,
    # -P download prefix. Saved in $wget_options so check_imported_css()
    # can reuse the identical option string for stylesheet fetches.
    $wget_options = "-N -k -x -t 2 -P \"". $hashGeneralOptions->{"cache_dir"}."\" $strWgetOptions $strOptions ";
    my $cmdWget = $wget_options . $self->{'url'};

    # Progress has no known maximum (GLI progress-bar protocol).
    print STDERR "<<Undefined Maximum>>\n";

    my $strResponse = $self->useWget($cmdWget,1);

    print STDERR "Finish download from $self->{'url'}\n";

    # Check CSS files for downloaded HTML pages as well.
    # BUG FIX: the old code matched only "http://" and then used $1
    # unconditionally, so an https (or odd) URL left $1 stale/undefined.
    # Guard the match and accept both schemes.
    if ($self->{'url'} =~ m{https?://([^/]*)/}) {
        my $base_url = &FileUtils::filenameConcatenate($hashGeneralOptions->{"cache_dir"}, $1);
        check_file($base_url, $self);
    }

    print STDERR "<<Finished>>\n";

    return 1;
}
137
# Recursively walk $dir (depth-first — the old comment said "broad-first",
# which the code never did) and run check_imported_css() on every entry so
# stylesheets referenced by @import statements get fetched.
# Plain files fail the opendir and simply return, ending the recursion.
sub check_file
{
    my $dir = shift;
    my ($self) = shift;

    # BUG FIX (idiom): lexical dirhandle instead of "local *DIR" with a
    # bareword handle; behavior is unchanged.
    opendir(my $dh, $dir) or return;
    my @contents = grep { $_ ne '.' and $_ ne '..' } readdir($dh);
    closedir($dh);

    foreach my $entry (@contents) {
        # Recurse into subdirectories (no-op for plain files).
        check_file("$dir/$entry", $self);

        # Ensure the css imported by each HTML page already exists locally.
        check_imported_css($self, "$dir/$entry");
    }
}
159
# Scan one downloaded HTML file for <style ...>@import "..." stylesheet
# references; any stylesheet not already present under the cache directory
# is fetched with wget (reusing $wget_options built by download()).
# $downloaded_file is expected to live under .../cache/<site-host>/...
sub check_imported_css
{
    my ($self) = shift(@_);
    my $downloaded_file = shift(@_);

    # Pull the site host and the local cache base out of the path.
    # BUG FIX: the original used `my $base_dir = ... if defined ...`, the
    # "my X if COND" construct whose behavior is explicitly undefined in
    # Perl (the lexical can retain its value across calls). Capture both
    # values inside a plain conditional instead.
    my ($base_website_url, $base_dir);
    if ($downloaded_file =~ /(.+)(\/|\\)cache(\/|\\)([^\/\\]+)(\/|\\)/) {
        $base_website_url = $4;                 # external website host
        $base_dir = "$1$2cache$3$4";            # local cache directory
    }

    if($downloaded_file=~/(.+)\.(html|htm)/) {
        # Slurp the page contents.
        # BUG FIX (idiom/safety): 3-arg open with a lexical filehandle
        # instead of the 2-arg bareword form.
        my $content = "";
        if (open(my $fh, '<', $downloaded_file)) {
            while (my $line = <$fh>) {
                $content .= $line;
            }
            close($fh);
        }

        # Not inside the cache layout we understand — nothing to resolve.
        return unless defined $base_dir;

        # Collect every @import target: remote URL path and local path.
        # BUG FIX: pushing only defined captures avoids the undef holes the
        # old index-counted arrays left when $2 failed to capture.
        my @css_files;
        my @css_files_paths;
        while ($content =~ /<style type="text\/css"(.+)?import "(.+)?"/ig) {
            next unless defined $2;
            push(@css_files, $base_website_url . $2);
            push(@css_files_paths, $base_dir . $2);
        }

        for (my $i = 0; $i < scalar(@css_files); $i++) {
            my $css_file = "http://" . $css_files[$i];
            my $css_file_path = $css_files_paths[$i];

            # Trim a cache-busting "?..." suffix from the stylesheet name.
            $css_file =~ s/\?([^\/\.\s]+)$//isg;
            $css_file_path =~ s/\?([^\/\.\s]+)$//isg;

            # Already downloaded — nothing to do.
            next if(-e $css_file_path);

            # Otherwise fetch it with the same wget options as the crawl.
            my $cmdWget = $wget_options . $css_file;
            my $strResponse = $self->useWget($cmdWget,1);

            print STDERR "Downloaded associated StyleSheet : $css_file\n";
        }
    }
}
210
211
# Build the MediaWiki-specific wget option string from the configured
# fields of $self: recursion, directory exclusions (-X), filename reject
# patterns (-R), depth (-l), -np (no-parent) and -H (span hosts).
# Dies via error() when $self or the depth field is missing.
sub generateOptionsString
{
    my ($self) = @_;
    # BUG FIX: initialize so the first .= does not concatenate onto undef
    # (which raised a warning under "use warnings").
    my $strOptions = "";

    (defined $self) || &error("generateOptionsString","No \$self is defined!!\n");
    (defined $self->{'depth'})|| &error("generateOptionsString","No depth is defined!!\n");

    # -r for recursive downloading
    # -E to append a 'html' suffix to a file of type 'application/xhtml+xml' or 'text/html'
    $strOptions .= "-r -E ";

    # -X exclude file directories.
    # BUG FIX: the old test was ($self->{'exclude_directories'}==1), a
    # numeric comparison that is false for any path string, so configured
    # values were silently discarded and the hard-coded default always won.
    # Test for a non-empty string, and append the trailing space the old
    # branch omitted (without it the next option fused onto the value).
    if (defined $self->{'exclude_directories'} && $self->{'exclude_directories'} ne "") {
        $strOptions .= "-X " . $self->{'exclude_directories'} . " ";
    } else {
        $strOptions .= "-X /wiki/index.php/Special:Recentchangeslinked,/wiki/index.php/Special:Whatlinkshere,/wiki/index.php/Talk:Creating_CD ";
    }

    # -R reject file list: skip files whose names match these patterns.
    # BUG FIX: same broken ==1 numeric test as above.
    if (defined $self->{'reject_files'} && $self->{'reject_files'} ne "") {
        $strOptions .= "-R " . $self->{'reject_files'} . " ";
    } else {
        $strOptions .= "-R *action=*,*diff=*,*oldid=*,*printable*,*Recentchangeslinked*,*Userlogin*,*Whatlinkshere*,*redirect*,*Special:*,Talk:*,Image:* ";
    }

    # Depth 0 means unlimited (plain -r); positive depth adds -l.
    if($self->{'depth'} == 0){
        $strOptions .= " ";
    } elsif($self->{'depth'} > 0) {
        $strOptions .= "-l ".$self->{'depth'}." "; # already got -r
    } else {
        # BUG FIX: was $self->error(...), which passed $self into the
        # plain error($name,$msg) sub and printed the object as the
        # function name. Call it like the checks above do.
        &error("generateOptionsString","Incorrect Depth is defined!!\n");
    }

    # -np: never ascend above the start URL's directory.
    if($self->{'below'}) {
        $strOptions .= "-np ";
    }

    # -H: span hosts unless the crawl is confined to the start site.
    if (!$self->{'within'}){
        $strOptions .= "-H ";
    }

    return $strOptions;
}
262
# Fetch the configured URL with wget (-O - writes the page to stdout) and
# report its <title> elements and <meta> attribute name/value pairs to
# STDERR, ending with the GLI "<<Finished>>" marker. Returns nothing.
sub url_information
{
    my ($self) = shift (@_);

    my $strOptions = $self->getWgetOptions();

    # -q quiet, -O - stream the page body so we can scrape it here.
    my $strBaseCMD = $strOptions." -q -O - \"$self->{'url'}\"";

    my $strIdentifyText = $self->useWget($strBaseCMD);

    if (!defined $strIdentifyText or $strIdentifyText eq "" ){
        print STDERR "Server information is unavailable.\n";
        print STDERR "<<Finished>>\n";
        return;
    }

    # Report (and consume) every <title>...</title> element.
    while ($strIdentifyText =~ m/^(.*)<title>(.*?)<\/title>(.*)$/s) {
        $strIdentifyText = $1.$3;
        print STDERR "Page Title: $2\n";
    }

    # Report every attribute of every <meta ...> tag as "name: value".
    while ($strIdentifyText =~ m/^(.*)<meta (.*?)>(.*)$/s) {
        $strIdentifyText = $1.$3;
        my $strTempString = $2;
        print STDERR "\n";

        # BUG FIX: the old classes [\"|\'] also matched a literal '|';
        # ["'] matches just the two quote characters, as intended.
        while($strTempString =~ m/(.*?)=["'](.*?)["'](.*?)$/s){
            # Store the captures immediately — the next regex match
            # would clobber $1, $2, $3.
            $strTempString = $3;
            my $strMetaName = $1;
            my $strMetaContain = $2;

            # Trim leading whitespace.
            # BUG FIX: the old class [" "] also stripped leading quotes.
            $strMetaName =~ s/^\s+//;
            $strMetaContain =~ s/^\s+//;

            print STDERR "$strMetaName: $strMetaContain\n\n";
        }
    }

    print STDERR "<<Finished>>\n";

}
308
309
# Print a fatal error message identifying the failing function, then exit.
# Note: a plain sub, not a method — call as &error($func, $msg).
# NOTE(review): output goes to STDOUT while the module's status messages
# go to STDERR; left unchanged in case the GLI parses this stream.
sub error
{
    my ($strFunctionName,$strError) = @_;
    # BUG FIX: "occoured" typo corrected; redundant bare block removed.
    print "Error occurred in MediaWikiDownload.pm\n".
          "In Function:".$strFunctionName."\n".
          "Error Message:".$strError."\n";
    exit(-1);
}
320
3211;
322
Note: See TracBrowser for help on using the repository browser.