1 | # -*- perl -*-
|
---|
2 | # w3http.pm --- send http requests, janl's 12" mix for w3mir
|
---|
3 | #
|
---|
4 | $VERSION=1.0.22;
|
---|
5 | #
|
---|
6 | # This implements http/1.0 requests. We'll have problems with http/0.9
|
---|
7 | # This is in no way specific to w3mir.
|
---|
8 | #
|
---|
9 | # IMPORTANT: The caller should initialize the C locale for some of the
|
---|
10 | # things here to work correctly (specifically the strftime function).
|
---|
11 | #
|
---|
12 | # This is a rewrite of http.pl by Oscar Nierstrasz; I copied the code he he
|
---|
13 | # copied from the camel book. Some functions written by Gorm Haug Eriksen
|
---|
14 | # ([email protected]) has been used as is.
|
---|
15 | #
|
---|
16 | # Contributors:
|
---|
17 | # Nicolai Langfeldt, [email protected]
|
---|
18 | # Gorm Haug Eriksen, [email protected]
|
---|
19 | # Chris Szurgot, [email protected]
|
---|
20 | # Bernhard Weisshuh, [email protected]
|
---|
21 | # Copying and modification is governed by the "Artistic License" enclosed in
|
---|
22 | # the w3mir distribution
|
---|
23 | #
|
---|
24 | # gorm :
|
---|
25 | # &w3http::get_last_modified return the last modified stamp on a file in
|
---|
26 | # the right format for use with http
|
---|
27 | #
|
---|
28 | # janl:
|
---|
29 | # &http::query: Send a http query. A completely general function to send a
|
---|
30 | # http query. Will extract header values, http response code and, optionaly,
|
---|
31 | # convert text files to local linefeed format.
|
---|
32 | #
|
---|
33 | # Variables to examine after a query
|
---|
34 | # $w3http::document: The document returned by the query, if any.
|
---|
35 | # $w3http::doclen: The length of the document
|
---|
36 | # $w3http::result: The numerical http result code. It may take these values:
|
---|
37 | # - Normal HTTP reply codes
|
---|
38 | # - 98: OS error, permanent, errormessage in $!
|
---|
39 | # - 99: Other permanent error: see $restext.
|
---|
40 | # - 100: Transient error: Timeout/connection broken
|
---|
41 | # $w3http::restext: The english(?) HTTP result or w3http generated message
|
---|
42 | # $w3http::header: The http header returned.
|
---|
43 | # $w3http::plaintexthtml: 1 if this doc is non-content-encoded text/html
|
---|
44 | # (as opposed to content-encoding: compressed content-type: text/html
|
---|
45 | # which needs decompression before we can inspect the html)
|
---|
46 | # The tests are somewhat longwinded so I do it just once here.
|
---|
47 | # %w3http::headval: Associative array of header values
|
---|
48 | # $w3http::headval{'CONTENT-TYPE'}: Derived content type, stripped of charset
|
---|
49 | # qualifiers and other distractions.
|
---|
50 | # $w3http::xfbytes: Transfered bytes, cumulative. Document part only.
|
---|
51 | # $w3http::headbytes: Bytes of headers received, cumulative.
|
---|
52 | #
|
---|
53 | # Variables that change http's behaviour/requests:
|
---|
54 | # $w3http::agent: User agent, default is basename of $0
|
---|
55 | # $w3http::from: Request is from, default is user@host
|
---|
56 | # $w3http::version: The http version to use, only 1.0 is known to me.
|
---|
57 | # $w3http::timeout: How long to wait for new data to arrive, default is 600sec
|
---|
58 | # $w3http::buflen: Network read buffer size, default is 4096. It might give a
|
---|
59 | # speedup to tune this for specific servers' so it matches their send
|
---|
60 | # size. This size can be detected if we want to, I think.
|
---|
61 | # $w3http::debug: 1 debuging output, 2, more, 3 queries and replies
|
---|
62 | # $w3http::verbose: 0: say nothing, 1: print progress info
|
---|
63 | # $w3http::convert: Convert text/* documents to local newline convention?
|
---|
64 | # The default is to do it.
|
---|
65 | # $w3http::proxyserver: The name of the proxyserver to use.
|
---|
66 | # $w3http::proxyport: The port of the proxyserver to use. 0 if no proxyserver.
|
---|
67 | # $w3http::proxyuser: If this is set proxy authentication will be used.
|
---|
68 | # $w3http::proxypasswd: The password for proxy authentication
|
---|
69 | #
|
---|
70 | # Things gotten from main:
|
---|
71 | # - $main::win32: 1 if win32 restrictions apply to this system
|
---|
72 | # - $main::nulldevice: Bit sink file/device on this system.
|
---|
73 | #
|
---|
74 | # History (european date format dd/mm/yy):
|
---|
75 | # janl ??/??/95 -- Rewrite finished
|
---|
76 | # szurgot ??/??/95 -- Win32 compatability
|
---|
77 | # janl 16/05/96 -- Added SAVEBIN option, based on idea by szurgot
|
---|
78 | # szurgot 03/05/96 -- Corrected typo in check for content-length against
|
---|
79 | # retreive document length. Added test for zero-length
|
---|
80 | # documents (Not retreived because not-modified)
|
---|
81 | # szurgot 19/05/96 -- Win32 adaptions, fixes.
|
---|
82 | # janl 19/05/96 -- Chris won an argument, and janl simplified http
|
---|
83 | # retrival loop (-> version 1.0.4)
|
---|
84 | # janl 09/09/96 -- Incorporated a patch submited by Michael Kriby -> 1.0.5
|
---|
85 | # janl 16/09/96 -- Support for authorization. -> 1.0.6
|
---|
86 | # janl 27/09/96 -- Support for Accept header, lack pointed out by
|
---|
87 | # [email protected]: ... HTTP/1.1 (§14.1) says
|
---|
88 | # ``If no Accept header field is present, then it is
|
---|
89 | # assumed that the client accepts all media types,
|
---|
90 | # earlier versions of the protocol suggest that only
|
---|
91 | # text/plain and text/html will be offered by default.''
|
---|
92 | # This contradicts my memory of a http/1.0 draft.
|
---|
93 | # Also added $ACCEPT option.
|
---|
94 | # janl 20/10/96 -- Now uses HTTP::Date to produce HTTP timestamps -> 1.0.7
|
---|
95 | # janl 27/10/96 -- Didn't use to check if gethostbyname worked -> 1.0.8
|
---|
96 | # janl 02/12/96 -- Forgot a unlink when renaming temporary files.
|
---|
97 | # janl 21/02/97 -- Multipele $ACCEPT options work. -> 1.0.9
|
---|
98 | # janl 19/03/97 -- Now issues Host: header -> 1.0.10
|
---|
99 | # janl 10/04/97 -- Changed from wwwurl to URI::URL, and various related
|
---|
100 | # changes. -> 1.0.11
|
---|
101 | # janl 09/05/97 -- Microsoft ISS servers are _so_ broken -> 1.0.12
|
---|
102 | # (don't close the write end of the HTTP socket after
|
---|
103 | # sending a query to them)
|
---|
104 | # janl 12/05/97 -- New version of perl caught some typos, fixed
|
---|
105 | # longstanding bug in the newline conversion bit.
|
---|
106 | # -> 1.0.13
|
---|
107 | # janl 06/06/97 -- Demand Loading of MIME::BASE64 -> 1.0.14
|
---|
108 | # janl 01/12/97 -- FAT filesystems drops LSB of modtime. Patch from
|
---|
109 | # Greg Lindhorst ([email protected])
|
---|
110 | # -- whoami does not exist on win32, hardwire a default
|
---|
111 | # value (unknown) (also Greg L.) -> 1.0.15
|
---|
112 | # janl 01/22/98 -- Proxy authentication as outlined by Christian Geuer
|
---|
113 | # janl 02/20/98 -- Complex 'content-type' headers handled. -> 1.0.17
|
---|
114 | # janl 04/20/98 -- Only newline convert text/html, everything else is
|
---|
115 | # handled as binary. -> 1.0.18
|
---|
116 | # janl 12/05/98 -- Store tmpfile in its final destination directory
|
---|
117 | # avoiding asking movefile move it across filesystems.
|
---|
118 | # -> 1.0.19
|
---|
119 | # janl 01/08/98 -- Timeout fix from Michael Gusev, also flag short doc
|
---|
120 | # as error.
|
---|
121 | # janl 24/09/98 -- Better error handling -> 1.0.20
|
---|
122 | # bkw 17/12/98 -- Fixed problem with tempfile-generation when
|
---|
123 | # running in forget-mode (-f)
|
---|
124 | # janl 05/01/99 -- Referer: dropped if argument not true -> 1.0.21
|
---|
125 | # janl 13/04/99 -- Added workaround for broken win32 perl resolving.
|
---|
126 |
|
---|
127 | package w3http;
|
---|
128 |
|
---|
129 | require 5.002;
|
---|
130 | use Socket;
|
---|
131 | use HTTP::Date;
|
---|
132 | use Sys::Hostname;
|
---|
133 | use URI::URL;
|
---|
134 |
|
---|
135 | # Suplementary libwww-perl:
|
---|
136 | sub URI::URL::_generic::basename {
|
---|
137 | my $self = shift;
|
---|
138 | my @p = $self->path_components;
|
---|
139 | my $old = $p[-1];
|
---|
140 | if (@_) {
|
---|
141 | splice(@p, -1, 1, shift);
|
---|
142 | $self->path_components(@p)
|
---|
143 | }
|
---|
144 | $old;
|
---|
145 | }
|
---|
146 |
|
---|
147 |
|
---|
148 | END {
|
---|
149 | # Remove tmp file and such in here. That means that main:: gotta catch
|
---|
150 | # interrupt signals and exit on them, so ENDs are executed.
|
---|
151 | }
|
---|
152 |
|
---|
153 | use strict;
|
---|
154 | # Global variables, we want to share them:
|
---|
155 | use vars qw($GET $HEAD $GETURL $HEADURL $IFMOD $IFMODF $AUTHORIZ $REFERER);
|
---|
156 | use vars qw($SAVEBIN $ACCEPT $NOUSER $FREEHEAD $agent $version $timeout);
|
---|
157 | use vars qw($debug $convert $proxyserver $proxyport $xfbytes $headbytes);
|
---|
158 | use vars qw($verbose $result $restext $header $document);
|
---|
159 | use vars qw($plaintexthtml %headval $progress $doclen $proxyuser);
|
---|
160 | use vars qw($proxypasswd);
|
---|
161 |
|
---|
162 | my $hasAlarm; # Win32 does not have any alarm
|
---|
163 | my $chime; # Has the alarm gone off yet?
|
---|
164 | my %address; # My own DNS cache
|
---|
165 | my $savALRM; # Saved ALRM handler
|
---|
166 | my $savPIPE; # Saved PIPE handler
|
---|
167 |
|
---|
168 | # The main:: program should detect if we're running on win32 or not,
|
---|
169 | # somehow
|
---|
170 | if ($main::win32) {
|
---|
171 | warn "win32\n";
|
---|
172 | # Compensate for lacks of win32 perl.
|
---|
173 | $hasAlarm=0;
|
---|
174 | # Seems to be unavailable in win32/perl5.001. It has to be in 5.003!
|
---|
175 | # eval "sub sockaddr_in {
|
---|
176 | # ($port, $thataddr) = @_;
|
---|
177 | # $sockaddr = 'S n a4 x8';
|
---|
178 | # return pack($sockaddr, &AF_INET, $port, $thataddr);
|
---|
179 | # }";
|
---|
180 | } else {
|
---|
181 | $hasAlarm=1;
|
---|
182 | }
|
---|
183 |
|
---|
184 |
|
---|
185 | # Find out some things
|
---|
186 | my $thishost = hostname();
|
---|
187 | my $proto = getprotobyname("tcp");
|
---|
188 |
|
---|
189 | (my $name, undef) = gethostbyname($thishost);
|
---|
190 | chomp(my $user = $ENV{'LOGNAME'} || $ENV{'USER'} || `whoami` || 'unknown');
|
---|
191 | my $from = "$user\@$name";
|
---|
192 |
|
---|
193 | my $nl = "\r\n";
|
---|
194 | # Default values, change by assignment in using-program.
|
---|
195 | $agent = $0; $agent =~ s~.*/~~; # Basename
|
---|
196 | $version= "1.0";
|
---|
197 | $timeout= 600; # Timeout while waiting for data/connection
|
---|
198 | my $buflen = 4096; # recv buffer length
|
---|
199 | $debug = 0; # Debuging output?
|
---|
200 | $convert = 1; # Convert newlines of text docs to local format
|
---|
201 | $proxyserver=''; # Proxy server.
|
---|
202 | $proxyport=0; # Proxy server port. 0 if no proxy.
|
---|
203 | $proxyuser=''; # Username for proxy authentication
|
---|
204 | $proxypasswd=''; # Password for proxy authentication
|
---|
205 | $xfbytes=0; # 0 bytes transfered, cumulative
|
---|
206 | $headbytes=0; # 0 bytes of headers, cumulative
|
---|
207 | $doclen=0; # 0 bytes in doc, pr. document
|
---|
208 | my $tmpfile="w3mir$$.tmp"; # Temporary filename
|
---|
209 | $verbose=0; # Verbosenes, 0: silent, 1: progress info
|
---|
210 |
|
---|
211 | # Query opcodes
|
---|
212 | $GET = 1; # GET query. Arg: host,port,path
|
---|
213 | $HEAD = 2; # HEAD query. Arg: host,port,path
|
---|
214 | $GETURL = 3; # GET query. Arg: url
|
---|
215 | $HEADURL = 4; # HEAD query. Arg: url
|
---|
216 | # Here we lack PUT, which is not implemented
|
---|
217 | # Modify query thus:
|
---|
218 | $IFMOD = 101; # If-modified after: Arg: HTTP-date-str
|
---|
219 | $IFMODF = 102; # If-modified after file: Arg: local-file-name
|
---|
220 | $AUTHORIZ= 103; # Basic authorization. Arg: 'user:password'
|
---|
221 | $REFERER = 104; # Referer: Arg: Referer
|
---|
222 | $SAVEBIN = 105; # Write binary files to disk. Arg: File name
|
---|
223 | # If this opcode is used then main must provide
|
---|
224 | # a &main::movefile(oldname,newname) procedure
|
---|
225 | # that handles moving the tmp file to the
|
---|
226 | # final name/location.
|
---|
227 | $ACCEPT = 106; # Accept header value: Arg: value
|
---|
228 | $NOUSER = 107; # Don't insert user header. Arg: none
|
---|
229 | $FREEHEAD= 999; # Freeform header, one line. Arg: header
|
---|
230 |
|
---|
231 | sub query {
|
---|
232 | # Build and send a HTTP query. And also receive response - janl 95/09/18
|
---|
233 | #
|
---|
234 | # Return codes: 0 if it didn't work. 1 if it did work.
|
---|
235 | # HTTP style result code in w3http::$result and message in w3http::$restext
|
---|
236 |
|
---|
237 | # We do next to no argument type checking btw.
|
---|
238 |
|
---|
239 | my($host,$port,$request,$query,$method,$inp,$linp,$saveto,$save,$arg);
|
---|
240 | my($start,$wantbytes,$thataddr,$err,$headb,$tmpf,$ldoc,$nouser,$q,$accept);
|
---|
241 | my($origreq,$req_o,$plaintext);
|
---|
242 |
|
---|
243 | # Something ought to be said
|
---|
244 | $result=99;
|
---|
245 | $restext='w3http: internal error';
|
---|
246 | $nouser=0;
|
---|
247 |
|
---|
248 | if ($version ne '1.0') {
|
---|
249 | warn "Unknown HTTP version $version, no request sent\n";
|
---|
250 | return 0;
|
---|
251 | }
|
---|
252 |
|
---|
253 | $accept=$saveto=$query='';
|
---|
254 |
|
---|
255 | # Find out what to ask for
|
---|
256 |
|
---|
257 | while (defined($arg=shift)) {
|
---|
258 | if ($arg == $GET) {
|
---|
259 | $host=shift;
|
---|
260 | $port=shift;
|
---|
261 | $request=shift;
|
---|
262 | $req_o=url 'http://'.$host.':'.$port.$request;
|
---|
263 | if ($proxyport) {
|
---|
264 | $query.='GET http://'.$req_o->as_string;
|
---|
265 | } else {
|
---|
266 | $query.='GET '.$req_o->epath;
|
---|
267 | }
|
---|
268 | $query.=' HTTP/'.$version.$nl;
|
---|
269 | } elsif ($arg == $HEAD) {
|
---|
270 | $host=shift;
|
---|
271 | $port=shift;
|
---|
272 | $request=shift;
|
---|
273 | $req_o=url 'http://'.$host.':'.$port.$request;
|
---|
274 | if ($proxyport) {
|
---|
275 | $query.='HEAD '.$req_o->as_string;
|
---|
276 | } else {
|
---|
277 | $query.='HEAD '.$req_o->epath;
|
---|
278 | }
|
---|
279 | $query.=' HTTP/'.$version.$nl;
|
---|
280 | } elsif ($arg == $GETURL) {
|
---|
281 | $req_o=shift;
|
---|
282 | $req_o=url $req_o unless ref $req_o;
|
---|
283 | ($method,undef,undef,$host,$port,$request,undef,$q) = $req_o->crack;
|
---|
284 | if ($proxyport) {
|
---|
285 | $query.='GET '.$req_o->as_string;
|
---|
286 | } else {
|
---|
287 | $q=$req_o->equery;
|
---|
288 | $query.='GET '.$request.($q?"?$q":'');
|
---|
289 | }
|
---|
290 | $query.=' HTTP/'.$version.$nl;
|
---|
291 | } elsif ($arg == $HEADURL) {
|
---|
292 | $req_o=shift;
|
---|
293 | $req_o=url $req_o unless ref $req_o;
|
---|
294 | if ($proxyport) {
|
---|
295 | $query.='HEAD '.$req_o->as_string;
|
---|
296 | } else {
|
---|
297 | $q=$req_o->equery;
|
---|
298 | $query.='HEAD '.$req_o->epath.($q?"?$q":'');
|
---|
299 | }
|
---|
300 | $query.=' HTTP/'.$version.$nl;
|
---|
301 | } elsif ($arg == $IFMOD) {
|
---|
302 | $query.='If-Modified-Since: '.(shift).$nl;
|
---|
303 | } elsif ($arg == $IFMODF) {
|
---|
304 | $query.='If-Modified-Since: '.&last_modified(shift).$nl;
|
---|
305 | } elsif ($arg == $AUTHORIZ) {
|
---|
306 | # Demand-load MIME::Base64
|
---|
307 | if (!defined(&MIME::Base64::encode)) {
|
---|
308 | eval "use MIME::Base64;";
|
---|
309 | die "w3http: Could not load MIME::Base64 module necessary for authentication\n"
|
---|
310 | unless defined(&MIME::Base64::encode);
|
---|
311 | }
|
---|
312 | $query.='Authorization: Basic '.MIME::Base64::encode(shift,'').$nl;
|
---|
313 | } elsif ($arg == $REFERER) {
|
---|
314 | my($referer)=shift;
|
---|
315 | $query.='Referer: '.$referer.$nl if $referer;
|
---|
316 | } elsif ($arg == $SAVEBIN) {
|
---|
317 | $saveto=shift;
|
---|
318 | } elsif ($arg == $ACCEPT) {
|
---|
319 | $accept.='Accept: '.(shift).$nl;
|
---|
320 | } elsif ($arg == $NOUSER) {
|
---|
321 | $nouser=1;
|
---|
322 | } elsif ($arg == $FREEHEAD) {
|
---|
323 | $query.=(shift).$nl;
|
---|
324 | } else {
|
---|
325 | warn "Unknown http query opcode: $arg\n";
|
---|
326 | }
|
---|
327 | # Insert the last parts of the query:
|
---|
328 | }
|
---|
329 |
|
---|
330 | $query.='Host: '.$req_o->netloc.$nl;
|
---|
331 | $query.='From: '.$from.$nl unless $nouser;
|
---|
332 |
|
---|
333 | $accept='Accept: */*'.$nl unless $accept;
|
---|
334 |
|
---|
335 | if ($proxyport) {
|
---|
336 | # Use proxy instead of originserver
|
---|
337 | $host=$proxyserver;
|
---|
338 | $port=$proxyport;
|
---|
339 |
|
---|
340 | # Add authentication stuff to query
|
---|
341 | if ($proxyuser) {
|
---|
342 | # Demand-load MIME::Base64
|
---|
343 | if (!defined(&MIME::Base64::encode)) {
|
---|
344 | eval "use MIME::Base64;";
|
---|
345 | die "w3http: Could not load MIME::Base64 module necessary for authentication\n"
|
---|
346 | unless defined(&MIME::Base64::encode);
|
---|
347 | }
|
---|
348 |
|
---|
349 | $query.='Proxy-Authorization: Basic '.
|
---|
350 | MIME::Base64::encode($proxyuser.':'.$proxypasswd);
|
---|
351 |
|
---|
352 | print STDERR "\nProxyuser: [$proxyuser]\nProxypasswd: [$proxypasswd]\n"
|
---|
353 | if $debug>=2;
|
---|
354 | }
|
---|
355 | }
|
---|
356 |
|
---|
357 | $query.='User-Agent: '.$agent.$nl.$accept.$nl;
|
---|
358 |
|
---|
359 | # If we're using proxy then set up things...
|
---|
360 | print STDERR "\nQUERY:\n",$query,"---\n" if $debug>=2;
|
---|
361 |
|
---|
362 | # win32 fix: this should be added in case of troubles with
|
---|
363 | # gethostbyname. possible reason: nameserver down?
|
---|
364 | if ($host =~ /^\d+(\.\d+){3}$/) {
|
---|
365 | # in case gethostbyname will not work ... ;-)
|
---|
366 | $address{$host} = pack 'C4', (split /\./, $host);
|
---|
367 | }
|
---|
368 |
|
---|
369 | # Find out who to ask, check if we know already
|
---|
370 | if (exists($address{$host})) {
|
---|
371 | # We know
|
---|
372 | $thataddr=$address{$host};
|
---|
373 | } else {
|
---|
374 | # Cache miss, get and remember.
|
---|
375 | (my $fqdn, undef, undef, undef, $thataddr) = gethostbyname($host);
|
---|
376 | # Hostname lookup failure? Cache even misses.
|
---|
377 | if (defined($fqdn)) {
|
---|
378 | print STDERR "Lookup of $host:\nFQDN: $fqdn\n"
|
---|
379 | if $debug;
|
---|
380 | $address{$host}=$thataddr;
|
---|
381 | $address{$fqdn}=$thataddr if $fqdn ne $host;
|
---|
382 | } else {
|
---|
383 | $thataddr=$address{$host}=undef;
|
---|
384 | }
|
---|
385 | }
|
---|
386 |
|
---|
387 | # Check if lookup failure, return
|
---|
388 | if (!defined($thataddr)) {
|
---|
389 | $restext='Host lookup failure';
|
---|
390 | return;
|
---|
391 | }
|
---|
392 |
|
---|
393 | $port=80 unless defined($port) && $port;
|
---|
394 |
|
---|
395 | # When connected we might receive SIGPIPE. I'm not sure if the
|
---|
396 | # default behaviour of dying is beneficial in that case. If we get
|
---|
397 | # alarm a timeout has expired.
|
---|
398 | $savPIPE = $SIG{'PIPE'};
|
---|
399 | $savALRM = $SIG{'ALRM'};
|
---|
400 |
|
---|
401 | $chime=0; # There has been no alarm yet
|
---|
402 | $SIG{'ALRM'} = \&timeout;
|
---|
403 | $SIG{'PIPE'} = \&ignore;
|
---|
404 |
|
---|
405 | # Close the socket, just in case, and ignore error returns
|
---|
406 | close(FS);
|
---|
407 |
|
---|
408 | socket(FS, AF_INET, SOCK_STREAM, $proto) or return &oserror;
|
---|
409 | warn "Got my socks on\n" if $debug;
|
---|
410 |
|
---|
411 | my $paddr = sockaddr_in($port, $thataddr);
|
---|
412 | connect(FS, $paddr) or return &oserror;
|
---|
413 | warn "Connected\n" if $debug;
|
---|
414 |
|
---|
415 | # Arrange timeout
|
---|
416 | alarm($timeout) if $hasAlarm;
|
---|
417 |
|
---|
418 | # We have, in fact, received SIGPIPE on this line:
|
---|
419 | send(FS,$query,0) or return &oserror;
|
---|
420 |
|
---|
421 | if ($chime) {
|
---|
422 | $result=100;
|
---|
423 | $restext='timeout sending query';
|
---|
424 | return &resetsign;
|
---|
425 | }
|
---|
426 |
|
---|
427 | $header='';
|
---|
428 | $document='';
|
---|
429 | $inp=' 'x$buflen;
|
---|
430 | $doclen=$chime=$plaintext=$plaintexthtml=$save=0;
|
---|
431 |
|
---|
432 | # Breaks some M$ ISS servers:
|
---|
433 | # shutdown(FS,1); # Half-close socket, sending now not allowed
|
---|
434 |
|
---|
435 | print STDERR ", receiving header" if $verbose>0;
|
---|
436 |
|
---|
437 | # Retrive HTTP response HEADER. Why do I use recv and not <FS>?
|
---|
438 | # Because then the timeout can work correctly!
|
---|
439 | while (1) {
|
---|
440 | # Set up alarm to ensure recv returns within a reasonable timeframe
|
---|
441 | alarm($timeout) if $hasAlarm;
|
---|
442 | $err = recv(FS,$inp,$buflen,0);
|
---|
443 | # recv returned, cancel alarm.
|
---|
444 | alarm(0) if $hasAlarm;
|
---|
445 |
|
---|
446 | # If there has been a timeout, then we quit now. The recv man page
|
---|
447 | # does not seem to allow recv to return the bytes received up to
|
---|
448 | # the timeout.
|
---|
449 | if ($chime) {
|
---|
450 | $result=100;
|
---|
451 | $restext='timeout fetching document';
|
---|
452 | $!=0;
|
---|
453 | if ($save) {
|
---|
454 | unlink($tmpf) ||
|
---|
455 | warn "Could not unlink $tmpf: $!\n";
|
---|
456 | }
|
---|
457 | return &resetsign;
|
---|
458 | }
|
---|
459 |
|
---|
460 | # recv returnes the undefined value on error
|
---|
461 | if (!defined($err)) {
|
---|
462 | warn "Error in recv: $!\n";
|
---|
463 | last;
|
---|
464 | }
|
---|
465 |
|
---|
466 | $linp=length($inp);
|
---|
467 |
|
---|
468 | # If the returned input was 0 in length then we've gotten to the
|
---|
469 | # end of the response.
|
---|
470 | last unless $linp;
|
---|
471 |
|
---|
472 | # Accounting
|
---|
473 | $xfbytes += $linp;
|
---|
474 | $doclen += $linp;
|
---|
475 |
|
---|
476 | # Accumulate input
|
---|
477 | $header.=$inp;
|
---|
478 |
|
---|
479 | # eof(SOCKET) has strange semantics it seems
|
---|
480 | # last if eof(FS);
|
---|
481 |
|
---|
482 | # Check if header is complete
|
---|
483 | last if ($header =~ m/(\r?\n\r?\n)/);
|
---|
484 | }
|
---|
485 |
|
---|
486 | if (length($header)==0) {
|
---|
487 | $restext='the HTTP reply header is empty!';
|
---|
488 | return &resetsign;
|
---|
489 | }
|
---|
490 |
|
---|
491 | if ($header =~ m/(\r?\n\r?\n)/) {
|
---|
492 | $header=$`;
|
---|
493 | $document=$';
|
---|
494 | } else {
|
---|
495 | $header=$document;
|
---|
496 | }
|
---|
497 |
|
---|
498 | # Adjust accounting
|
---|
499 | $headb = length($header)+length($1);
|
---|
500 | $headbytes += $headb;
|
---|
501 | $xfbytes -= $headb;
|
---|
502 | $doclen -= $headb;
|
---|
503 |
|
---|
504 | # Pick headers to pieces
|
---|
505 | ($result,$restext,%headval)=&analyze_header($header);
|
---|
506 |
|
---|
507 | print STDERR "REPLY:\n",$header,"\n---\n" if $debug>=2;
|
---|
508 |
|
---|
509 | # Check if the document is a non-encoded text document. The contents
|
---|
510 | # could be (x-)?compress or (x-)gzip coded (compressed in other
|
---|
511 | # words).
|
---|
512 |
|
---|
513 | $plaintext=defined($headval{'CONTENT-TYPE'}) &&
|
---|
514 | (substr($headval{'CONTENT-TYPE'},0,5) eq 'text/' || 0) &&
|
---|
515 | !defined($headval{'content-encoding'});
|
---|
516 | $plaintexthtml=$plaintext &&
|
---|
517 | ($headval{'CONTENT-TYPE'} eq 'text/html');
|
---|
518 |
|
---|
519 | if ($result==200) {
|
---|
520 |
|
---|
521 | # Save this to a file, or not? Never save html files.
|
---|
522 | if ($saveto && !$plaintexthtml) {
|
---|
523 | # We're going to save this document directly into a file. This
|
---|
524 | # stresses the VM less when getting the large binares so often
|
---|
525 | # found at cool sites.
|
---|
526 | $save=1;
|
---|
527 |
|
---|
528 | # Find a temporary filename
|
---|
529 | $tmpf=url "file:$saveto";
|
---|
530 | $tmpf->basename($tmpfile);
|
---|
531 | $tmpf=$tmpf->unix_path;
|
---|
532 |
|
---|
533 | # Find suitable final filename, one with no URL escapes
|
---|
534 | $saveto=(url "file:$saveto")->unix_path;
|
---|
535 |
|
---|
536 | # If output to stdout then send it directly there rather than
|
---|
537 | # using disk unnecesarily.
|
---|
538 | $tmpf='-' if ($saveto eq '-');
|
---|
539 |
|
---|
540 | # If output is nulldevice (running -f), use it also for tmpfile,
|
---|
541 | # since it would otherwise try to create it in /dev under unix.
|
---|
542 | $tmpf=$main::nulldevice if ($saveto eq $main::nulldevice);
|
---|
543 |
|
---|
544 | warn "USING TMPFILE: $tmpf\n" if $debug;
|
---|
545 |
|
---|
546 | open(SAVE,">$tmpf") ||
|
---|
547 | die "Could not open tmp file: $tmpf: $!\n";
|
---|
548 | binmode SAVE; # It's a binary file...
|
---|
549 | }
|
---|
550 |
|
---|
551 | if ($verbose>0) {
|
---|
552 | print STDERR ", document";
|
---|
553 | print STDERR "->disk" if $save;
|
---|
554 | }
|
---|
555 |
|
---|
556 | # Now retrive document itself. Se comments in header loop
|
---|
557 | $start=time;
|
---|
558 | $wantbytes = defined($headval{'content-length'})?
|
---|
559 | $headval{'content-length'}:0;
|
---|
560 |
|
---|
561 | $ldoc=length($document);
|
---|
562 |
|
---|
563 | while (1) {
|
---|
564 | alarm($timeout) if $hasAlarm;
|
---|
565 | recv(FS,$inp,$buflen,0);
|
---|
566 | alarm(0) if $hasAlarm;
|
---|
567 |
|
---|
568 | if ($chime) {
|
---|
569 | $result=100;
|
---|
570 | $restext='timeout fetching document';
|
---|
571 | $!=0;
|
---|
572 | if ($save) {
|
---|
573 | unlink($tmpf) ||
|
---|
574 | warn "Could not unlink $tmpf: $!\n";
|
---|
575 | }
|
---|
576 | return &resetsign;
|
---|
577 | }
|
---|
578 |
|
---|
579 | $linp=length($inp);
|
---|
580 |
|
---|
581 | last unless $linp || $ldoc;
|
---|
582 | $ldoc = 0;
|
---|
583 |
|
---|
584 | $xfbytes += $linp;
|
---|
585 | $doclen += $linp;
|
---|
586 |
|
---|
587 | if ($verbose>0 && time-$start>5) {
|
---|
588 | # Write progress info ...
|
---|
589 | if ($wantbytes) {
|
---|
590 | $progress = sprintf " %3d%%", $doclen/$wantbytes*100;
|
---|
591 | } else {
|
---|
592 | $progress = sprintf " %d", $doclen;
|
---|
593 | }
|
---|
594 | print STDERR $progress, "\ch"x(length($progress));
|
---|
595 | # ...every 5 seconds
|
---|
596 | $start=time;
|
---|
597 | }
|
---|
598 |
|
---|
599 | $document.=$inp;
|
---|
600 |
|
---|
601 | if ($save) {
|
---|
602 | $err = print SAVE $document;
|
---|
603 | die "Error writing $tmpf: $!\n" unless $err;
|
---|
604 | $document='';
|
---|
605 | }
|
---|
606 |
|
---|
607 | # The eof test seems to work very oddly for sockets.
|
---|
608 | # last if eof(FS);
|
---|
609 | }
|
---|
610 |
|
---|
611 | close(FS); # Close socket completely
|
---|
612 |
|
---|
613 | print STDERR "DOCUMENT:\n----\n",$document,"\n----\n" if $debug>=255;
|
---|
614 |
|
---|
615 | if ($wantbytes &&
|
---|
616 | $wantbytes != $doclen) {
|
---|
617 | $result=100;
|
---|
618 | $restext='transfer error; too many bytes in document';
|
---|
619 | $restext='document was incomplete' if ($wantbytes > $doclen) ;
|
---|
620 | print STDERR "SHORT DOCUMENT" if $debug>=16;
|
---|
621 | if ($save) {
|
---|
622 | unlink($tmpf) || warn "Could not unlink $tmpf: $!\n";
|
---|
623 | }
|
---|
624 | return &resetsign;
|
---|
625 | }
|
---|
626 |
|
---|
627 | # warn "XFB: $xfbytes, DL: $doclen\n";
|
---|
628 | if ($save) {
|
---|
629 | close(SAVE);
|
---|
630 | &main::movefile($tmpf,$saveto);
|
---|
631 | }
|
---|
632 |
|
---|
633 | # If this is a non-encoded text file and we're supposed to convert
|
---|
634 | # foreign newlines then we do it. It would be faster to do this
|
---|
635 | # with each chunk of input in the input loop, but this gives us
|
---|
636 | # two problems:
|
---|
637 | # - A \r\n newline could be split into two chunks. Thus escaping
|
---|
638 | # newline conversion.
|
---|
639 | # - It messes up the received bytes accounting rather badly.
|
---|
640 | #
|
---|
641 | # This used to be a test for $plaintext, the problem is that too
|
---|
642 | # many documents were typed as text/plain and so we corrupted
|
---|
643 | # binary files. This is bad. So now we're more paranoid about it:
|
---|
644 | # Only HTML gets converted.
|
---|
645 | if ($convert && $plaintexthtml) {
|
---|
646 | # Change non unix newlines to unix newlines. bare \r is known
|
---|
647 | # from macintosh (they hadta be different didn't they?), \r\n is
|
---|
648 | # known as 'network format' and from numerous systems, among
|
---|
649 | # them ms-dos.
|
---|
650 | $document =~ s~\r~\n~g unless $document =~ s~\r\n~\n~g;
|
---|
651 | warn "Newlines converted(?)\n" if $debug;
|
---|
652 | }
|
---|
653 |
|
---|
654 | } # if $result == 200
|
---|
655 |
|
---|
656 | &resetsign;
|
---|
657 | return 1;
|
---|
658 | }
|
---|
659 |
|
---|
660 |
|
---|
661 | sub analyze_header {
|
---|
662 | my($header)=@_;
|
---|
663 | my($result,$restext,%headval,$hdln,$key,$value);
|
---|
664 |
|
---|
665 | # Summary of the http spec on headers (with my comments):
|
---|
666 | # - Each header line ends in CRLF (or just LF, or maybe even just CR,
|
---|
667 | # anyways, it's easier if all is LF).
|
---|
668 | $header =~ s/\r/\n/mg unless $header =~ s/\r\n/\n/mg;
|
---|
669 | # - If a line starts with space then it's a continuation of the previous
|
---|
670 | # line (these I fold into one line).
|
---|
671 | $header =~ s/\n\s/ /mg;
|
---|
672 | # - The header field names are case insensitive (so I convert them to
|
---|
673 | # lowercase)
|
---|
674 | # - A field may appear twice, that is equivalent to listing the values
|
---|
675 | # in a comma separated list (so I fold them into a comma separated list)n
|
---|
676 | # - The field name and the field value are separated by ': '
|
---|
677 | ($result,$restext) = $header =~ m~^HTTP/\d\.\d (\d\d\d) (.*)~;
|
---|
678 | # Shave off http result code from the header
|
---|
679 | $header =~ s~^.*\n~~;
|
---|
680 |
|
---|
681 | warn "Header:\n$header\n---\n" if $debug>=3;
|
---|
682 |
|
---|
683 | warn "Result: $result, Text: $restext\n" if $debug>=2;
|
---|
684 |
|
---|
685 | %headval=();
|
---|
686 |
|
---|
687 | foreach $hdln (split(/\r?\n/m,$header)) {
|
---|
688 | ($key,$value)=split(': ',$hdln,2);
|
---|
689 | $key="\L$key";
|
---|
690 | # Strip leading&trailing space off the reply, some servers use
|
---|
691 | # copious space after.
|
---|
692 | $value =~ s/^\s+|\s+$//g;
|
---|
693 | print STDERR "K: '$key', V: '$value'\n" if $debug>=2;
|
---|
694 | if (defined($headval{$key})) {
|
---|
695 | $headval{$key}.=", ".$value;
|
---|
696 | } else {
|
---|
697 | $headval{$key}=$value;
|
---|
698 | }
|
---|
699 | }
|
---|
700 |
|
---|
701 | # See if there are any type parameters in the content-type header
|
---|
702 | # and if so remove them.
|
---|
703 | if (defined($headval{'content-type'})) {
|
---|
704 | my $val=$headval{'content-type'};
|
---|
705 | ($val,undef)=split(';',$val,2) if ($val =~ /;/);
|
---|
706 | $headval{'CONTENT-TYPE'}=$val;
|
---|
707 | }
|
---|
708 |
|
---|
709 | return ($result,$restext,%headval);
|
---|
710 | }
|
---|
711 |
|
---|
712 |
|
---|
713 | sub last_modified {
|
---|
714 | # will return the last modified time for a local file as a HTTP
|
---|
715 | # timestamp.
|
---|
716 |
|
---|
717 | my(@tmp) = stat($_[0]); # file doesn't exist ok to fetch
|
---|
718 |
|
---|
719 | # FAT file systems strip the LSB of the file time. Add it back in
|
---|
720 | # here before asking the server about a modified file. The only way
|
---|
721 | # this can fail is if the newer server file was saved one second
|
---|
722 | # after the first version (very unlikely). This isn't needed for
|
---|
723 | # NTFS file systems, but there is no good portable Perl way to
|
---|
724 | # determine the file system type.
|
---|
725 | $tmp[9] = $tmp[9] | 1 if ( $main::win32 );
|
---|
726 |
|
---|
727 | # now we got the last modified in a 32 bit integer. time to convert
|
---|
728 | # it and return
|
---|
729 | return time2str($tmp[9]);
|
---|
730 | }
|
---|
731 |
|
---|
732 |
|
---|
733 | sub timeout {
|
---|
734 | # Set timeout flag. The using procedure has to set other result codes.
|
---|
735 | $chime=1; # When this is 1 then the alarm has gone off
|
---|
736 | print STDERR "TIMEOUT!!!!\n" if $debug>=16;
|
---|
737 | }
|
---|
738 |
|
---|
739 |
|
---|
740 | sub ignore {
|
---|
741 | warn "I got SIGPIPE, ignoring it...\n";
|
---|
742 | }
|
---|
743 |
|
---|
744 |
|
---|
745 | sub resetsign {
|
---|
746 | return 0 if !defined($savALRM);
|
---|
747 | $SIG{'ALRM'}=$savALRM;
|
---|
748 | undef $savALRM;
|
---|
749 | # $SIG{'PIPE'}=$savPIPE;
|
---|
750 | return 0;
|
---|
751 | }
|
---|
752 |
|
---|
753 |
|
---|
754 | sub oserror {
|
---|
755 |
|
---|
756 | resetsign;
|
---|
757 |
|
---|
758 | $result=98;
|
---|
759 | $restext='w3http: OS error';
|
---|
760 | return 0;
|
---|
761 |
|
---|
762 | }
|
---|
763 |
|
---|
764 |
|
---|
765 | 1;
|
---|