1 | /* ------------------------------------------------------------------- */
|
---|
2 | /* htget : Fetch a file using HTTP protocol */
|
---|
3 | /* */
|
---|
4 | /* Author : Ole Husby, BIBSYS */
|
---|
5 | /* Updated : 1998-09-30 */
|
---|
6 | /* */
|
---|
7 | /* ------------------------------------------------------------------- */
|
---|
8 | /* */
|
---|
9 | /* htget(url, type, timeout_seconds, outfile, content_type, location) */
|
---|
10 | /* */
|
---|
11 | /* Returns: HTTP statuscode, with additional private: */
|
---|
12 | /* 0 : OK ( = 200) */
|
---|
13 | /* 900 : Error, possible timeout */
|
---|
14 | /* 901 : Syntax error in url */
|
---|
15 | /* 902 : Unknown host */
|
---|
16 | /* 903 : No response from server (no connection) */
|
---|
17 | /* 904 : File is not text/html */
|
---|
18 | /* 905 : Statusline > 255 bytes */
|
---|
19 | /* 906 : Statusline < 4 bytes */
|
---|
20 | /* 907 : Statusline not starting with "HTTP" */
|
---|
21 | /* 908 : Statuscode not numeric */
|
---|
22 | /* 909 : Size of header > BUFSIZE */
|
---|
23 | /* 910 : Unable to open output file */
|
---|
24 | /* 999 : Unspecified TCP/IP error */
|
---|
25 | /* */
|
---|
26 | /* Writes to outfile, depending on type, if statuscode = 0 | 200 : */
|
---|
27 | /* */
|
---|
28 | /* type = 0 : Nothing */
|
---|
29 | /* type = 1 : HTTP header */
|
---|
30 | /* type = 2 : HTTP header + entitybody */
|
---|
31 | /* type = 3 : HTTP entitybody */
|
---|
32 | /* type = 4 : HTTP entitybody if text/html */
|
---|
33 | /* type = 5 : HTTP <HEAD> part of entitybody if text/html */
|
---|
34 | /* type = 6 : HTTP <HEAD> part of entitybody if text/html */
|
---|
35 | /* HTTP entitybody if application/marc */
|
---|
36 | /* */
|
---|
37 | /* ------------------------------------------------------------------- */
|
---|
38 |
|
---|
39 |
|
---|
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <strings.h>
#include <ctype.h>
#include <fcntl.h>
#include <sys/types.h>
#include <unistd.h>
#include <netinet/in.h>
#include <sys/socket.h>
#include <netdb.h>
#include <signal.h>
|
---|
51 |
|
---|
/* Truth values used for the loop flags in this file. */
#define FALSE           0
#define TRUE            1

/* Size in bytes of the header buffer and of each body read. */
#define BUFSIZE         10000

/* Values of the "type" argument: what gets written to the output file. */
#define TYPE_NONE       0       /* nothing                               */
#define TYPE_HTTPHEAD   1       /* HTTP header                           */
#define TYPE_HTTPALL    2       /* HTTP header + entitybody              */
#define TYPE_HTTPBODY   3       /* HTTP entitybody                       */
#define TYPE_HTMLALL    4       /* entitybody if text/html               */
#define TYPE_HTMLHEAD   5       /* <HEAD> part of entitybody if html     */
#define TYPE_MARC       6       /* as 5, or entitybody if app./marc      */

/* Non-zero enables the "***" trace output on stdout. */
#define TRACE           0

/* Client name placed in the request built by htget(). */
#define AGENT           "BIBSYS_htget v1.1"

/* Content-Type value of the response, filled in by getHeader(). */
char conType[128];

/* Set by getHeader(); htmlonly makes getBody() stop at the end of <HEAD>. */
int htmlonly;
int marcrecord;
|
---|
69 |
|
---|
70 |
|
---|
71 |
|
---|
/* Signal handler installed for SIGALRM by htget().  It deliberately   */
/* does nothing: catching the signal is its whole purpose.              */
void thandler(int i)
{
    (void) i;       /* signal number is not needed */
    return;
}
|
---|
75 |
|
---|
76 |
|
---|
77 |
|
---|
/* ------------------------------------------------------------------- */
/* geteoHEAD: Look for <BODY or </HEAD> in the string buf               */
/*                                                                      */
/* <BODY is searched first; only if it is absent is </HEAD> tried.      */
/*                                                                      */
/* returns: 0 if neither is found (buf is left untouched)               */
/*          1 if found.  buf is then cut so that it ends with a         */
/*            newline placed where <BODY started, or right after        */
/*            </HEAD>.  If </HEAD> is the very last thing in buf        */
/*            there is no room for the newline and buf is left as is.   */
/* ------------------------------------------------------------------- */

int geteoHEAD(char *buf)
{
    char *hit;

    /* NOTE(review): cstr() is not declared anywhere in this file;      */
    /* presumably it is a substring search returning char * - a         */
    /* prototype must be visible here, please confirm.                  */

    hit = (char *) cstr(buf, "<BODY");
    if (hit)
    {
        /* Overwrites the "<B" of the tag, so it cannot leave the string */
        hit[0] = '\n';
        hit[1] = '\0';
        return 1;
    }

    hit = (char *) cstr(buf, "</HEAD>");
    if (hit)
    {
        char *after = hit + (sizeof "</HEAD>" - 1);

        /* Only append the newline when there is at least one more byte  */
        /* of string after the tag; otherwise the terminator would be    */
        /* written one byte beyond the end of the string.                */
        if (*after != '\0')
        {
            after[0] = '\n';
            after[1] = '\0';
        }
        return 1;
    }

    return 0;
}
|
---|
113 |
|
---|
114 |
|
---|
115 |
|
---|
/* ------------------------------------------------------------------- */
/* writeRequest: send the NUL-terminated request req to server          */
/*                                                                      */
/* write() may accept fewer bytes than asked for, so it is repeated     */
/* until the whole request has been handed over.                        */
/*                                                                      */
/* returns: Number of bytes written (strlen(req) when complete)         */
/*          -1 if write() reported an error                             */
/* ------------------------------------------------------------------- */

int writeRequest(char *req, int server)
{
    const char *next = req;
    size_t left = strlen(req);
    int sent = 0;

#if TRACE
    printf("*** send(): %s\n", req);
#endif

    while (left > 0)
    {
        ssize_t n = write(server, next, left);

        if (n < 0)
            return -1;
        if (n == 0)             /* no progress: give up, report what went out */
            break;

        next += n;
        left -= (size_t) n;
        sent += (int) n;
    }

    return sent;
}
|
---|
129 |
|
---|
130 |
|
---|
131 |
|
---|
/* ------------------------------------------------------------------- */
/* getBody : Copy the Entity-body from the socket to the file fd        */
/*                                                                      */
/*   server   : socket positioned at the start of the entity-body       */
/*   timeout  : seconds passed to alarm() around every read()           */
/*   fd       : descriptor of the already opened output file            */
/*   filename : not used                                                */
/*                                                                      */
/*   Data is read in chunks of up to BUFSIZE bytes until the server     */
/*   closes the connection or a chunk ends with a '\0' byte (that       */
/*   byte is not written).  When the global htmlonly is set, each       */
/*   chunk is treated as a string, cut by geteoHEAD(), and copying      */
/*   stops after the chunk in which the end of <HEAD> was found.        */
/*   Each chunk is searched on its own, so a tag split across two       */
/*   reads is not detected.                                             */
/*                                                                      */
/*   Returns   0 if ok                                                  */
/*           900 if read() fails (including an interrupted read) or     */
/*               if the data cannot be written to fd                    */
/*                                                                      */
/* ------------------------------------------------------------------- */

int getBody(int server, int timeout, int fd, char *filename)
{
    char buf[BUFSIZE + 1];              /* +1: room for the terminator  */
    int found_end = FALSE;

    (void) filename;

#if TRACE
    printf("*** Read entitybody\n");
#endif

    /* Loop until the endmark is found */

    while (!found_end)
    {
        int bytecount, done;
        int head_complete = 0;

        alarm(timeout);
        bytecount = read(server, buf, BUFSIZE);
        alarm(0);

        if (bytecount < 0)
            return 900;                 /* error in read()              */
        if (bytecount == 0)
            break;                      /* server closed socket         */

        if (buf[bytecount - 1] == '\0')
        {
            bytecount--;                /* do not write the '\0' to file */
            found_end = TRUE;           /* terminate the loop           */
        }

        if (bytecount == 0)
            continue;

        buf[bytecount] = '\0';
        if (htmlonly)
        {
            head_complete = geteoHEAD(buf);
            bytecount = strlen(buf);    /* geteoHEAD may have cut buf   */
        }

        /* Write to file; repeat on short writes so nothing is dropped  */

        for (done = 0; done < bytecount; )
        {
            int n = write(fd, buf + done, bytecount - done);

            if (n <= 0)
                return 900;
            done += n;
        }

        if (head_complete)
            break;
    }

    return 0;
}
|
---|
207 |
|
---|
208 |
|
---|
209 |
|
---|
/* ------------------------------------------------------------------- */
/* getHeader : Read the HTTP statusline and response header             */
/*                                                                      */
/*   server  : connected socket, request already sent                   */
/*   timeout : seconds passed to alarm() around every read()            */
/*   type    : one of the TYPE_ values                                  */
/*   fd      : output file, written only for TYPE_HTTPHEAD/TYPE_HTTPALL */
/*   reason  : not used (the reason phrase is not handed back)          */
/*   loc     : receives the value of a Location: header, if any.        */
/*             NOTE(review): copied without a length limit; a header    */
/*             line can be almost BUFSIZE bytes, so the caller's        */
/*             buffer must be that large - confirm against callers.     */
/*                                                                      */
/*   Side effects: fills the global conType (truncated to fit), and     */
/*   sets htmlonly / marcrecord according to type and Content-Type.     */
/*   Carriage returns are dropped while reading.  If type is            */
/*   TYPE_NONE only the statusline is read.                             */
/*                                                                      */
/*   Returns 0   if ok (HTTP status 200, or statusline without code)    */
/*           900 if read() fails or the header cannot be written to fd  */
/*           904 if type >= TYPE_HTMLALL and the Content-type is not    */
/*               acceptable ("text/html", or "application/marc" for     */
/*               TYPE_MARC).  For the other types the Content-type      */
/*               never hides the HTTP statuscode.                       */
/*           905 if Statusline > 255 bytes, contains a 0 byte, or the   */
/*               connection closes before it is complete                */
/*           906 if Statusline < 4 bytes                                */
/*           907 if Statusline not starting with "HTTP"                 */
/*           908 if Statuscode not numeric                              */
/*           909 if size of header > BUFSIZE, it contains a 0 byte, or  */
/*               the connection closes before the empty line            */
/*           HTTP statuscode if <> 200                                  */
/*                                                                      */
/* ------------------------------------------------------------------- */

int getHeader(int server, int timeout, int type, int fd, char *reason, char *loc)
{
    char buf[BUFSIZE + 1];
    char *line, *next, *p;
    char c;
    int n, rc, statuscode, at_line_start, rlen;
    size_t len;

    (void) reason;
    *conType = 0;

#if TRACE
    printf("*** Read statusline\n");
#endif

    /* Read HTTP statusline (until LF, max 255 bytes) */

    rlen = 0;
    for (;;)
    {
        alarm(timeout);
        n = read(server, &c, 1);
        alarm(0);

        if (n < 0)
            return 900;
        if (n == 0)
            return 905;                 /* closed before end of line    */
        if (c == '\r')
            continue;

        rlen++;

#if TRACE
        printf("%c", c);
#endif

        if (c == 0 || rlen > 255)
            return 905;
        if (c == '\n')
            break;
        buf[rlen - 1] = c;
    }
    buf[rlen - 1] = 0;                  /* the LF is counted, not kept  */

    /* Write statusline */

    if (type == TYPE_HTTPHEAD || type == TYPE_HTTPALL)
    {
        len = strlen(buf);
        if (write(fd, buf, len) != (ssize_t) len || write(fd, "\n\n", 2) != 2)
            return 900;
    }

#if TRACE
    printf("*** read() : (%d) %s\n", rlen, buf);
#endif

    /* Parse statusline: "HTTP-version SP statuscode SP reason" */

    if (rlen < 4)
        return 906;

    if (strncmp(buf, "HTTP", 4) != 0)
        return 907;

    p = buf + strcspn(buf, " ");        /* skip the version token       */
    p += strspn(p, " ");                /* and the blanks after it      */

    if (!*p)
        statuscode = 200;               /* no code given: treat as OK   */
    else
    {
        size_t i;

        p[strcspn(p, " ")] = 0;         /* isolate the statuscode       */
        for (i = 0; p[i]; i++)
            if (!isdigit((unsigned char) p[i]))
                return 908;
        statuscode = atoi(p);
    }

    if (statuscode == 200)
        statuscode = 0;

    if (!type)
        return statuscode;

    /* Read HTTP response header (until empty line, max BUFSIZE bytes) */

#if TRACE
    printf("*** Read responseheader\n");
#endif

    rlen = 0;
    at_line_start = TRUE;               /* the statusline just ended    */
    for (;;)
    {
        alarm(timeout);
        n = read(server, &c, 1);
        alarm(0);

        if (n < 0)
            return 900;
        if (n == 0)
            return 909;                 /* closed before the empty line */
        if (c == '\r')
            continue;

        rlen++;

        if (c == 0 || rlen > BUFSIZE)
            return 909;
        if (c == '\n' && at_line_start)
            break;                      /* empty line: end of header    */

        at_line_start = (c == '\n');
        buf[rlen - 1] = c;
    }
    buf[rlen - 1] = 0;

    /* Write rest of HTTP header */

    if (type == TYPE_HTTPHEAD || type == TYPE_HTTPALL)
    {
        len = strlen(buf);
        if (write(fd, buf, len) != (ssize_t) len || write(fd, "\n", 1) != 1)
            return 900;
    }

#if TRACE
    printf("*** read() : (%d) %s\n", rlen, buf);
#endif

    /* Parse header for Content-Type and Location */

    /* The Content-type only matters for the types that ask for a      */
    /* particular kind of document.                                     */
    rc = (type >= TYPE_HTMLALL) ? 904 : 0;

    for (line = buf; *line; line = next)
    {
        char *eol = strchr(line, '\n');

        if (eol)
        {
            *eol = 0;
            next = eol + 1;
        }
        else
            next = line + strlen(line);

        if (strncasecmp(line, "Content-Type:", 13) == 0)
        {
            p = line + 13;
            while (*p == ' ')
                p++;

            /* conType is a fixed-size global: never copy more than fits */
            snprintf(conType, sizeof conType, "%s", p);

            if (strncasecmp(p, "text/html", 9) == 0)
            {
                if (type == TYPE_HTMLHEAD || type == TYPE_MARC)
                    htmlonly = 1;
                rc = 0;
            }
            else if (strncasecmp(p, "application/marc", 16) == 0)
            {
                if (type == TYPE_MARC)
                {
                    marcrecord = 1;
                    rc = 0;
                }
            }
        }
        else if (strncasecmp(line, "Location:", 9) == 0)
        {
            p = line + 9;
            while (*p == ' ')
                p++;
            strcpy(loc, p);
        }
    }

    /* Socket is now positioned at start of HTTP Entity-Body */

    return rc ? rc : statuscode;
}
|
---|
414 |
|
---|
415 |
|
---|
416 |
|
---|
417 |
|
---|
418 |
|
---|
/* ------------------------------------------------------------------- */
/* htget : Fetch a URL                                                  */
/*                                                                      */
/*   iurl       : "http://host[:port][/path]", at most 1023 bytes       */
/*   type       : one of the TYPE_ values, selects what is written      */
/*   timeout    : seconds allowed for each read from the server         */
/*   outfile    : output file name; NULL or "" selects                  */
/*                /tmp/geturl.tmp                                       */
/*   h_contype  : receives the Content-Type of the response             */
/*   h_location : receives the Location header of the response          */
/*                                                                      */
/*   Returns 0 if ok, otherwise an HTTP statuscode or one of the        */
/*   private codes: 901 (bad url, including host > 255 bytes or port    */
/*   outside 1-65535), 902 (unknown host), 903 (connect failed),        */
/*   910 (cannot open outfile), 999 (socket or request write failed),   */
/*   or whatever getHeader() / getBody() return.                        */
/*                                                                      */
/*   Installs thandler() for SIGALRM and leaves it installed.           */
/*   NOTE(review): signal() may install the handler with restart        */
/*   semantics on some platforms, in which case a blocked read() is     */
/*   resumed instead of interrupted - consider sigaction().             */
/*   NOTE(review): connect() itself is not covered by the timeout,      */
/*   and the default output file is a fixed, predictable name in /tmp.  */
/* ------------------------------------------------------------------- */

int htget(char *iurl, int type, int timeout, char *outfile, char *h_contype, char *h_location)
{
    int rc, port, reqlen;
    int fd = -1, soc = -1;
    size_t i;
    struct sockaddr_in addr;
    struct hostent *hp;
    char uurl[1024], hostname[256], req[2048];
    char tfile[] = "/tmp/geturl.tmp";
    char blank[2];
    char *host, *path, *portstr, *sep;

    *blank = *h_contype = *h_location = 0;
    htmlonly = marcrecord = 0;

    if (!outfile || !*outfile)
        outfile = tfile;

    if (!iurl || !*iurl || strlen(iurl) > sizeof uurl - 1)
        return 901;

    strcpy(uurl, iurl);

    /* Parse and validate URL */

    if (strncmp(uurl, "http://", 7) != 0)
        return 901;

    host = uurl + 7;

    /* Everything after the first '/' is the path */
    sep = strchr(host, '/');
    if (sep)
    {
        *sep = 0;
        path = sep + 1;
    }
    else
        path = blank;

    /* An optional ":port" follows the host name */
    sep = strchr(host, ':');
    if (sep)
    {
        *sep = 0;
        portstr = sep + 1;
    }
    else
        portstr = blank;

    if (!*host || strlen(host) > sizeof hostname - 1)
        return 901;

    strcpy(hostname, host);

    if (!*portstr)
        port = 80;
    else
    {
        port = 0;
        for (i = 0; portstr[i]; i++)
        {
            if (!isdigit((unsigned char) portstr[i]))
                return 901;
            port = port * 10 + (portstr[i] - '0');
            if (port > 65535)
                return 901;
        }
        if (port == 0)
            return 901;
    }

    /* Build the request.  The Host header carries the port only when   */
    /* it is not the default one.                                       */

    if (port == 80)
        reqlen = snprintf(req, sizeof req,
                          "GET /%s HTTP/1.0\r\nHost: %s\r\nUser-Agent: %s\r\n\r\n",
                          path, hostname, AGENT);
    else
        reqlen = snprintf(req, sizeof req,
                          "GET /%s HTTP/1.0\r\nHost: %s:%d\r\nUser-Agent: %s\r\n\r\n",
                          path, hostname, port, AGENT);

    if (reqlen < 0 || (size_t) reqlen >= sizeof req)
        return 901;

    /* Establish handler for the alarm-signal */

    signal(SIGALRM, thandler);

    /* Get IP address */

    hp = gethostbyname(hostname);
    if (!hp)
        return 902;

    /* Only an IPv4 address fits in addr.sin_addr */
    if (hp->h_addrtype != AF_INET || !hp->h_addr_list[0]
        || hp->h_length != (int) sizeof addr.sin_addr)
        return 902;

    /* Get socket and connect */

    soc = socket(AF_INET, SOCK_STREAM, 0);
    if (soc < 0)
        return 999;

    memset(&addr, 0, sizeof addr);
    addr.sin_family = AF_INET;
    memcpy(&addr.sin_addr, hp->h_addr_list[0], sizeof addr.sin_addr);
    addr.sin_port = htons((unsigned short) port);

    if (connect(soc, (struct sockaddr *) &addr, sizeof addr) < 0)
    {
        rc = 903;
        goto done;
    }

    /* Open the destination file */

    if (type)
    {
        fd = open(outfile, O_WRONLY | O_CREAT | O_TRUNC, 0666);
        if (fd < 0)
        {
            rc = 910;
            goto done;
        }
    }

    /* Write HTTP-request; anything but the full length is a failure */

    if (writeRequest(req, soc) != reqlen)
    {
        rc = 999;
        goto done;
    }

    /* Read header part of response */

    rc = getHeader(soc, timeout, type, fd, path, h_location);

    /* The Content-type is only a condition for the html/marc types */
    if (rc == 904 && type < TYPE_HTMLALL)
        rc = 0;

    if (rc == 0 && type > TYPE_HTTPHEAD)
        rc = getBody(soc, timeout, fd, outfile);

    strcpy(h_contype, conType);

done:
    if (soc >= 0)
        close(soc);
    if (fd >= 0)
        close(fd);

    return rc;
}
|
---|